forked from wylab/llama.cpp
Compare commits
44 Commits
b8348
...
enhance_fa
| Author | SHA1 | Date | |
|---|---|---|---|
| 2985be3324 | |||
| 8dc96153c3 | |||
| b6c83aad55 | |||
| 2e4a6edd4a | |||
| d34ff7eb5b | |||
| 45172df4d6 | |||
| 9b342d0a9f | |||
| 55e87026f7 | |||
| cf21cdf36c | |||
| 0ed992973b | |||
| 1bbec6a75d | |||
| f47a246a08 | |||
| c0ccbd1f86 | |||
| f6da02c3f2 | |||
| dddca026bf | |||
| 3c8521c4f5 | |||
| 67a2209fab | |||
| d65c4f2dc9 | |||
| d8c331c0af | |||
| 46dba9fce8 | |||
| de8f01c2d7 | |||
| 079e5a45f0 | |||
| d3936498a3 | |||
| 34818ea6c0 | |||
| 9e2e2198b0 | |||
| 88915cb55c | |||
| ebbf544ed1 | |||
| b91d7dfe5b | |||
| ae40cd27c8 | |||
| ceef6b5233 | |||
| 07c6a59b4f | |||
| 8b7d340b6f | |||
| 559646472d | |||
| cf45437d35 | |||
| 9cd4ebcfb1 | |||
| 89d0aec042 | |||
| b9da4444df | |||
| 617db241aa | |||
| 1a3d8edbba | |||
| 6b10a82c00 | |||
| d23355afc3 | |||
| b30a5fdf37 | |||
| b4768955c4 | |||
| fc350fdf96 |
@@ -53,10 +53,11 @@ RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
build-essential \
|
||||
git \
|
||||
python3 \
|
||||
python3-dev \
|
||||
python3.13 \
|
||||
python3.13-dev \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
|
||||
@@ -104,3 +104,20 @@ OpenCL:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-opencl.h
|
||||
- ggml/src/ggml-opencl/**
|
||||
- docs/backend/OPENCL.md
|
||||
Hexagon:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-hexagon.h
|
||||
- ggml/src/ggml-hexagon/**
|
||||
WebGPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-webgpu.h
|
||||
- ggml/src/ggml-webgpu/**
|
||||
OpenVINO:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-openvino.h
|
||||
- ggml/src/ggml-openvino/**
|
||||
- docs/backend/OPENVINO.md
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
name: CI (3rd-party)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-3rd-party.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-llguidance:
|
||||
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_LLGUIDANCE=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
name: CI (android)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-android.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-android.yml',
|
||||
'examples/llama.android/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
android:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# Disabled due to size (400MB) and always 0 cache hits
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.16
|
||||
# with:
|
||||
# key: android-build
|
||||
# evict-old-files: 1d
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: zulu
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@v3
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
./gradlew build --no-daemon
|
||||
|
||||
android-ndk:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
OPENCL_VERSION: 2025.07.22
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'arm64-cpu'
|
||||
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
|
||||
- build: 'arm64-snapdragon'
|
||||
defines: '--preset arm64-android-snapdragon-release'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||
run: |
|
||||
mkdir opencl
|
||||
curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
|
||||
curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
|
||||
curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
|
||||
tar -xaf opencl/headers.tar.gz -C opencl
|
||||
tar -xaf opencl/clhpp.tar.gz -C opencl
|
||||
tar -xaf opencl/icd-loader.tar.gz -C opencl
|
||||
sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
|
||||
sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
|
||||
cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
|
||||
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
|
||||
cmake --build build
|
||||
sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
|
||||
rm -rf opencl
|
||||
|
||||
- name: Install Hexagon SDK
|
||||
id: install_hexsdk
|
||||
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||
env:
|
||||
HEXSDK_VER: 6.4.0.2
|
||||
HEXTLS_VER: 19.0.04
|
||||
run: |
|
||||
curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
|
||||
mkdir hex-sdk
|
||||
tar -xaf hex-sdk.tar.gz -C hex-sdk
|
||||
ls -l hex-sdk
|
||||
sudo mv hex-sdk /opt/hexagon
|
||||
echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV"
|
||||
echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV"
|
||||
echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV"
|
||||
echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV"
|
||||
echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV"
|
||||
echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Update CMake presets
|
||||
id: update_presets
|
||||
if: ${{ matrix.build == 'arm64-snapdragon' }}
|
||||
run: |
|
||||
cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
|
||||
- name: Build
|
||||
id: ndk_build
|
||||
run: |
|
||||
cmake ${{ matrix.defines }} -B build
|
||||
cmake --build build
|
||||
cmake --install build --prefix pkg-adb/llama.cpp
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
echo "FIXME: test on devices"
|
||||
@@ -0,0 +1,214 @@
|
||||
name: CI (apple)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-apple.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-apple.yml',
|
||||
'ggml/src/ggml-metal/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macOS-latest-ios:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-ios
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macos-latest-ios-xcode:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup Xcode
|
||||
uses: ggml-org/setup-xcode@v1
|
||||
with:
|
||||
xcode-version: latest-stable
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
./build-xcframework.sh
|
||||
|
||||
- name: Upload xcframework artifact
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: llama-xcframework
|
||||
path: build-apple/llama.xcframework/
|
||||
retention-days: 1
|
||||
|
||||
- name: Build Xcode project
|
||||
run: |
|
||||
xcodebuild -downloadPlatform iOS
|
||||
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
macOS-latest-tvos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-tvos
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-visionos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-swift:
|
||||
runs-on: macos-latest
|
||||
needs: macos-latest-ios-xcode
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-swift
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Download xcframework artifact
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: llama-xcframework
|
||||
path: build-apple/llama.xcframework/
|
||||
|
||||
- name: Build llama.cpp with CMake
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
@@ -37,37 +37,37 @@ jobs:
|
||||
path: ./vulkan_sdk
|
||||
version: ${{ env.VULKAN_SDK_VERSION }}
|
||||
|
||||
ubuntu-24-spacemit-cache:
|
||||
runs-on: ubuntu-24.04
|
||||
#ubuntu-24-spacemit-cache:
|
||||
# runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
# Make sure this is in sync with build-linux-cross.yml
|
||||
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
||||
# env:
|
||||
# # Make sure this is in sync with build-linux-cross.yml
|
||||
# SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
- name: Setup Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-toolchain
|
||||
with:
|
||||
path: ./spacemit_toolchain
|
||||
key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
# - name: Setup Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup SpacemiT Toolchain
|
||||
if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-spacemit
|
||||
with:
|
||||
path: ./spacemit_toolchain
|
||||
version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
|
||||
# - name: Setup SpacemiT Toolchain
|
||||
# if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
# uses: ./.github/actions/linux-setup-spacemit
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
|
||||
|
||||
ubuntu-24-openvino-cache:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
name: CI (cann)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cann.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-cann.yml',
|
||||
'ggml/src/ggml-cann/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
openEuler-latest-cann:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -el {0}
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [x86, aarch64]
|
||||
chip_type: ['910b', '310p']
|
||||
build: ['Release']
|
||||
use_acl_graph: ['on', 'off']
|
||||
exclude:
|
||||
# 310P does not support USE_ACL_GRAPH=on
|
||||
- chip_type: '310p'
|
||||
use_acl_graph: 'on'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
@@ -1,7 +1,24 @@
|
||||
name: Build on Linux using cross-compiler
|
||||
name: CI (cross)
|
||||
on:
|
||||
# only manual triggers due to low-importance of the workflows
|
||||
# TODO: for regular runs, provision dedicated self-hosted runners
|
||||
workflow_dispatch:
|
||||
workflow_call:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cross.yml',
|
||||
'ggml/src/spacemit/*',
|
||||
'ggml/src/arch/loongarch/*'
|
||||
]
|
||||
# run once every week
|
||||
schedule:
|
||||
- cron: '0 0 * * 0'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
|
||||
jobs:
|
||||
# ubuntu-24-riscv64-cpu-cross:
|
||||
@@ -264,15 +281,15 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Use SpacemiT Toolchain Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-toolchain
|
||||
with:
|
||||
path: ./spacemit_toolchain
|
||||
key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
#- name: Use SpacemiT Toolchain Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup SpacemiT Toolchain
|
||||
if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
#if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-spacemit
|
||||
with:
|
||||
path: ./spacemit_toolchain
|
||||
@@ -0,0 +1,72 @@
|
||||
name: CI (msys)
|
||||
|
||||
on:
|
||||
# only manual triggers due to low-importance of the workflows
|
||||
# TODO: for regular runs, provision dedicated self-hosted runners
|
||||
workflow_dispatch:
|
||||
# run once every week
|
||||
schedule:
|
||||
- cron: '0 0 * * 0'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
windows-msys2:
|
||||
runs-on: windows-2025
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
|
||||
- { sys: CLANG64, env: clang-x86_64, build: Release }
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.16
|
||||
# with:
|
||||
# key: windows-msys2
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Setup ${{ matrix.sys }}
|
||||
uses: msys2/setup-msys2@v2
|
||||
with:
|
||||
update: true
|
||||
msystem: ${{matrix.sys}}
|
||||
install: >-
|
||||
base-devel
|
||||
git
|
||||
mingw-w64-${{matrix.env}}-toolchain
|
||||
mingw-w64-${{matrix.env}}-cmake
|
||||
mingw-w64-${{matrix.env}}-openblas
|
||||
|
||||
- name: Build using CMake
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cmake -B build
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
- name: Clean after building using CMake
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
rm -rf build
|
||||
|
||||
- name: Build using CMake w/ OpenBLAS
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
@@ -0,0 +1,136 @@
|
||||
name: CI (riscv)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-riscv.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-riscv.yml',
|
||||
'ggml/src/ggml-cpu/arch/riscv/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-riscv64-native-sanitizer:
|
||||
runs-on: RISCV64
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
|
||||
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
|
||||
|
||||
# Install Rust stable version
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
|
||||
git lfs install
|
||||
|
||||
- name: GCC version check
|
||||
run: |
|
||||
gcc --version
|
||||
g++ --version
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
# Unique cache directory per matrix combination
|
||||
export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
|
||||
mkdir -p "$CCACHE_DIR"
|
||||
|
||||
# Configure ccache
|
||||
ccache --set-config=max_size=5G
|
||||
ccache --set-config=compression=true
|
||||
ccache --set-config=compression_level=6
|
||||
ccache --set-config=cache_dir="$CCACHE_DIR"
|
||||
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
|
||||
ccache --set-config=hash_dir=false
|
||||
|
||||
# Export for subsequent steps
|
||||
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
|
||||
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
@@ -0,0 +1,87 @@
|
||||
name: CI (sanitize)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-sanitize.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
@@ -0,0 +1,245 @@
|
||||
name: CI (self-hosted)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-self-hosted.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ggml-ci-nvidia-cuda:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
nvidia-smi
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm2:
|
||||
runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
# runs-on: [self-hosted, Linux, CPU, AMX]
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-vulkan:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-rocm:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-metal:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-webgpu:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
DAWN_VERSION="v2.0.0"
|
||||
DAWN_OWNER="reeselevine"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
curl -L -o artifact.zip \
|
||||
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
mkdir dawn
|
||||
unzip artifact.zip
|
||||
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-vulkan:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
uses: ./.github/actions/linux-setup-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||
|
||||
- name: Install OpenVINO dependencies
|
||||
run: |
|
||||
cd ./openvino_toolkit
|
||||
chmod +x ./install_dependencies/install_openvino_dependencies.sh
|
||||
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
@@ -0,0 +1,96 @@
|
||||
name: CI (vulkan)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-vulkan.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.comp',
|
||||
'**/*.glsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-vulkan.yml',
|
||||
'ggml/src/ggml-vulkan/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-vulkan-llvmpipe:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-24-vulkan-llvmpipe
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||
|
||||
- name: Get latest Vulkan SDK version
|
||||
id: vulkan_sdk_version
|
||||
run: |
|
||||
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Use Vulkan SDK Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-vulkan-llvmpipe
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
version: ${{ env.VULKAN_SDK_VERSION }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source ./vulkan_sdk/setup-env.sh
|
||||
cmake -B build \
|
||||
-DGGML_VULKAN=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
export GGML_VK_VISIBLE_DEVICES=0
|
||||
export GGML_VK_DISABLE_F16=1
|
||||
export GGML_VK_DISABLE_COOPMAT=1
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 4800
|
||||
+105
-1091
File diff suppressed because it is too large
Load Diff
@@ -4,10 +4,16 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
paths: [
|
||||
'.github/workflows/python-lint.yml',
|
||||
'**/*.py'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
paths: [
|
||||
'.github/workflows/python-lint.yml',
|
||||
'**/*.py'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
|
||||
@@ -10,7 +10,22 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
paths: [
|
||||
'.github/workflows/release.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
@@ -34,7 +49,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-arm64
|
||||
key: macOS-latest-arm64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Build
|
||||
@@ -81,7 +96,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-x64
|
||||
key: macOS-latest-x64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Build
|
||||
@@ -140,7 +155,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-cpu-cmake-${{ matrix.build }}
|
||||
key: ubuntu-cpu-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
@@ -191,7 +206,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-vulkan
|
||||
key: ubuntu-22-vulkan
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
@@ -238,7 +253,7 @@ jobs:
|
||||
openvino_version: ${{ steps.openvino_version.outputs.value }}
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
|
||||
@@ -256,7 +271,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-24-cmake-openvino-release-no-preset-v1
|
||||
key: ubuntu-24-openvino-release-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
@@ -329,7 +344,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-cpu-${{ matrix.arch }}
|
||||
key: windows-latest-cpu-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
@@ -390,7 +405,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
|
||||
key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
@@ -536,7 +551,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-sycl
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
@@ -616,7 +631,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
|
||||
key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
@@ -726,7 +741,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
|
||||
key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install ROCm
|
||||
|
||||
@@ -0,0 +1,105 @@
|
||||
name: Server (sanitize)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
sha:
|
||||
description: 'Commit SHA1 to build'
|
||||
required: false
|
||||
type: string
|
||||
slow_tests:
|
||||
description: 'Run slow tests'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/server-sanitize.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/Makefile',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'tools/server/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
|
||||
build_type: [RelWithDebInfo]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install \
|
||||
build-essential \
|
||||
xxd \
|
||||
git \
|
||||
cmake \
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
libssl-dev
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_SCHED_NO_REALLOC=ON \
|
||||
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
|
||||
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: '3.11'
|
||||
pip-install: -r tools/server/tests/requirements.txt
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export ${{ matrix.extra_args }}
|
||||
SLOW_TESTS=1 pytest -v -x
|
||||
@@ -1,4 +1,4 @@
|
||||
name: Server-Metal
|
||||
name: Server (self-hosted)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
@@ -14,7 +14,19 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
||||
paths: [
|
||||
'.github/workflows/server-self-hosted.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/Makefile',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'tools/server/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
@@ -28,7 +40,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
server-metal:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
runs-on: [self-hosted, llama-server, macOS, ARM64]
|
||||
|
||||
name: server-metal (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
@@ -71,3 +83,42 @@ jobs:
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
name: server-cuda (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
@@ -1,4 +1,3 @@
|
||||
# Server WebUI build and tests
|
||||
name: Server WebUI
|
||||
|
||||
on:
|
||||
@@ -11,10 +10,20 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
|
||||
paths: [
|
||||
'.github/workflows/server-webui.yml',
|
||||
'tools/server/webui/**.*',
|
||||
'tools/server/tests/**.*',
|
||||
'tools/server/public/**'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
|
||||
paths: [
|
||||
'.github/workflows/server-webui.yml',
|
||||
'tools/server/webui/**.*',
|
||||
'tools/server/tests/**.*',
|
||||
'tools/server/public/**'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# Server build and tests
|
||||
name: Server
|
||||
|
||||
on:
|
||||
@@ -15,10 +14,34 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
||||
paths: [
|
||||
'.github/workflows/server.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/Makefile',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'tools/server/**.*'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
||||
paths: [
|
||||
'.github/workflows/server.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/Makefile',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'tools/server/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
@@ -34,17 +57,18 @@ jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
name: server (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
|
||||
build_type: [RelWithDebInfo]
|
||||
build_type: [Release]
|
||||
wf_name: ["default"]
|
||||
include:
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
extra_args: ""
|
||||
wf_name: "default"
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
@@ -74,13 +98,7 @@ jobs:
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_SCHED_NO_REALLOC=ON \
|
||||
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
|
||||
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
|
||||
+20
-36
@@ -2,29 +2,13 @@
|
||||
# multiplie collaborators per item can be specified
|
||||
|
||||
/.devops/*.Dockerfile @ngxson
|
||||
/.github/actions/ @CISC
|
||||
/.github/workflows/ @CISC
|
||||
/.github/actions/ @ggml-org/ci
|
||||
/.github/workflows/ @ggml-org/ci
|
||||
/ci/ @ggerganov
|
||||
/cmake/ @ggerganov
|
||||
/common/CMakeLists.txt @ggerganov
|
||||
/common/arg.* @ggerganov
|
||||
/common/base64.hpp.* @ggerganov
|
||||
/common/build-info.* @ggerganov
|
||||
/common/chat.* @pwilkin
|
||||
/common/chat-auto*.* @pwilkin
|
||||
/common/chat-diff-analyzer.* @pwilkin
|
||||
/common/chat-peg-parser.* @aldehir
|
||||
/common/common.* @ggerganov
|
||||
/common/console.* @ggerganov
|
||||
/common/http.* @angt
|
||||
/common/jinja/ @ngxson @CISC @aldehir
|
||||
/common/llguidance.* @ggerganov
|
||||
/common/log.* @ggerganov
|
||||
/common/ @ggml-org/llama-common
|
||||
/common/jinja/ @CISC
|
||||
/common/ngram-map.* @srogmann
|
||||
/common/peg-parser.* @aldehir
|
||||
/common/sampling.* @ggerganov
|
||||
/common/speculative.* @ggerganov
|
||||
/common/unicode.* @aldehir
|
||||
/convert_*.py @CISC
|
||||
/examples/batched.swift/ @ggerganov
|
||||
/examples/batched/ @ggerganov
|
||||
@@ -51,29 +35,27 @@
|
||||
/examples/speculative/ @ggerganov
|
||||
/ggml/cmake/ @ggerganov
|
||||
/ggml/include/ @ggerganov
|
||||
/ggml/src/ggml-cann/ @ggml-org/ggml-cann
|
||||
/ggml/src/ggml-common.h @ggerganov
|
||||
/ggml/src/ggml-cpu/ @ggerganov
|
||||
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
|
||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an
|
||||
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda
|
||||
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
|
||||
/ggml/src/ggml-hip/ @IMbackK
|
||||
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
|
||||
/ggml/src/ggml-impl.h @ggerganov
|
||||
/ggml/src/ggml-metal/ @ggerganov
|
||||
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
|
||||
/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
|
||||
/ggml/src/ggml-metal/ @ggml-org/ggml-metal
|
||||
/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl
|
||||
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
|
||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||
/ggml/src/ggml-quants.* @ggerganov
|
||||
/ggml/src/ggml-rpc/ @rgerganov
|
||||
/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc
|
||||
/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl
|
||||
/ggml/src/ggml-threading.* @ggerganov
|
||||
/ggml/src/ggml-vulkan/ @0cc4m
|
||||
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
|
||||
/ggml/src/ggml-virtgpu/ @kpouget
|
||||
/ggml/src/ggml-webgpu/ @reeselevine
|
||||
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
|
||||
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
|
||||
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
|
||||
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
|
||||
/ggml/src/ggml.c @ggerganov
|
||||
/ggml/src/ggml.cpp @ggerganov
|
||||
@@ -93,16 +75,18 @@
|
||||
/src/models/ @CISC
|
||||
/tests/ @ggerganov
|
||||
/tests/test-chat.* @pwilkin
|
||||
/tests/test-llama-archs.cpp @JohannesGaessler
|
||||
/tools/batched-bench/ @ggerganov
|
||||
/tools/cli/ @ngxson
|
||||
/tools/completion/ @ggerganov
|
||||
/tools/mtmd/ @ngxson
|
||||
/tools/mtmd/ @ggml-org/llama-mtmd
|
||||
/tools/perplexity/ @ggerganov
|
||||
/tools/parser/ @pwilkin
|
||||
/tools/quantize/ @ggerganov
|
||||
/tools/rpc/ @rgerganov
|
||||
/tools/server/* @ngxson @ggerganov # no subdir
|
||||
/tools/server/webui/ @allozaur
|
||||
/tools/rpc/ @ggml-org/ggml-rpc
|
||||
/tools/server/* @ggml-org/llama-server # no subdir
|
||||
/tools/server/tests/ @ggml-org/llama-server
|
||||
/tools/server/webui/ @ggml-org/llama-webui
|
||||
/tools/tokenize/ @ggerganov
|
||||
/tools/tts/ @ggerganov
|
||||
/vendor/ @ggerganov
|
||||
|
||||
@@ -24,9 +24,9 @@ Fri Mar 6 11:39:45 2026
|
||||
+-----------------------------------------+------------------------+----------------------+
|
||||
```
|
||||
|
||||
## ggml-org/nemotron-3-super-120b-GGUF
|
||||
## ggml-org/Nemotron-3-Super-120B-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/nemotron-3-super-120b-GGUF
|
||||
Model: https://huggingface.co/ggml-org/Nemotron-3-Super-120B-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
@@ -53,7 +53,6 @@ main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_
|
||||
| 8192 | 32 | 16 | 131584 | 171.066 | 766.21 | 10.774 | 47.52 | 181.840 | 723.62 |
|
||||
| 8192 | 32 | 32 | 263168 | 342.140 | 766.19 | 18.969 | 53.98 | 361.109 | 728.78 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | n_ubatch | fa | test | t/s |
|
||||
@@ -70,3 +69,49 @@ main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d32768 | 19.45 ± 0.18 |
|
||||
|
||||
build: 04a65daab (8268)
|
||||
|
||||
## ggml-org/Nemotron-3-Nano-4B-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/Nemotron-3-Nano-4B-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.152 | 3371.61 | 0.597 | 53.64 | 0.748 | 726.90 |
|
||||
| 512 | 32 | 2 | 1088 | 0.319 | 3208.68 | 0.857 | 74.66 | 1.176 | 924.89 |
|
||||
| 512 | 32 | 4 | 2176 | 0.720 | 2843.56 | 1.323 | 96.78 | 2.043 | 1065.18 |
|
||||
| 512 | 32 | 8 | 4352 | 1.428 | 2867.96 | 2.311 | 110.76 | 3.739 | 1163.82 |
|
||||
| 512 | 32 | 16 | 8704 | 2.857 | 2866.94 | 4.203 | 121.82 | 7.060 | 1232.82 |
|
||||
| 512 | 32 | 32 | 17408 | 5.709 | 2869.76 | 7.964 | 128.58 | 13.673 | 1273.14 |
|
||||
| 4096 | 32 | 1 | 4128 | 1.458 | 2809.76 | 0.605 | 52.92 | 2.062 | 2001.52 |
|
||||
| 4096 | 32 | 2 | 8256 | 2.905 | 2819.95 | 0.875 | 73.12 | 3.780 | 2183.95 |
|
||||
| 4096 | 32 | 4 | 16512 | 5.790 | 2829.74 | 1.361 | 94.07 | 7.151 | 2309.17 |
|
||||
| 4096 | 32 | 8 | 33024 | 11.598 | 2825.32 | 2.378 | 107.65 | 13.976 | 2362.89 |
|
||||
| 4096 | 32 | 16 | 66048 | 23.208 | 2823.88 | 4.348 | 117.76 | 27.556 | 2396.89 |
|
||||
| 4096 | 32 | 32 | 132096 | 46.515 | 2817.85 | 8.279 | 123.69 | 54.794 | 2410.79 |
|
||||
| 8192 | 32 | 1 | 8224 | 2.950 | 2776.95 | 0.617 | 51.89 | 3.567 | 2305.75 |
|
||||
| 8192 | 32 | 2 | 16448 | 5.921 | 2767.32 | 0.896 | 71.45 | 6.816 | 2413.05 |
|
||||
| 8192 | 32 | 4 | 32896 | 11.842 | 2767.21 | 1.401 | 91.34 | 13.243 | 2484.03 |
|
||||
| 8192 | 32 | 8 | 65792 | 23.726 | 2762.17 | 2.461 | 104.03 | 26.187 | 2512.38 |
|
||||
| 8192 | 32 | 16 | 131584 | 47.777 | 2743.43 | 4.577 | 111.86 | 52.354 | 2513.36 |
|
||||
| 8192 | 32 | 32 | 263168 | 96.691 | 2711.16 | 8.772 | 116.73 | 105.463 | 2495.36 |
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | n_ubatch | fa | test | t/s |
|
||||
| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 | 2761.90 ± 19.31 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 | 52.85 ± 0.12 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d4096 | 2687.07 ± 21.84 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d4096 | 52.32 ± 0.23 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d8192 | 2564.52 ± 57.69 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d8192 | 51.27 ± 0.34 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d16384 | 2334.02 ± 37.83 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d16384 | 49.71 ± 0.14 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d32768 | 2041.46 ± 40.45 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d32768 | 46.71 ± 0.13 |
|
||||
|
||||
build: 1bbec6a75 (8382)
|
||||
|
||||
@@ -479,6 +479,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
|
||||
|
||||
if (!comparison_with_tools || !comparison_with_reasoning) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
const auto & diff_tools = comparison_with_tools->diff;
|
||||
@@ -911,8 +912,10 @@ void analyze_tools::extract_function_markers() {
|
||||
// we'll have to rely on an extra diff with no-calls version
|
||||
auto notool_comp = compare_variants(
|
||||
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_nocall }); });
|
||||
auto nt_diff = notool_comp->diff;
|
||||
closer_suffix = nt_diff.left.substr(nt_diff.left.find("YYYY") + 4);
|
||||
if (notool_comp) {
|
||||
auto nt_diff = notool_comp->diff;
|
||||
closer_suffix = nt_diff.left.substr(nt_diff.left.find("YYYY") + 4);
|
||||
}
|
||||
} else {
|
||||
closer_suffix = diff.suffix.substr(0, diff.suffix.find(suffix_marker));
|
||||
}
|
||||
|
||||
+4
-1
@@ -1519,7 +1519,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
// map developer to system for all models except for GPT-OSS
|
||||
workaround::map_developer_role_to_system(params.messages);
|
||||
}
|
||||
workaround::func_args_not_string(params.messages);
|
||||
|
||||
if (!tmpl.original_caps().supports_system_role) {
|
||||
workaround::system_message_not_supported(params.messages);
|
||||
@@ -1532,6 +1531,10 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
workaround::requires_non_null_content(params.messages);
|
||||
}
|
||||
|
||||
if (tmpl.original_caps().supports_object_arguments) {
|
||||
workaround::func_args_not_string(params.messages);
|
||||
}
|
||||
|
||||
params.extra_context = common_chat_extra_context();
|
||||
for (auto el : inputs.chat_template_kwargs) {
|
||||
params.extra_context[el.first] = json::parse(el.second);
|
||||
|
||||
+108
-12
@@ -75,6 +75,7 @@ std::map<std::string, bool> caps::to_map() const {
|
||||
{"supports_parallel_tool_calls", supports_parallel_tool_calls},
|
||||
{"supports_system_role", supports_system_role},
|
||||
{"supports_preserve_reasoning", supports_preserve_reasoning},
|
||||
{"supports_object_arguments", supports_object_arguments},
|
||||
};
|
||||
}
|
||||
|
||||
@@ -158,9 +159,9 @@ caps caps_get(jinja::program & prog) {
|
||||
}
|
||||
);
|
||||
|
||||
JJ_DEBUG("%s\n", ">>> Running capability check: single tool support");
|
||||
JJ_DEBUG("%s\n", ">>> Running capability check: single tool with object arguments support");
|
||||
|
||||
// case: tools support: single call
|
||||
// case: tools support: single call with object arguments
|
||||
caps_try_execute(
|
||||
prog,
|
||||
[&]() {
|
||||
@@ -226,9 +227,7 @@ caps caps_get(jinja::program & prog) {
|
||||
},
|
||||
[&](bool success, value & messages, value & tools) {
|
||||
if (!success) {
|
||||
result.supports_tool_calls = false;
|
||||
result.supports_tools = false;
|
||||
return;
|
||||
return; // Nothing can be inferred
|
||||
}
|
||||
|
||||
auto & tool_name = tools->at(0)->at("function")->at("name");
|
||||
@@ -242,16 +241,117 @@ caps caps_get(jinja::program & prog) {
|
||||
caps_print_stats(tool_calls, "messages[1].tool_calls");
|
||||
if (!tool_calls->stats.used) {
|
||||
result.supports_tool_calls = false;
|
||||
return;
|
||||
}
|
||||
|
||||
auto & tool_arg = tool_calls->at(0)->at("function")->at("arguments")->at("arg");
|
||||
caps_print_stats(tool_arg, "messages[1].tool_calls[0].function.arguments.arg");
|
||||
if (tool_arg->stats.used) {
|
||||
result.supports_object_arguments = true;
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
if (!result.supports_object_arguments) {
|
||||
JJ_DEBUG("%s\n", ">>> Running capability check: single tool with string arguments support");
|
||||
|
||||
// case: tools support: single call with string arguments
|
||||
caps_try_execute(
|
||||
prog,
|
||||
[&]() {
|
||||
// messages
|
||||
return json::array({
|
||||
{
|
||||
{"role", "user"},
|
||||
{"content", "User message"},
|
||||
},
|
||||
{
|
||||
{"role", "assistant"},
|
||||
{"content", ""}, // Some templates expect content to be empty with tool calls
|
||||
{"tool_calls", json::array({
|
||||
{
|
||||
{"id", "call00001"},
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", "tool1"},
|
||||
{"arguments", R"({"arg": "value"})"}
|
||||
}}
|
||||
}
|
||||
})}
|
||||
},
|
||||
{
|
||||
{"role", "tool"},
|
||||
{"content", "Tool response"},
|
||||
{"tool_call_id", "call00001"}
|
||||
},
|
||||
{
|
||||
{"role", "assistant"},
|
||||
{"content", "The tool response was 'tool response'"}
|
||||
},
|
||||
{
|
||||
{"role", "user"},
|
||||
{"content", "User message"},
|
||||
},
|
||||
});
|
||||
},
|
||||
[&]() {
|
||||
// tools
|
||||
return json::array({
|
||||
{
|
||||
{"name", "tool"},
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", "tool1"},
|
||||
{"description", "Tool description"},
|
||||
{"parameters", {
|
||||
{"type", "object"},
|
||||
{"properties", {
|
||||
{"arg", {
|
||||
{"type", "string"},
|
||||
{"description", "Arg description"},
|
||||
}},
|
||||
}},
|
||||
{"required", json::array({ "arg" })},
|
||||
}},
|
||||
}},
|
||||
},
|
||||
});
|
||||
},
|
||||
[&](bool success, value & messages, value & tools) {
|
||||
if (!success) {
|
||||
result.supports_tool_calls = false;
|
||||
result.supports_tools = false;
|
||||
return;
|
||||
}
|
||||
|
||||
auto & tool_name = tools->at(0)->at("function")->at("name");
|
||||
caps_print_stats(tool_name, "tools[0].function.name");
|
||||
caps_print_stats(tools, "tools");
|
||||
if (!tool_name->stats.used) {
|
||||
result.supports_tools = false;
|
||||
}
|
||||
|
||||
auto & tool_calls = messages->at(1)->at("tool_calls");
|
||||
caps_print_stats(tool_calls, "messages[1].tool_calls");
|
||||
if (!tool_calls->stats.used) {
|
||||
result.supports_tool_calls = false;
|
||||
return;
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
JJ_DEBUG("%s\n", ">>> Running capability check: parallel tool support");
|
||||
|
||||
// case: tools support: parallel calls
|
||||
caps_try_execute(
|
||||
prog,
|
||||
[&]() {
|
||||
json args = json(R"({"arg": "value"})");
|
||||
if (result.supports_object_arguments) {
|
||||
args = json{{"arg", "value"}};
|
||||
}
|
||||
|
||||
// messages
|
||||
return json::array({
|
||||
{
|
||||
@@ -267,9 +367,7 @@ caps caps_get(jinja::program & prog) {
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", "tool1"},
|
||||
{"arguments", {
|
||||
{"arg", "value"}
|
||||
}}
|
||||
{"arguments", args}
|
||||
}}
|
||||
},
|
||||
{
|
||||
@@ -277,9 +375,7 @@ caps caps_get(jinja::program & prog) {
|
||||
{"type", "function"},
|
||||
{"function", {
|
||||
{"name", "tool1"},
|
||||
{"arguments", {
|
||||
{"arg", "value"}
|
||||
}}
|
||||
{"arguments", args}
|
||||
}}
|
||||
}
|
||||
})}
|
||||
@@ -328,7 +424,7 @@ caps caps_get(jinja::program & prog) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto & tool_calls = messages->at(1)->at("tool_calls");;
|
||||
auto & tool_calls = messages->at(1)->at("tool_calls");
|
||||
caps_print_stats(tool_calls, "messages[1].tool_calls");
|
||||
|
||||
// check for second tool call usage
|
||||
|
||||
@@ -18,6 +18,8 @@ struct caps {
|
||||
bool supports_string_content = true;
|
||||
bool supports_typed_content = false;
|
||||
|
||||
bool supports_object_arguments = false;
|
||||
|
||||
// for reporting on server
|
||||
std::map<std::string, bool> to_map() const;
|
||||
|
||||
|
||||
@@ -102,7 +102,7 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
|
||||
auto is_star = *it == '*';
|
||||
++it;
|
||||
if (is_star) {
|
||||
if (*it == '?') {
|
||||
if (it != end && *it == '?') {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
+136
-53
@@ -272,8 +272,9 @@ class ModelBase:
|
||||
return tensors
|
||||
|
||||
def dequant_model(self):
|
||||
if self._is_nvfp4:
|
||||
return # NVFP4 weights are repacked in _generate_nvfp4_tensors
|
||||
# If all quantized tensors were already handled (e.g. pure NVFP4), skip
|
||||
if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
|
||||
return
|
||||
|
||||
tensors_to_remove: list[str] = []
|
||||
new_tensors: dict[str, Callable[[], Tensor]] = {}
|
||||
@@ -297,11 +298,16 @@ class ModelBase:
|
||||
scale = scale.float()
|
||||
|
||||
if block_size is not None:
|
||||
dim_offset = scale.ndim - len(block_size)
|
||||
for i, size in enumerate(block_size):
|
||||
scale = scale.repeat_interleave(size, i)
|
||||
scale = scale.repeat_interleave(size, dim_offset + i)
|
||||
# unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
|
||||
scale = scale[tuple(slice(0, size) for size in weight.shape)]
|
||||
|
||||
# align scale dims to weight for correct broadcasting (e.g. [128] -> [128, 1, 1])
|
||||
while scale.ndim < weight.ndim:
|
||||
scale = scale.unsqueeze(-1)
|
||||
|
||||
return weight.float() * scale
|
||||
|
||||
# ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476
|
||||
@@ -392,7 +398,7 @@ class ModelBase:
|
||||
elif quant_method == "fp8":
|
||||
block_size = quant_config.get("weight_block_size")
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale_inv"):
|
||||
if name.endswith("_scale_inv"):
|
||||
weight_name = name.removesuffix("_scale_inv")
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
@@ -400,6 +406,8 @@ class ModelBase:
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith(".activation_scale"): # unused
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused
|
||||
tensors_to_remove.append(name)
|
||||
# mistral format
|
||||
if name.endswith(".qscale_weight"):
|
||||
weight_name = name.removesuffix("qscale_weight") + "weight"
|
||||
@@ -474,7 +482,20 @@ class ModelBase:
|
||||
tensors_to_remove.append(base_name + "_zero_point")
|
||||
else:
|
||||
raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
|
||||
else:
|
||||
elif quant_method == "modelopt":
|
||||
# Mixed-precision ModelOpt models: NVFP4 tensors are handled by
|
||||
# _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and
|
||||
# are dequantized here. input_scale tensors are unused.
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method is not None:
|
||||
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
|
||||
|
||||
for name in tensors_to_remove:
|
||||
@@ -520,12 +541,6 @@ class ModelBase:
|
||||
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# skip NVFP4 auxiliary tensors (handled in _generate_nvfp4_tensors)
|
||||
if self._is_nvfp4:
|
||||
if name.endswith((".weight_scale", ".weight_scale_2", ".input_scale", ".k_scale", ".v_scale")):
|
||||
return []
|
||||
if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
|
||||
return []
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
@@ -609,6 +624,7 @@ class ModelBase:
|
||||
expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
|
||||
expert_shapes: dict[tuple[int, str], list[int]] = {}
|
||||
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
|
||||
consumed: list[str] = []
|
||||
|
||||
for name in list(self.model_tensors.keys()):
|
||||
if not name.endswith(".weight"):
|
||||
@@ -620,8 +636,18 @@ class ModelBase:
|
||||
# Force eager materialization of lazy tensors
|
||||
weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
|
||||
scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
|
||||
|
||||
# Skip non-NVFP4 tensors (e.g. FP8 with per-channel 1D scales)
|
||||
if scale.ndim < 2:
|
||||
continue
|
||||
|
||||
scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
|
||||
|
||||
# Mark tensors for removal from model_tensors (already written to gguf)
|
||||
consumed.extend([name, scale_name])
|
||||
if scale2_name in self.model_tensors:
|
||||
consumed.append(scale2_name)
|
||||
|
||||
# Check if this is a per-expert tensor
|
||||
m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
|
||||
if m:
|
||||
@@ -652,6 +678,15 @@ class ModelBase:
|
||||
for (bid, proj_type) in list(expert_blocks.keys()):
|
||||
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
|
||||
|
||||
# Remove consumed tensors so get_tensors/modify_tensors won't see them
|
||||
for name in consumed:
|
||||
self.model_tensors.pop(name, None)
|
||||
|
||||
# Remove unused auxiliary tensors (input_scale, k_scale, v_scale)
|
||||
for name in list(self.model_tensors.keys()):
|
||||
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
|
||||
del self.model_tensors[name]
|
||||
|
||||
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
|
||||
experts = expert_blocks.pop(key)
|
||||
scales = expert_scales.pop(key)
|
||||
@@ -677,20 +712,31 @@ class ModelBase:
|
||||
def prepare_tensors(self):
|
||||
# detect NVFP4 quantization (ModelOpt format)
|
||||
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
|
||||
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
|
||||
quant_config_file = self.dir_model / "hf_quant_config.json"
|
||||
|
||||
if not quant_algo and quant_config_file.is_file():
|
||||
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
|
||||
with open(quant_config_file, "r", encoding="utf-8") as f:
|
||||
quant_algo = (json.load(f).get("quantization") or {}).get("quant_algo")
|
||||
quant_config = json.load(f).get("quantization") or {}
|
||||
quant_algo = quant_config.get("quant_algo", quant_algo)
|
||||
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
|
||||
|
||||
# Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with
|
||||
# per-layer NVFP4/FP8) instead of a single global "NVFP4" value.
|
||||
if quant_algo != "NVFP4":
|
||||
if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
|
||||
quant_algo = "NVFP4"
|
||||
|
||||
self._is_nvfp4 = quant_algo == "NVFP4"
|
||||
|
||||
self.dequant_model()
|
||||
|
||||
# NVFP4 weights are repacked and written directly to gguf_writer
|
||||
# NVFP4 weights are repacked and written directly to gguf_writer.
|
||||
# This must run before dequant_model so NVFP4 tensors are removed
|
||||
# from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
|
||||
if self._is_nvfp4:
|
||||
self._generate_nvfp4_tensors()
|
||||
|
||||
self.dequant_model()
|
||||
|
||||
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
|
||||
if self.tensor_map.mapping:
|
||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||
@@ -2992,10 +3038,16 @@ class LlavaVisionModel(MmprojModel):
|
||||
def get_token_id(self, token: str) -> int:
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
added_tokens_decoder = json.load(f)['added_tokens_decoder']
|
||||
added_tokens_decoder = json.load(f).get('added_tokens_decoder') or {}
|
||||
for id_, token_data in added_tokens_decoder.items():
|
||||
if token_data["content"] == token:
|
||||
if token_data.get("content") == token:
|
||||
return int(id_)
|
||||
# fallthrough to tokenizer.json
|
||||
with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
for token_data in tokenizer_json["added_tokens"]:
|
||||
if token_data["content"] == token:
|
||||
return int(token_data["id"])
|
||||
raise ValueError(f"Token '{token}' not found in tokenizer config.")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
@@ -3159,40 +3211,6 @@ class Llama4VisionModel(MmprojModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
"Mistral3ForConditionalGeneration",
|
||||
"Ministral3ForCausalLM",
|
||||
)
|
||||
class Mistral3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.MISTRAL3
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# for compatibility, we use LLAMA arch for older models
|
||||
# TODO: remove this once everyone has migrated to newer version of llama.cpp
|
||||
if self.hparams.get("model_type") != "ministral3":
|
||||
self.model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
|
||||
self.gguf_writer.add_architecture()
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
rope_params = self.rope_parameters
|
||||
if self.hparams.get("model_type") == "ministral3":
|
||||
assert rope_params, "ministral3 must have 'rope_parameters' config"
|
||||
assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
|
||||
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
name = name.replace("language_model.", "")
|
||||
if "multi_modal_projector" in name or "vision_tower" in name:
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("DeciLMForCausalLM")
|
||||
class DeciModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.DECI
|
||||
@@ -8232,6 +8250,8 @@ class DeepseekV2Model(TextModel):
|
||||
# TODO @ngxson : remove this when we support MTP for deepseek models
|
||||
skip_mtp = True
|
||||
|
||||
merge_expert = True
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_gpt2()
|
||||
@@ -8370,7 +8390,7 @@ class DeepseekV2Model(TextModel):
|
||||
return
|
||||
|
||||
# process the experts separately
|
||||
if name.find("mlp.experts") != -1:
|
||||
if self.merge_expert and name.find("mlp.experts") != -1:
|
||||
n_experts = self.hparams["n_routed_experts"]
|
||||
assert bid is not None
|
||||
|
||||
@@ -8429,6 +8449,69 @@ class DeepseekV2Model(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
"Mistral3ForConditionalGeneration",
|
||||
"Ministral3ForCausalLM",
|
||||
)
|
||||
class Mistral3Model(TextModel):
|
||||
class Ministral3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.MISTRAL3
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
rope_params = self.rope_parameters
|
||||
if self.hparams.get("model_type") == "ministral3":
|
||||
assert rope_params, "ministral3 must have 'rope_parameters' config"
|
||||
assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
|
||||
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
name = name.replace("language_model.", "")
|
||||
if "multi_modal_projector" in name or "vision_tower" in name:
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
class Mistral4Model(DeepseekV2Model):
|
||||
model_arch = gguf.MODEL_ARCH.MISTRAL4
|
||||
skip_mtp = False # model contains no MTP layers, so no need to skip
|
||||
merge_expert = False # experts are already stacked as 3D
|
||||
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
if name.endswith(".down_proj") or name.endswith(".gate_up_proj"):
|
||||
name = name + ".weight"
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
model_arch = gguf.MODEL_ARCH.MISTRAL3 # unused
|
||||
impl: TextModel
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
if self.hparams.get("model_type") == "mistral4":
|
||||
self.impl = Mistral3Model.Mistral4Model(*args, **kwargs)
|
||||
else:
|
||||
self.impl = Mistral3Model.Ministral3Model(*args, **kwargs)
|
||||
|
||||
def set_vocab(self):
|
||||
self.impl.set_vocab()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.impl.set_gguf_parameters()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
yield from self.impl.modify_tensors(data_torch, name, bid)
|
||||
|
||||
def prepare_tensors(self):
|
||||
self.impl.prepare_tensors()
|
||||
|
||||
def write_vocab(self):
|
||||
self.impl.write_vocab()
|
||||
|
||||
def write(self):
|
||||
self.impl.write()
|
||||
|
||||
|
||||
@ModelBase.register("MiniMaxM2ForCausalLM")
|
||||
class MiniMaxM2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.MINIMAXM2
|
||||
|
||||
@@ -128,6 +128,12 @@ class LoraTorchTensor:
|
||||
assert dim is None
|
||||
return self.shape
|
||||
|
||||
def contiguous(self) -> LoraTorchTensor:
|
||||
return LoraTorchTensor(
|
||||
self._lora_A.contiguous(),
|
||||
self._lora_B.contiguous(),
|
||||
)
|
||||
|
||||
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
|
||||
if isinstance(shape[0], tuple):
|
||||
new_shape: tuple[int, ...] = shape[0]
|
||||
|
||||
+1
-1
@@ -117,5 +117,5 @@ Legend:
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
+288
-18
@@ -5937,6 +5937,20 @@
|
||||
"SYCL0","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.100000","support","1","yes","SYCL"
|
||||
"SYCL0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=0,eps=10.000000","support","1","yes","SYCL"
|
||||
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=10.000000,inplace=0","support","1","yes","SYCL"
|
||||
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=10.000000","support","1","yes","SYCL"
|
||||
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=10.000000,inplace=0","support","1","yes","SYCL"
|
||||
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=10.000000","support","1","yes","SYCL"
|
||||
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=10.000000","support","1","yes","SYCL"
|
||||
"SYCL0","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=10.000000,inplace=0","support","1","yes","SYCL"
|
||||
"SYCL0","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=10.000000","support","1","yes","SYCL"
|
||||
"SYCL0","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=10.000000,inplace=0","support","1","yes","SYCL"
|
||||
"SYCL0","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=10.000000","support","1","yes","SYCL"
|
||||
"SYCL0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","SYCL"
|
||||
"SYCL0","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","SYCL"
|
||||
@@ -10209,24 +10223,24 @@
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=0","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=1","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|antialias","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear|antialias","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|align_corners","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear|align_corners","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear|align_corners","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic|align_corners","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic|align_corners","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic|align_corners","support","0","no","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=0","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=1","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|antialias","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear|antialias","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|align_corners","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear|align_corners","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear|align_corners","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic|align_corners","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic|align_corners","support","1","yes","SYCL"
|
||||
"SYCL0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic|align_corners","support","1","yes","SYCL"
|
||||
"SYCL0","SUM","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL"
|
||||
"SYCL0","SUM","type=f32,ne=[11,5,6,3],permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","SUM","type=f32,ne=[11,5,6,3],permute=[0,3,2,1]","support","0","no","SYCL"
|
||||
@@ -13325,6 +13339,262 @@
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","SYCL"
|
||||
|
||||
|
Can't render this file because it is too large.
|
@@ -666,7 +666,7 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
#if defined __ARM_NEON
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
|
||||
const int8x16_t values = vld1q_s8(kvalues_mxfp4);
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
float32x4_t acc = vdupq_n_f32(0.0f);
|
||||
|
||||
@@ -1473,10 +1473,12 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
||||
} else {
|
||||
if (op->src[0]->type != GGML_TYPE_F16) {
|
||||
return nullptr;
|
||||
}
|
||||
std::array<ggml_kleidiai_kernels *, GGML_KLEIDIAI_MAX_KERNEL_SLOTS> kernel_chain;
|
||||
const int slot_total = kleidiai_collect_kernel_chain(op, kernel_chain);
|
||||
const bool has_kernel = slot_total > 0;
|
||||
if (has_kernel && op->src[1]->ne[1] > 1) {
|
||||
if (slot_total > 0 && op->src[1]->ne[1] > 1) {
|
||||
if ((op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
|
||||
(op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
|
||||
return nullptr;
|
||||
|
||||
@@ -6205,7 +6205,7 @@ static void ggml_compute_forward_im2col_f16(
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
@@ -6236,7 +6236,7 @@ static void ggml_compute_forward_im2col_f16(
|
||||
int ofs1 = is_2D ? nb12 : nb11;
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
|
||||
|
||||
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||
{
|
||||
@@ -6249,7 +6249,12 @@ static void ggml_compute_forward_im2col_f16(
|
||||
|
||||
// micro kernel
|
||||
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
||||
const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
|
||||
const float * const src_data_f32 = src1->type == GGML_TYPE_F32
|
||||
? (const float *)((const char *) src1->data + in*ofs0 + iic*ofs1)
|
||||
: nullptr; // [IH, IW]
|
||||
const ggml_fp16_t * const src_data_f16 = src1->type == GGML_TYPE_F16
|
||||
? (const ggml_fp16_t *)((const char *) src1->data + in*ofs0 + iic*ofs1)
|
||||
: nullptr; // [IH, IW]
|
||||
|
||||
for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
|
||||
for (int64_t ikw = 0; ikw < KW; ikw++) {
|
||||
@@ -6259,7 +6264,11 @@ static void ggml_compute_forward_im2col_f16(
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
|
||||
} else {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]);
|
||||
if (src_data_f32 != nullptr) {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data_f32[iih*IW + iiw]);
|
||||
} else {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = src_data_f16[iih*IW + iiw];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -892,7 +892,7 @@ void launch_fattn(
|
||||
const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
const int ntiles_z_gqa = ((gqa_ratio + ncols2 - 1) / ncols2);
|
||||
const int ntiles_total = ntiles_x * ntiles_z_gqa * K->ne[2] * Q->ne[3];
|
||||
const int ntiles_dst = ntiles_x * ntiles_z_gqa * K->ne[2] * Q->ne[3];
|
||||
|
||||
// Optional optimization where the mask is scanned to determine whether part of the calculation can be skipped.
|
||||
// Only worth the overhead if there is at lease one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
|
||||
@@ -919,37 +919,37 @@ void launch_fattn(
|
||||
GGML_ASSERT(max_blocks_per_sm > 0);
|
||||
int parallel_blocks = max_blocks_per_sm;
|
||||
|
||||
const int ntiles_KV = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by KV cache length.
|
||||
|
||||
dim3 blocks_num;
|
||||
if (stream_k) {
|
||||
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
|
||||
const int max_blocks = max_blocks_per_sm*nsm;
|
||||
const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
|
||||
const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves);
|
||||
const int tiles_nwaves = (ntiles_dst + max_blocks - 1) / max_blocks;
|
||||
const int tiles_efficiency_percent = 100 * ntiles_dst / (max_blocks*tiles_nwaves);
|
||||
|
||||
const int nblocks_stream_k = max_blocks;
|
||||
const int nblocks_stream_k = std::min(max_blocks, ntiles_KV*ntiles_dst);
|
||||
|
||||
const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || amd_wmma_available(cc) || tiles_efficiency_percent < 75;
|
||||
|
||||
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
|
||||
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_dst;
|
||||
blocks_num.y = 1;
|
||||
blocks_num.z = 1;
|
||||
|
||||
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
||||
if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
||||
dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
|
||||
}
|
||||
} else {
|
||||
const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.
|
||||
|
||||
// parallel_blocks must not be larger than what the tensor size allows:
|
||||
parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
|
||||
parallel_blocks = std::min(parallel_blocks, ntiles_KV);
|
||||
|
||||
// If ntiles_total % blocks_per_wave != 0 then some efficiency is lost due to tail effects.
|
||||
// Test whether parallel_blocks can be set to a higher value for better efficiency.
|
||||
const int blocks_per_wave = nsm * max_blocks_per_sm;
|
||||
int nwaves_best = 0;
|
||||
int efficiency_percent_best = 0;
|
||||
for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KQ; ++parallel_blocks_test) {
|
||||
const int nblocks_total = ntiles_total * parallel_blocks_test;
|
||||
for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KV; ++parallel_blocks_test) {
|
||||
const int nblocks_total = ntiles_dst * parallel_blocks_test;
|
||||
const int nwaves = (nblocks_total + blocks_per_wave - 1) / blocks_per_wave;
|
||||
const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave);
|
||||
|
||||
@@ -1015,7 +1015,7 @@ void launch_fattn(
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (stream_k) {
|
||||
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
||||
if (ntiles_dst % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
||||
const dim3 block_dim_combine(DV, 1, 1);
|
||||
const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#include "gated_delta_net.cuh"
|
||||
|
||||
template <int S_v, bool KDA>
|
||||
__global__ void gated_delta_net_cuda(const float * q,
|
||||
__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
|
||||
gated_delta_net_cuda(const float * q,
|
||||
const float * k,
|
||||
const float * v,
|
||||
const float * g,
|
||||
@@ -38,7 +39,7 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||
|
||||
const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
|
||||
state += state_offset;
|
||||
curr_state += state_offset;
|
||||
curr_state += state_offset + col * S_v;
|
||||
attn_data += (sequence * n_tokens * H + h_idx) * S_v;
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v;
|
||||
@@ -46,10 +47,11 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||
constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
|
||||
float s_shard[rows_per_lane];
|
||||
// state is stored transposed: M[col][i] = S[i][col], row col is contiguous
|
||||
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
s_shard[r] = curr_state[col * S_v + i];
|
||||
s_shard[r] = curr_state[i];
|
||||
}
|
||||
|
||||
for (int t = 0; t < n_tokens; t++) {
|
||||
@@ -63,6 +65,16 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||
|
||||
const float beta_val = *beta_t;
|
||||
|
||||
// Cache k and q in registers
|
||||
float k_reg[rows_per_lane];
|
||||
float q_reg[rows_per_lane];
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
k_reg[r] = k_t[i];
|
||||
q_reg[r] = q_t[i];
|
||||
}
|
||||
|
||||
if constexpr (!KDA) {
|
||||
const float g_val = expf(*g_t);
|
||||
|
||||
@@ -70,8 +82,7 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||
float kv_shard = 0.0f;
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
kv_shard += s_shard[r] * k_t[i];
|
||||
kv_shard += s_shard[r] * k_reg[r];
|
||||
}
|
||||
float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||
|
||||
@@ -83,9 +94,8 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||
float attn_partial = 0.0f;
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
s_shard[r] = g_val * s_shard[r] + k_t[i] * delta_col;
|
||||
attn_partial += s_shard[r] * q_t[i];
|
||||
s_shard[r] = g_val * s_shard[r] + k_reg[r] * delta_col;
|
||||
attn_partial += s_shard[r] * q_reg[r];
|
||||
}
|
||||
|
||||
float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||
@@ -99,7 +109,7 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
kv_shard += expf(g_t[i]) * s_shard[r] * k_t[i];
|
||||
kv_shard += expf(g_t[i]) * s_shard[r] * k_reg[r];
|
||||
}
|
||||
|
||||
float kv_col = warp_reduce_sum<warp_size>(kv_shard);
|
||||
@@ -113,8 +123,8 @@ __global__ void gated_delta_net_cuda(const float * q,
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
s_shard[r] = expf(g_t[i]) * s_shard[r] + k_t[i] * delta_col;
|
||||
attn_partial += s_shard[r] * q_t[i];
|
||||
s_shard[r] = expf(g_t[i]) * s_shard[r] + k_reg[r] * delta_col;
|
||||
attn_partial += s_shard[r] * q_reg[r];
|
||||
}
|
||||
|
||||
float attn_col = warp_reduce_sum<warp_size>(attn_partial);
|
||||
|
||||
@@ -124,7 +124,10 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||
err = cudaMallocManaged(ptr, size);
|
||||
#if defined(GGML_USE_HIP)
|
||||
if (err == hipSuccess) {
|
||||
CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
||||
// hipMemAdviseSetCoarseGrain is an optional performance hint;
|
||||
// ignore errors (e.g. hipErrorInvalidValue on some APU/iGPU configs).
|
||||
cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
|
||||
(void)hipGetLastError(); // clear any error
|
||||
}
|
||||
|
||||
// fall back to cudaMalloc if not supported (e.g. on Windows)
|
||||
@@ -251,11 +254,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].supports_cooperative_launch = false;
|
||||
#endif // !(GGML_USE_MUSA)
|
||||
|
||||
// cudaMemGetInfo returns info for the current device
|
||||
size_t free_mem;
|
||||
CUDA_CHECK(cudaSetDevice(id));
|
||||
CUDA_CHECK(cudaMemGetInfo(&free_mem, NULL));
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||
|
||||
@@ -270,25 +268,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
}
|
||||
}
|
||||
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB (%zu MiB free)\n",
|
||||
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB\n",
|
||||
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
|
||||
device_vmm ? "yes" : "no", prop.warpSize,
|
||||
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
|
||||
(size_t)(prop.totalGlobalMem / (1024 * 1024)));
|
||||
#elif defined(GGML_USE_MUSA)
|
||||
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
|
||||
info.devices[id].warp_size = 32;
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
|
||||
(size_t)(prop.totalGlobalMem / (1024 * 1024)));
|
||||
#else
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
|
||||
(size_t)(prop.totalGlobalMem / (1024 * 1024)));
|
||||
std::string device_name(prop.name);
|
||||
if (device_name == "NVIDIA GeForce MX450") {
|
||||
turing_devices_without_mma.push_back({ id, device_name });
|
||||
@@ -303,6 +301,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
// TODO: Check for future drivers the default scheduling strategy and
|
||||
// remove this call again when cudaDeviceScheduleSpin is default.
|
||||
if (prop.major == 12 && prop.minor == 1) {
|
||||
CUDA_CHECK(cudaSetDevice(id));
|
||||
CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
|
||||
}
|
||||
|
||||
|
||||
+73
-16
@@ -60,11 +60,17 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
enum mmvq_parameter_table_id {
|
||||
MMVQ_PARAMETERS_GENERIC = 0,
|
||||
MMVQ_PARAMETERS_GCN,
|
||||
MMVQ_PARAMETERS_RDNA2
|
||||
MMVQ_PARAMETERS_RDNA2,
|
||||
MMVQ_PARAMETERS_RDNA3_0,
|
||||
MMVQ_PARAMETERS_RDNA4
|
||||
};
|
||||
|
||||
static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
|
||||
#if defined(RDNA2) || defined(RDNA3) || defined(RDNA4)
|
||||
#if defined(RDNA4)
|
||||
return MMVQ_PARAMETERS_RDNA4;
|
||||
#elif defined(RDNA3_0)
|
||||
return MMVQ_PARAMETERS_RDNA3_0;
|
||||
#elif defined(RDNA2) || defined(RDNA3_5)
|
||||
return MMVQ_PARAMETERS_RDNA2;
|
||||
#elif defined(GCN) || defined(CDNA)
|
||||
return MMVQ_PARAMETERS_GCN;
|
||||
@@ -74,7 +80,13 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
|
||||
}
|
||||
|
||||
static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
||||
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
return MMVQ_PARAMETERS_RDNA4;
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA3_0(cc)) {
|
||||
return MMVQ_PARAMETERS_RDNA3_0;
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) {
|
||||
return MMVQ_PARAMETERS_RDNA2;
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
@@ -83,7 +95,7 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
||||
return MMVQ_PARAMETERS_GENERIC;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
|
||||
static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC) {
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
@@ -114,6 +126,50 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_paramet
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (table_id == MMVQ_PARAMETERS_RDNA4) {
|
||||
// nwarps=8 benefits types with simple vec_dot on RDNA4 (ncols_dst=1).
|
||||
// Types with complex vec_dot (Q3_K, IQ2_*, IQ3_*) regress due to register
|
||||
// pressure and lookup table contention at higher thread counts.
|
||||
if (ncols_dst == 1) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return 8;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
if (table_id == MMVQ_PARAMETERS_RDNA3_0) {
|
||||
// RDNA3 (W7900): stricter whitelist than RDNA4.
|
||||
// Q2_K / Q5_K / IQ4_XS regress in full quant sweeps.
|
||||
if (ncols_dst == 1) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return 8;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -138,7 +194,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
|
||||
}
|
||||
|
||||
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false>
|
||||
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
|
||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
|
||||
@@ -151,7 +207,7 @@ static __global__ void mul_mat_vec_q(
|
||||
constexpr int qi = ggml_cuda_type_traits<type>::qi;
|
||||
constexpr int vdr = get_vdr_mmvq(type);
|
||||
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
|
||||
constexpr int nwarps = calc_nwarps(ncols_dst, table_id);
|
||||
constexpr int nwarps = calc_nwarps(type, ncols_dst, table_id);
|
||||
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id);
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
@@ -355,12 +411,13 @@ static __global__ void mul_mat_vec_q(
|
||||
}
|
||||
}
|
||||
|
||||
template<ggml_type type>
|
||||
static std::pair<dim3, dim3> calc_launch_params(
|
||||
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
|
||||
const int warp_size, const mmvq_parameter_table_id table_id) {
|
||||
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
|
||||
const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens);
|
||||
const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1);
|
||||
const dim3 block_dims(warp_size, calc_nwarps(type, ncols_dst, table_id), 1);
|
||||
return {block_nums, block_dims};
|
||||
}
|
||||
|
||||
@@ -420,7 +477,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
if (has_ids && ncols_dst > 1) {
|
||||
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
|
||||
constexpr int c_ncols_dst = 1;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
@@ -431,7 +488,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
switch (ncols_dst) {
|
||||
case 1: {
|
||||
constexpr int c_ncols_dst = 1;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
@@ -439,7 +496,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
} break;
|
||||
case 2: {
|
||||
constexpr int c_ncols_dst = 2;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
@@ -447,7 +504,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
} break;
|
||||
case 3: {
|
||||
constexpr int c_ncols_dst = 3;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
@@ -455,7 +512,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
} break;
|
||||
case 4: {
|
||||
constexpr int c_ncols_dst = 4;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
@@ -463,7 +520,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
} break;
|
||||
case 5: {
|
||||
constexpr int c_ncols_dst = 5;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
@@ -471,7 +528,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
} break;
|
||||
case 6: {
|
||||
constexpr int c_ncols_dst = 6;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
@@ -479,7 +536,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
} break;
|
||||
case 7: {
|
||||
constexpr int c_ncols_dst = 7;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
@@ -487,7 +544,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
|
||||
} break;
|
||||
case 8: {
|
||||
constexpr int c_ncols_dst = 8;
|
||||
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
|
||||
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
|
||||
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
|
||||
|
||||
Vendored
+8
@@ -207,6 +207,14 @@
|
||||
#define RDNA3
|
||||
#endif // defined(__GFX11__)
|
||||
|
||||
#if defined(__gfx1150__) || defined(__gfx1151__)
|
||||
#define RDNA3_5
|
||||
#endif // defined(__gfx1150__) || defined(__gfx1151__)
|
||||
|
||||
#if defined(RDNA3) && !defined(RDNA3_5)
|
||||
#define RDNA3_0
|
||||
#endif // defined(RDNA3) && !defined(RDNA3_5)
|
||||
|
||||
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
|
||||
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
|
||||
#define RDNA2
|
||||
|
||||
@@ -1142,6 +1142,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
op->src[0]->ne[0] != 128 &&
|
||||
op->src[0]->ne[0] != 192 &&
|
||||
op->src[0]->ne[0] != 256 &&
|
||||
op->src[0]->ne[0] != 320 &&
|
||||
op->src[0]->ne[0] != 576) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -6176,6 +6176,7 @@ template [[host_name("kernel_flash_attn_ext_f32_dk128_dv128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_f32_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_f32_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f32_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f32_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 32, 32>;
|
||||
@@ -6190,6 +6191,7 @@ template [[host_name("kernel_flash_attn_ext_f16_dk128_dv128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 576, 512>;
|
||||
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
@@ -6205,6 +6207,7 @@ template [[host_name("kernel_flash_attn_ext_bf16_dk128_dv128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 576, 512>;
|
||||
#endif
|
||||
|
||||
@@ -6220,6 +6223,7 @@ template [[host_name("kernel_flash_attn_ext_q4_0_dk128_dv128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 32, 32>;
|
||||
@@ -6234,6 +6238,7 @@ template [[host_name("kernel_flash_attn_ext_q4_1_dk128_dv128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 32, 32>;
|
||||
@@ -6248,6 +6253,7 @@ template [[host_name("kernel_flash_attn_ext_q5_0_dk128_dv128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 32, 32>;
|
||||
@@ -6262,6 +6268,7 @@ template [[host_name("kernel_flash_attn_ext_q5_1_dk128_dv128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 576, 512>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 32, 32>;
|
||||
@@ -6276,6 +6283,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_dk128_dv128")]] kernel flash_at
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 192>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 128>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 320, 256>;
|
||||
template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 576, 512>;
|
||||
|
||||
#undef FA_TYPES
|
||||
@@ -6846,6 +6854,17 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk256_dv256")]] kernel flas
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 256, 256, 1>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 256, 256, 1>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f32_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES_F32, float4, 1, dequantize_f32_t4, float4, 1, dequantize_f32_t4, 320, 256, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 320, 256, 2>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_flash_attn_ext_vec_bf16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 320, 256, 2>;
|
||||
#endif
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 320, 256, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 320, 256, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 320, 256, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 320, 256, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 320, 256, 2>;
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f32_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES_F32, float4, 1, dequantize_f32_t4, float4, 1, dequantize_f32_t4, 576, 512, 2>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 576, 512, 2>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
|
||||
@@ -4767,7 +4767,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
||||
sumqx += w*q*xb[j];
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
d = sumqx/sumq2;
|
||||
d = sumq2 > 0 ? sumqx/sumq2 : 0.f;
|
||||
float best = d*sumqx;
|
||||
for (int itry = -ntry; itry <= ntry; ++itry) {
|
||||
id = (itry + values[0])/max;
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "dmmv.hpp"
|
||||
#include "element_wise.hpp"
|
||||
#include "fattn.hpp"
|
||||
#include "gated_delta_net.hpp"
|
||||
#include "gla.hpp"
|
||||
#include "im2col.hpp"
|
||||
#include "mmq.hpp"
|
||||
@@ -31,6 +32,7 @@
|
||||
#include "norm.hpp"
|
||||
#include "outprod.hpp"
|
||||
#include "pad.hpp"
|
||||
#include "pad_reflect_1d.hpp"
|
||||
#include "quantize.hpp"
|
||||
#include "quants.hpp"
|
||||
#include "roll.hpp"
|
||||
@@ -39,8 +41,8 @@
|
||||
#include "ssm_conv.hpp"
|
||||
#include "softmax.hpp"
|
||||
#include "tsembd.hpp"
|
||||
#include "upscale.hpp"
|
||||
#include "wkv.hpp"
|
||||
#include "pad_reflect_1d.hpp"
|
||||
|
||||
|
||||
#endif // GGML_SYCL_BACKEND_HPP
|
||||
|
||||
@@ -216,7 +216,8 @@ struct sycl_device_info {
|
||||
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
bool vmm; // virtual memory support
|
||||
size_t total_vram;
|
||||
//sycl_hw_info hw_info; \\ device id and aarch, currently not used
|
||||
int max_work_group_sizes;
|
||||
sycl_hw_info hw_info;
|
||||
optimize_feature opt_feature;
|
||||
};
|
||||
|
||||
@@ -692,15 +693,20 @@ static __dpct_inline__ float get_alibi_slope(const float max_bias,
|
||||
return dpct::pow(base, exph);
|
||||
}
|
||||
|
||||
static const sycl::uint3 init_fastdiv_values(uint32_t d) {
|
||||
GGML_ASSERT(d != 0);
|
||||
static const sycl::uint3 init_fastdiv_values(uint64_t d_64) {
|
||||
GGML_ASSERT(d_64 != 0);
|
||||
GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
uint32_t d = (uint32_t)d_64;
|
||||
|
||||
// compute L = ceil(log2(d));
|
||||
uint32_t L = 0;
|
||||
while (L < 32 && (uint32_t{ 1 } << L) < d) {
|
||||
L++;
|
||||
}
|
||||
|
||||
uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
|
||||
// pack divisor as well to reduce error surface
|
||||
return sycl::uint3(mp, L, d);
|
||||
}
|
||||
|
||||
@@ -965,4 +971,23 @@ static T block_reduce(T val, T * shared_vals, int block_size_template) {
|
||||
return val;
|
||||
}
|
||||
|
||||
/*
|
||||
* for Debug
|
||||
*/
|
||||
static int get_thread_id() {
|
||||
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
||||
int blockId = item_ct1.get_group(2) +
|
||||
item_ct1.get_group(1) * item_ct1.get_group_range(2) +
|
||||
item_ct1.get_group(0) * item_ct1.get_group_range(2) *
|
||||
item_ct1.get_group_range(1);
|
||||
int threadsPerBlock = item_ct1.get_local_range(2) *
|
||||
item_ct1.get_local_range(1) * item_ct1.get_local_range(0);
|
||||
int threadInBlockId = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_local_id(1) * item_ct1.get_local_range(2) +
|
||||
item_ct1.get_local_id(0) * item_ct1.get_local_range(2) *
|
||||
item_ct1.get_local_range(1);
|
||||
int id = blockId * threadsPerBlock + threadInBlockId;
|
||||
return id;
|
||||
}
|
||||
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
||||
@@ -294,30 +294,6 @@ static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl:
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void upscale(const T *x, T *dst, const int nb00, const int nb01,
|
||||
const int nb02, const int nb03, const int ne10, const int ne11,
|
||||
const int ne12, const int ne13, const float sf0, const float sf1,
|
||||
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
||||
int index = item_ct1.get_local_id(0) +
|
||||
item_ct1.get_group(0) * item_ct1.get_local_range(0);
|
||||
if (index >= ne10 * ne11 * ne12 * ne13) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
int i10 = index % ne10;
|
||||
int i11 = (index / ne10) % ne11;
|
||||
int i12 = (index / (ne10 * ne11)) % ne12;
|
||||
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
||||
|
||||
int i00 = static_cast<int>(i10 / sf0);
|
||||
int i01 = static_cast<int>(i11 / sf1);
|
||||
int i02 = static_cast<int>(i12 / sf2);
|
||||
int i03 = static_cast<int>(i13 / sf3);
|
||||
|
||||
dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void clamp(const T * x, T * dst, const float min, const float max, const int k,
|
||||
const sycl::nd_item<1> &item_ct1) {
|
||||
@@ -392,20 +368,6 @@ static void arange_kernel(T * dst, const int k, T start, T step,
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
|
||||
const int nb02, const int nb03, const int ne10, const int ne11,
|
||||
const int ne12, const int ne13, const float sf0, const float sf1,
|
||||
const float sf2, const float sf3, queue_ptr stream) {
|
||||
int dst_size = ne10 * ne11 * ne12 * ne13;
|
||||
int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE);
|
||||
sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
|
||||
upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
template<typename KernelInvoker, typename... Args>
|
||||
static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
@@ -505,42 +467,6 @@ static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & c
|
||||
}
|
||||
}
|
||||
|
||||
template<typename KernelInvoker, typename... Args>
|
||||
static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
|
||||
GGML_ASSERT(dst->src[0]->type == dst->type);
|
||||
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
|
||||
const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0];
|
||||
const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1];
|
||||
const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2];
|
||||
const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3];
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
auto data_pts = cast_data<sycl::half>(dst);
|
||||
kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2],
|
||||
(int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3,
|
||||
main_stream, std::forward<Args>(args)...);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
auto data_pts = cast_data<float>(dst);
|
||||
kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2],
|
||||
(int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3,
|
||||
main_stream, std::forward<Args>(args)...);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("GGML tensor type not supported!\n");
|
||||
}
|
||||
}
|
||||
|
||||
template<typename F>
|
||||
static inline void ggml_sycl_op_unary(
|
||||
ggml_backend_sycl_context & ctx, ggml_tensor * dst, F func) {
|
||||
@@ -784,15 +710,6 @@ static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03,
|
||||
int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3,
|
||||
queue_ptr stream) {
|
||||
ggml_sycl_detail::upscale_sycl(src, dst_ptr, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, stream);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
float min_val;
|
||||
float max_val;
|
||||
@@ -1131,12 +1048,6 @@ void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_op_sqr(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_upscale(ctx, dst);
|
||||
}
|
||||
|
||||
|
||||
void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_clamp(ctx, dst);
|
||||
|
||||
@@ -71,8 +71,6 @@ void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -70,6 +70,7 @@ static constexpr uint32_t ggml_sycl_fattn_tile_get_config_fp16(const int DKQ, co
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 64, 64)
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -130,134 +131,6 @@ static constexpr uint32_t ggml_sycl_fattn_tile_get_config_fp32(const int DKQ, co
|
||||
return 0;
|
||||
}
|
||||
|
||||
static constexpr uint32_t ggml_sycl_fattn_tile_get_config_amd(const int DKQ, const int DV, const int ncols) {
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 2, 64, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 4, 128, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 8, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 16, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 32, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 64, 256, 2, 32, 40)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 2, 64, 3, 32, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 4, 128, 3, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 8, 128, 2, 32, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 256, 2, 128, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 256, 2, 64, 64)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 16, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 32, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 64, 256, 2, 32, 40)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 2, 64, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 4, 128, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 8, 256, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 16, 256, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 32, 256, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 64, 256, 2, 32, 48)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 2, 64, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 4, 128, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 8, 256, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 16, 256, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 32, 256, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 64, 256, 2, 32, 56)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 2, 256, 2, 128, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 4, 128, 2, 64, 128)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 8, 256, 2, 64, 128)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 2, 64, 128)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 2, 64, 32)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 2, 256, 2, 128, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 4, 256, 2, 64, 128)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 64, 128)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 128)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 32, 512, 1, 128, 64)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static constexpr uint32_t ggml_sycl_fattn_tile_get_config_amd_rdna(const int DKQ, const int DV, const int ncols) {
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 2, 64, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 4, 128, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 8, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 16, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 32, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 40, 40, 64, 256, 2, 32, 40)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 2, 64, 8, 32, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 4, 64, 8, 32, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 8, 128, 5, 128, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 128, 5, 128, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 128, 4, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 128, 5, 64, 64)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 16, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 32, 256, 2, 32, 40)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 80, 80, 64, 256, 2, 32, 40)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 2, 64, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 4, 128, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 8, 256, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 16, 256, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 32, 256, 2, 32, 48)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE( 96, 96, 64, 256, 2, 32, 48)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 2, 64, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 4, 128, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 8, 256, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 16, 256, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 32, 256, 2, 32, 56)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(112, 112, 64, 256, 2, 32, 56)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 2, 64, 8, 32, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 4, 128, 8, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 8, 128, 8, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 3, 128, 128)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 3, 128, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 3, 64, 64)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 2, 64, 8, 32, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 6, 32, 256)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 8, 128, 6, 32, 256)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5, 32, 256)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3, 64, 128)
|
||||
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 4, 128, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 8, 256, 2, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 16, 256, 4, 64, 64)
|
||||
GGML_SYCL_FATTN_TILE_CONFIG_CASE(576, 512, 32, 256, 2, 128, 64)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static constexpr uint32_t ggml_sycl_fattn_tile_get_config(const int DKQ, const int DV, const int ncols, const int cc) {
|
||||
if(fast_fp16_available(cc))
|
||||
return ggml_sycl_fattn_tile_get_config_fp16(DKQ, DV, ncols);
|
||||
@@ -455,7 +328,7 @@ static __dpct_inline__ void flash_attn_tile_iter_KQ(T_vec_dot * const Q_tmp,
|
||||
|
||||
flash_attn_tile_load_tile<warp_size, nwarps, nbatch_fa, nbatch_K, cpy_ne, oob_check>
|
||||
(K_h2 + int64_t(k_VKQ_0)*stride_K2 + k_KQ_0/2, KV_tmp, stride_K2, k_VKQ_sup);
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
#ifdef SYCL_FAST_FP16
|
||||
static_assert((nbatch_K/2) % cpy_ne == 0, "bad nbatch_K");
|
||||
@@ -505,7 +378,7 @@ static __dpct_inline__ void flash_attn_tile_iter_KQ(T_vec_dot * const Q_tmp,
|
||||
}
|
||||
|
||||
if (k_KQ_0 + nbatch_K < DKQ) {
|
||||
item_ct1.barrier(); // Sync not needed on last iteration.
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space); // Sync not needed on last iteration.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -545,7 +418,7 @@ static __dpct_inline__ void flash_attn_tile_iter(T_vec_dot * const Q_tmp,
|
||||
const int k_VKQ_max,
|
||||
const int col_Q_0,
|
||||
float * KQ_max_new_shared) {
|
||||
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
||||
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
||||
constexpr int cpy_nb = ggml_sycl_get_max_cpy_bytes();
|
||||
constexpr int cpy_ne = cpy_nb / 4;
|
||||
|
||||
@@ -620,14 +493,14 @@ static __dpct_inline__ void flash_attn_tile_iter(T_vec_dot * const Q_tmp,
|
||||
}
|
||||
|
||||
if constexpr (np == 1) {
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
} else {
|
||||
static_assert(cpw == 1, "bad cpw");
|
||||
|
||||
if (item_ct1.get_local_id(2) == 0) {
|
||||
KQ_max_new_shared[item_ct1.get_local_id(1)] = KQ_max_new[0];
|
||||
}
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
KQ_max_new[0] = KQ_max_new_shared[(item_ct1.get_local_id(1) & ~(np - 1)) + item_ct1.get_local_id(2) % np];
|
||||
KQ_max_new[0] = warp_reduce_max<np>(KQ_max_new[0]);
|
||||
}
|
||||
@@ -697,7 +570,7 @@ static __dpct_inline__ void flash_attn_tile_iter(T_vec_dot * const Q_tmp,
|
||||
for (int k0 = 0; k0 < nbatch_fa; k0 += nbatch_V) {
|
||||
flash_attn_tile_load_tile<warp_size, nwarps, nbatch_V, DV, 0, oob_check>
|
||||
(V_h2 + int64_t(k_VKQ_0 + k0)*stride_V2, KV_tmp, stride_V2, k_VKQ_sup - k0);
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
#ifdef SYCL_FAST_FP16
|
||||
#pragma unroll
|
||||
@@ -765,7 +638,7 @@ static __dpct_inline__ void flash_attn_tile_iter(T_vec_dot * const Q_tmp,
|
||||
}
|
||||
}
|
||||
#endif // SYCL_FAST_FP16
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -825,6 +698,9 @@ static void flash_attn_tile(const char * Q,
|
||||
nb31, nb32, nb33);
|
||||
return;
|
||||
}
|
||||
// int id =get_thread_id();
|
||||
// if(id==0) sycl::ext::oneapi::experimental::printf("DKQ=%d, DV=%d, ncols1=%d, ncols2=%d\n",
|
||||
// DKQ, DV, ncols1, ncols2);
|
||||
|
||||
static_assert(ggml_sycl_fattn_tile_get_config(DKQ, DV, ncols1*ncols2) != 0, "kernel config not defined");
|
||||
|
||||
@@ -932,7 +808,9 @@ static void flash_attn_tile(const char * Q,
|
||||
|
||||
constexpr int cpy_ne_D = cpy_ne < DKQp/warp_size ? cpy_ne : DKQp/warp_size;
|
||||
|
||||
|
||||
#pragma unroll
|
||||
|
||||
for (int i0 = 0; i0 < DKQp; i0 += np*warp_size*cpy_ne_D) {
|
||||
if (i0 + np * warp_size * cpy_ne_D <= DKQ ||
|
||||
i0 + (item_ct1.get_local_id(1) % np) * (warp_size * cpy_ne_D) + item_ct1.get_local_id(2) * cpy_ne_D <
|
||||
@@ -972,7 +850,7 @@ static void flash_attn_tile(const char * Q,
|
||||
}
|
||||
}
|
||||
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
// Main loop over KV cache:
|
||||
const int k_VKQ_max = KV_max ? KV_max[sequence * item_ct1.get_group_range(2) + item_ct1.get_group(2)] : ne11;
|
||||
@@ -1051,7 +929,7 @@ static void flash_attn_tile(const char * Q,
|
||||
return;
|
||||
}
|
||||
|
||||
item_ct1.barrier();
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
#pragma unroll
|
||||
for (int ip = 1; ip < np; ++ip) {
|
||||
@@ -1189,30 +1067,58 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_sycl_context & ctx, ggm
|
||||
|
||||
const int id = ggml_sycl_get_device();
|
||||
const int cc = ggml_sycl_info().devices[id].cc;
|
||||
const auto arch = ggml_sycl_info().devices[id].hw_info.arch;
|
||||
const auto gpu_name = ggml_sycl_info().devices[id].hw_info.name;
|
||||
const int warp_size = WARP_32_SIZE; //can't support WARP_16_SIZE
|
||||
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
|
||||
if constexpr (DV <= 256) {
|
||||
if (Q->ne[1] > 16/ncols2) {
|
||||
constexpr int cols_per_block = 32;
|
||||
const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
|
||||
const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
|
||||
launch_fattn<DV, cols_per_block/ncols2, ncols2,
|
||||
flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size>
|
||||
(ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false);
|
||||
return;
|
||||
// printf("zjy DKQ=%d, DV=%d, ncols2=%d, use_logit_softca=%d Q->ne[1]=%d 4/ncols2=%d\n",
|
||||
// DKQ, DV, ncols2, use_logit_softcap, Q->ne[1], 4/ncols2);
|
||||
// printf("zjy name=%s\n", gpu_name.data());
|
||||
// std::cout << "GPU name "<<gpu_name<< " arch "<< ggml_sycl_info().devices[id].hw_info.arch_name
|
||||
// <<std::endl;
|
||||
|
||||
//same or older iGPU are weak too.
|
||||
if (arch <= gpu_arch::intel_gpu_adl_s) {
|
||||
if (Q->ne[1]>=75) {
|
||||
{
|
||||
constexpr int cols_per_block = ncols2*2;
|
||||
const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
|
||||
const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
|
||||
launch_fattn<DV, cols_per_block/ncols2, ncols2,
|
||||
flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size>
|
||||
(ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Q->ne[1] > 8/ncols2) {
|
||||
constexpr int cols_per_block = 16;
|
||||
const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
|
||||
const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
|
||||
launch_fattn<DV, cols_per_block/ncols2, ncols2,
|
||||
flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size>
|
||||
(ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false);
|
||||
return;
|
||||
if (arch > gpu_arch::intel_gpu_acm_g10 || Q->ne[1]<75) {
|
||||
if constexpr (ncols2 <= 32) {
|
||||
if (Q->ne[1] > 16/ncols2) {
|
||||
constexpr int cols_per_block = 32;
|
||||
const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
|
||||
const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
|
||||
launch_fattn<DV, cols_per_block/ncols2, ncols2,
|
||||
flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size>
|
||||
(ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if constexpr (ncols2 <= 16) {
|
||||
if (Q->ne[1] > 8/ncols2) {
|
||||
constexpr int cols_per_block = 16;
|
||||
const int nwarps = ggml_sycl_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
|
||||
const int nbatch_fa = ggml_sycl_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
|
||||
launch_fattn<DV, cols_per_block/ncols2, ncols2,
|
||||
flash_attn_tile<DKQ, DV, cols_per_block / ncols2, ncols2, use_logit_softcap, warp_size>, warp_size>
|
||||
(ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (ncols2 <= 8) {
|
||||
@@ -1237,6 +1143,8 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_sycl_context & ctx, ggm
|
||||
(ctx, dst, nwarps, nbytes_shared, nbatch_fa, true, true, false);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
if constexpr (ncols2 <= 2) {
|
||||
|
||||
@@ -55,7 +55,7 @@ void gated_delta_net_sycl(const float * q,
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
s_shard[r] = curr_state[i * S_v + col];
|
||||
s_shard[r] = curr_state[col * S_v + i];
|
||||
}
|
||||
|
||||
for (int t = 0; t < n_tokens; t++) {
|
||||
@@ -137,7 +137,7 @@ void gated_delta_net_sycl(const float * q,
|
||||
#pragma unroll
|
||||
for (int r = 0; r < rows_per_lane; r++) {
|
||||
const int i = r * warp_size + lane;
|
||||
state[i * S_v + col] = s_shard[r];
|
||||
state[col * S_v + i] = s_shard[r];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -44,7 +44,6 @@
|
||||
#include "ggml-sycl/backend.hpp"
|
||||
#include "ggml-sycl/common.hpp"
|
||||
#include "ggml-sycl/element_wise.hpp"
|
||||
#include "ggml-sycl/gated_delta_net.hpp"
|
||||
#include "ggml-sycl/gemm.hpp"
|
||||
#include "ggml-sycl/getrows.hpp"
|
||||
#include "ggml-sycl/norm.hpp"
|
||||
@@ -105,6 +104,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
|
||||
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
|
||||
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
|
||||
info.devices[i].hw_info = get_device_hw_info(&device);
|
||||
|
||||
}
|
||||
|
||||
@@ -4863,9 +4863,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
|
||||
return true;
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
|
||||
@@ -1,15 +1,67 @@
|
||||
#include "sycl_hw.hpp"
|
||||
|
||||
// TODO: currently not used
|
||||
/*
|
||||
sycl_hw_info get_device_hw_info(sycl::device *device_ptr) {
|
||||
sycl_hw_info res;
|
||||
int32_t id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
|
||||
res.device_id = id;
|
||||
using namespace std;
|
||||
|
||||
syclex::architecture arch = device_ptr->get_info<syclex::info::device::architecture>();
|
||||
res.arch = arch;
|
||||
|
||||
return res;
|
||||
}
|
||||
/*defined in
|
||||
* /opt/intel/oneapi/compiler/latest/include/sycl/ext/oneapi/experimental/device_architecture.def
|
||||
*/
|
||||
static map<gpu_arch, std::pair<const char*, sycl_intel_gpu_family>> arch2name = {
|
||||
{gpu_arch::intel_gpu_bdw, {"intel_gpu_bdw", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_skl, {"intel_gpu_skl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_kbl, {"intel_gpu_kbl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_cfl, {"intel_gpu_cfl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_apl, {"intel_gpu_apl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_glk, {"intel_gpu_glk", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_whl, {"intel_gpu_whl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_aml, {"intel_gpu_aml", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_cml, {"intel_gpu_cml", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_icllp, {"intel_gpu_icllp", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_ehl, {"intel_gpu_ehl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_tgllp, {"intel_gpu_tgllp", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_rkl, {"intel_gpu_rkl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_adl_s, {"intel_gpu_adl_s", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_adl_p, {"intel_gpu_adl_p", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_adl_n, {"intel_gpu_adl_n", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_dg1, {"intel_gpu_dg1", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_acm_g10, {"intel_gpu_acm_g10", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_acm_g11, {"intel_gpu_acm_g11", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_acm_g12, {"intel_gpu_acm_g12", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_pvc, {"intel_gpu_pvc", GPU_FAMILY_DGPU_CLOUD}},
|
||||
{gpu_arch::intel_gpu_pvc_vg, {"intel_gpu_pvc_vg", GPU_FAMILY_DGPU_CLOUD}},
|
||||
{gpu_arch::intel_gpu_mtl_u, {"intel_gpu_mtl_u", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_mtl_h, {"intel_gpu_mtl_h", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_arl_h, {"intel_gpu_arl_h", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_bmg_g21, {"intel_gpu_bmg_g21", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_bmg_g31, {"intel_gpu_bmg_g31", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_lnl_m, {"intel_gpu_lnl_m", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_ptl_h, {"intel_gpu_ptl_h", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_ptl_u, {"intel_gpu_ptl_u", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_wcl, {"intel_gpu_wcl", GPU_FAMILY_IGPU_XE}}
|
||||
};
|
||||
|
||||
|
||||
sycl_hw_info get_device_hw_info(sycl::device* device_ptr) {
|
||||
sycl_hw_info res;
|
||||
int32_t id =
|
||||
device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
|
||||
res.device_id = id;
|
||||
|
||||
res.name = device_ptr->get_info<sycl::info::device::name>();
|
||||
|
||||
syclex::architecture arch =
|
||||
device_ptr->get_info<syclex::info::device::architecture>();
|
||||
res.arch = arch;
|
||||
|
||||
map<syclex::architecture,
|
||||
std::pair<const char*, sycl_intel_gpu_family>>::iterator it =
|
||||
arch2name.find(res.arch);
|
||||
if (it != arch2name.end()) {
|
||||
res.arch_name = it->second.first;
|
||||
res.gpu_family = it->second.second;
|
||||
} else {
|
||||
res.arch_name = "unknown";
|
||||
res.gpu_family = GPU_FAMILY_UKNOWN;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -9,18 +9,30 @@
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
namespace syclex = sycl::ext::oneapi::experimental;
|
||||
using gpu_arch = sycl::ext::oneapi::experimental::architecture;
|
||||
|
||||
// TODO: currently not used
|
||||
/*
|
||||
struct sycl_hw_info {
|
||||
syclex::architecture arch;
|
||||
int32_t device_id;
|
||||
// It's used to mark the GPU computing capacity in Flash-attention OP
|
||||
// The value must flow the order of performance.
|
||||
enum sycl_intel_gpu_family {
|
||||
GPU_FAMILY_UKNOWN = -1,
|
||||
// Normal iGPU in Intel Core CPU, before Meteor Lake iGPU(Xe)
|
||||
GPU_FAMILY_IGPU_NON_XE = 0,
|
||||
// iGPU with Xe core, Meteor Lake iGPU or newer.
|
||||
GPU_FAMILY_IGPU_XE = 1,
|
||||
// dGPU for gaming in client/data center, DG1/FLex 140 or newer.
|
||||
GPU_FAMILY_DGPU_CLIENT_GAME = 2,
|
||||
// dGPU for AI in cloud, PVC or newer.
|
||||
GPU_FAMILY_DGPU_CLOUD = 3
|
||||
};
|
||||
|
||||
bool is_in_vector(std::vector<int> &vec, int item);
|
||||
struct sycl_hw_info {
|
||||
syclex::architecture arch;
|
||||
const char* arch_name;
|
||||
int32_t device_id;
|
||||
std::string name;
|
||||
sycl_intel_gpu_family gpu_family;
|
||||
};
|
||||
|
||||
sycl_hw_info get_device_hw_info(sycl::device *device_ptr);
|
||||
*/
|
||||
|
||||
|
||||
#endif // SYCL_HW_HPP
|
||||
|
||||
@@ -0,0 +1,410 @@
|
||||
#include "upscale.hpp"
|
||||
|
||||
static void upscale_f32(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int ne13,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3) {
|
||||
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
||||
int index = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
if (index >= ne10 * ne11 * ne12 * ne13) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i10 = index % ne10;
|
||||
int i11 = (index / ne10) % ne11;
|
||||
int i12 = (index / (ne10 * ne11)) % ne12;
|
||||
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
||||
|
||||
int i00 = i10 / sf0;
|
||||
int i01 = i11 / sf1;
|
||||
int i02 = i12 / sf2;
|
||||
int i03 = i13 / sf3;
|
||||
|
||||
dst[index] = *((const float*)((const char*)x + i03 * nb03 + i02 * nb02 +
|
||||
i01 * nb01 + i00 * nb00));
|
||||
}
|
||||
|
||||
static void upscale_f32_bilinear(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne00_src, const int ne01_src,
|
||||
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
const float pixel_offset) {
|
||||
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
||||
const int64_t index = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
|
||||
if (index >= dst_total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i10_dst = index % ne10_dst;
|
||||
const int i11_dst = (index / ne10_dst) % ne11_dst;
|
||||
const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
|
||||
const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
|
||||
|
||||
const int i02_src = (int)(i12_dst / sf2);
|
||||
const int i03_src = (int)(i13_dst / sf3);
|
||||
|
||||
const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
|
||||
int y0_src = (int) sycl::floor((float) y_src_f);
|
||||
int y1_src = y0_src + 1;
|
||||
|
||||
y0_src = sycl::max(0, sycl::min(y0_src, ne01_src - 1));
|
||||
y1_src = sycl::max(0, sycl::min(y1_src, ne01_src - 1));
|
||||
|
||||
float dy = y_src_f - (float)y0_src;
|
||||
dy = sycl::max(0.0f, sycl::min(dy, 1.0f));
|
||||
|
||||
float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset;
|
||||
int x0_src = (int) sycl::floor(x_src_f);
|
||||
int x1_src = x0_src + 1;
|
||||
|
||||
x0_src = sycl::max(0, sycl::min(x0_src, ne00_src - 1));
|
||||
x1_src = sycl::max(0, sycl::min(x1_src, ne00_src - 1));
|
||||
|
||||
float dx = x_src_f - (float)x0_src;
|
||||
dx = sycl::max(0.0f, sycl::min(dx, 1.0f));
|
||||
|
||||
const float* p_a =
|
||||
(const float*)((const char*)x + (int64_t)x0_src * nb00 +
|
||||
(int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 +
|
||||
(int64_t)i03_src * nb03);
|
||||
const float* p_b =
|
||||
(const float*)((const char*)x + (int64_t)x1_src * nb00 +
|
||||
(int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 +
|
||||
(int64_t)i03_src * nb03);
|
||||
const float* p_c =
|
||||
(const float*)((const char*)x + (int64_t)x0_src * nb00 +
|
||||
(int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 +
|
||||
(int64_t)i03_src * nb03);
|
||||
const float* p_d =
|
||||
(const float*)((const char*)x + (int64_t)x1_src * nb00 +
|
||||
(int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 +
|
||||
(int64_t)i03_src * nb03);
|
||||
|
||||
const float val_a = *p_a;
|
||||
const float val_b = *p_b;
|
||||
const float val_c = *p_c;
|
||||
const float val_d = *p_d;
|
||||
|
||||
float result = val_a * (1.0f - dx) * (1.0f - dy) +
|
||||
val_b * dx * (1.0f - dy) +
|
||||
val_c * (1.0f - dx) * dy +
|
||||
val_d * dx * dy;
|
||||
|
||||
dst[index] = result;
|
||||
}
|
||||
|
||||
// Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True)
|
||||
// https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
|
||||
static void upscale_f32_bilinear_antialias(const float * src0,
|
||||
float * dst,
|
||||
const int nb00,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int ne00_src,
|
||||
const int ne01_src,
|
||||
const int ne10_dst,
|
||||
const int ne11_dst,
|
||||
const int ne12_dst,
|
||||
const int ne13_dst,
|
||||
const float sf0,
|
||||
const float sf1,
|
||||
const float sf2,
|
||||
const float sf3,
|
||||
const float pixel_offset) {
|
||||
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
||||
const int64_t index = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
|
||||
if (index >= dst_total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i10_dst = index % ne10_dst;
|
||||
const int i11_dst = (index / ne10_dst) % ne11_dst;
|
||||
const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
|
||||
const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
|
||||
|
||||
const int i02_src = (int)(i12_dst / sf2);
|
||||
const int i03_src = (int)(i13_dst / sf3);
|
||||
|
||||
const float y = ((float)i11_dst + pixel_offset) / sf1;
|
||||
const float x = ((float)i10_dst + pixel_offset) / sf0;
|
||||
|
||||
// support and invscale, minimum 1 pixel for bilinear
|
||||
const float support1 = sycl::max(1.0f / sf1, 1.0f);
|
||||
const float invscale1 = 1.0f / support1;
|
||||
const float support0 = sycl::max(1.0f / sf0, 1.0f);
|
||||
const float invscale0 = 1.0f / support0;
|
||||
|
||||
// the range of source pixels that contribute
|
||||
const int64_t x_min = sycl::max(int64_t(0), int64_t(x - support0 + pixel_offset));
|
||||
const int64_t x_max = sycl::min(int64_t(ne00_src), int64_t(x + support0 + pixel_offset));
|
||||
const int64_t y_min = sycl::max(int64_t(0), int64_t(y - support1 + pixel_offset));
|
||||
const int64_t y_max = sycl::min(int64_t(ne01_src), int64_t(y + support1 + pixel_offset));
|
||||
|
||||
// bilinear filter with antialiasing
|
||||
float val = 0.0f;
|
||||
float total_weight = 0.0f;
|
||||
|
||||
auto triangle_filter = [](float x) -> float {
|
||||
return sycl::max(1.0f - sycl::fabs(x), 0.0f);
|
||||
};
|
||||
|
||||
for (int64_t sy = y_min; sy < y_max; sy++) {
|
||||
const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1);
|
||||
|
||||
for (int64_t sx = x_min; sx < x_max; sx++) {
|
||||
const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0);
|
||||
const float weight = weight_x * weight_y;
|
||||
|
||||
if (weight <= 0.0f) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const float pixel =
|
||||
*(const float*)((const char*)src0 + sx * nb00 + sy * nb01 +
|
||||
i02_src * nb02 + i03_src * nb03);
|
||||
val += pixel * weight;
|
||||
total_weight += weight;
|
||||
}
|
||||
}
|
||||
|
||||
if (total_weight > 0.0f) {
|
||||
val /= total_weight;
|
||||
}
|
||||
|
||||
dst[index] = val;
|
||||
}
|
||||
|
||||
namespace bicubic_interpolation {
|
||||
static float weight1(float x, const float &a) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
|
||||
static float weight2(float x, const float &a) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
|
||||
|
||||
static float bicubic(float p0, float p1, float p2, float p3, float x, float a) {
|
||||
const float w0 = weight2(x + 1, a);
|
||||
const float w1 = weight1(x + 0, a);
|
||||
const float w2 = weight1(1 - x, a);
|
||||
const float w3 = weight2(2 - x, a);
|
||||
return p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
static void upscale_f32_bicubic(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne00_src, const int ne01_src,
|
||||
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
const float pixel_offset) {
|
||||
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
||||
const float a = -0.75f;
|
||||
using bicubic_interpolation::bicubic;
|
||||
|
||||
const int64_t index = item_ct1.get_local_id(2) +
|
||||
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
||||
const int64_t dst_total_elements =
|
||||
ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
|
||||
if (index >= dst_total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i10_dst = index % ne10_dst;
|
||||
const int i11_dst = (index / ne10_dst) % ne11_dst;
|
||||
const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
|
||||
const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
|
||||
|
||||
const int i02_src = (int)(i12_dst / sf2);
|
||||
const int i03_src = (int)(i13_dst / sf3);
|
||||
|
||||
const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
|
||||
const int y0_src = (int) sycl::floor((float) y_src_f);
|
||||
const float dy = y_src_f - (float)y0_src;
|
||||
|
||||
const float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset;
|
||||
const int x0_src = (int) sycl::floor((float) x_src_f);
|
||||
const float dx = x_src_f - (float)x0_src;
|
||||
|
||||
const char * x_base = (const char *)x + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03;
|
||||
|
||||
auto load = [=](int x_off, int y_off) -> float {
|
||||
int i00_src = sycl::max(0, sycl::min(x0_src + x_off, ne00_src - 1));
|
||||
int i01_src = sycl::max(0, sycl::min(y0_src + y_off, ne01_src - 1));
|
||||
return *(const float *)(x_base + (int64_t)i00_src * nb00 + (int64_t)i01_src * nb01);
|
||||
};
|
||||
|
||||
const float result = bicubic(
|
||||
bicubic(load(-1, -1), load(0, -1), load(1, -1), load(2, -1), dx, a),
|
||||
bicubic(load(-1, 0), load(0, 0), load(1, 0), load(2, 0), dx, a),
|
||||
bicubic(load(-1, 1), load(0, 1), load(1, 1), load(2, 1), dx, a),
|
||||
bicubic(load(-1, 2), load(0, 2), load(1, 2), load(2, 2), dx, a),
|
||||
dy,
|
||||
a);
|
||||
|
||||
dst[index] = result;
|
||||
}
|
||||
|
||||
static void upscale_f32_sycl(const float * x,
|
||||
float * dst,
|
||||
const int nb00,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const float sf0,
|
||||
const float sf1,
|
||||
const float sf2,
|
||||
const float sf3,
|
||||
dpct::queue_ptr stream) {
|
||||
const int64_t dst_size = ne10 * ne11 * ne12 * ne13;
|
||||
const int64_t num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
|
||||
});
|
||||
}
|
||||
|
||||
static void upscale_f32_bilinear_sycl(const float * x,
|
||||
float * dst,
|
||||
const int nb00,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int ne00_src,
|
||||
const int ne01_src,
|
||||
const int ne10_dst,
|
||||
const int ne11_dst,
|
||||
const int ne12_dst,
|
||||
const int ne13_dst,
|
||||
const float sf0,
|
||||
const float sf1,
|
||||
const float sf2,
|
||||
const float sf3,
|
||||
const float pixel_offset,
|
||||
bool antialias,
|
||||
dpct::queue_ptr stream) {
|
||||
const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
const int64_t num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
|
||||
|
||||
if (antialias) {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
upscale_f32_bilinear_antialias(
|
||||
x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst,
|
||||
ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
|
||||
});
|
||||
} else {
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
upscale_f32_bilinear(
|
||||
x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst,
|
||||
ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void upscale_f32_bicubic_sycl(const float * x,
|
||||
float * dst,
|
||||
const int nb00,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int ne00_src,
|
||||
const int ne01_src,
|
||||
const int ne10_dst,
|
||||
const int ne11_dst,
|
||||
const int ne12_dst,
|
||||
const int ne13_dst,
|
||||
const float sf0,
|
||||
const float sf1,
|
||||
const float sf2,
|
||||
const float sf3,
|
||||
const float pixel_offset,
|
||||
dpct::queue_ptr stream) {
|
||||
const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
const int64_t num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
|
||||
|
||||
{
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(
|
||||
sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
upscale_f32_bicubic(
|
||||
x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst,
|
||||
ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
dpct::queue_ptr stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int mode_flags = dst->op_params[0];
|
||||
const ggml_scale_mode mode = (ggml_scale_mode)(mode_flags & 0xFF);
|
||||
|
||||
float sf0 = (float)dst->ne[0]/src0->ne[0];
|
||||
float sf1 = (float)dst->ne[1]/src0->ne[1];
|
||||
float sf2 = (float)dst->ne[2]/src0->ne[2];
|
||||
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
||||
|
||||
float pixel_offset = 0.5f;
|
||||
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
||||
sf0 = dst->ne[0] > 1 && src0->ne[0] > 1
|
||||
? (float)(dst->ne[0] - 1) / (src0->ne[0] - 1)
|
||||
: sf0;
|
||||
sf1 = dst->ne[1] > 1 && src0->ne[1] > 1
|
||||
? (float)(dst->ne[1] - 1) / (src0->ne[1] - 1)
|
||||
: sf1;
|
||||
pixel_offset = 0.0f;
|
||||
}
|
||||
|
||||
if (mode == GGML_SCALE_MODE_NEAREST) {
|
||||
upscale_f32_sycl(
|
||||
src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
|
||||
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
||||
const bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS);
|
||||
upscale_f32_bilinear_sycl(
|
||||
src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
sf0, sf1, sf2, sf3, pixel_offset, antialias, stream);
|
||||
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
|
||||
upscale_f32_bicubic_sycl(
|
||||
src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
sf0, sf1, sf2, sf3, pixel_offset, stream);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_upscale(ctx, dst);
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include "dpct/helper.hpp"
|
||||
#include "common.hpp"
|
||||
|
||||
#define SYCL_UPSCALE_BLOCK_SIZE 256
|
||||
|
||||
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
@@ -4981,8 +4981,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();
|
||||
|
||||
// Try to find a non-graphics compute queue and transfer-focused queues
|
||||
const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
|
||||
const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
|
||||
// On AMD, the graphics queue seems to be faster, so don't avoid it
|
||||
const vk::QueueFlagBits graphics_flag = device->vendor_id == VK_VENDOR_ID_AMD ? (vk::QueueFlagBits)0 : vk::QueueFlagBits::eGraphics;
|
||||
const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, graphics_flag, -1, 1);
|
||||
const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | graphics_flag, compute_queue_family_index, 1);
|
||||
|
||||
const float priorities[] = { 1.0f, 1.0f };
|
||||
device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1;
|
||||
@@ -5441,13 +5443,11 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
|
||||
ggml_vk_load_shaders(device);
|
||||
|
||||
const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN;
|
||||
|
||||
if (!device->single_queue) {
|
||||
const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
|
||||
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
|
||||
|
||||
device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr);
|
||||
device->async_use_transfer_queue = (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr);
|
||||
} else {
|
||||
// TODO: Use pointer or reference to avoid copy
|
||||
device->transfer_queue.copyFrom(device->compute_queue);
|
||||
|
||||
@@ -245,7 +245,7 @@ void main() {
|
||||
#endif
|
||||
}
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
Sf[r][c] += ACC_TYPE(dot(Q_cache[r], K_Tf));
|
||||
Sf[r][c] += dot(ACC_TYPEV4(Q_cache[r]), ACC_TYPEV4(K_Tf));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -270,7 +270,7 @@ void main() {
|
||||
#endif
|
||||
}
|
||||
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
|
||||
Sf[r][c] += ACC_TYPE(dot(Qf[tile_row(r) * qf_stride + d * D_split + d_tid], K_Tf));
|
||||
Sf[r][c] += dot(ACC_TYPEV4(Qf[tile_row(r) * qf_stride + d * D_split + d_tid]), ACC_TYPEV4(K_Tf));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -478,6 +478,7 @@ class MODEL_ARCH(IntEnum):
|
||||
RND1 = auto()
|
||||
PANGU_EMBED = auto()
|
||||
MISTRAL3 = auto()
|
||||
MISTRAL4 = auto()
|
||||
PADDLEOCR = auto()
|
||||
MIMO2 = auto()
|
||||
STEP35 = auto()
|
||||
@@ -924,6 +925,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.RND1: "rnd1",
|
||||
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
|
||||
MODEL_ARCH.MISTRAL3: "mistral3",
|
||||
MODEL_ARCH.MISTRAL4: "mistral4",
|
||||
MODEL_ARCH.PADDLEOCR: "paddleocr",
|
||||
MODEL_ARCH.MIMO2: "mimo2",
|
||||
MODEL_ARCH.STEP35: "step35",
|
||||
@@ -3538,6 +3540,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.MISTRAL4: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_A,
|
||||
MODEL_TENSOR.ATTN_Q_B,
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA,
|
||||
MODEL_TENSOR.ATTN_KV_B,
|
||||
MODEL_TENSOR.ATTN_K_B,
|
||||
MODEL_TENSOR.ATTN_V_B,
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM,
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||
],
|
||||
MODEL_ARCH.MIMO2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@@ -1 +1 @@
|
||||
d6754f3d0e6d0acd21c12442353c9fd2f94188e7
|
||||
553552e1d88be2b214b85e5159eedd39a63e2c34
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
HTTPLIB_VERSION = "refs/tags/v0.37.2"
|
||||
HTTPLIB_VERSION = "refs/tags/v0.38.0"
|
||||
|
||||
vendor = {
|
||||
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
|
||||
|
||||
@@ -123,6 +123,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_RND1, "rnd1" },
|
||||
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
|
||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_MISTRAL4, "mistral4" },
|
||||
{ LLM_ARCH_PADDLEOCR, "paddleocr" },
|
||||
{ LLM_ARCH_MIMO2, "mimo2" },
|
||||
{ LLM_ARCH_STEP35, "step35" },
|
||||
@@ -1589,6 +1590,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_FFN_UP_SHEXP,
|
||||
};
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
|
||||
@@ -127,6 +127,7 @@ enum llm_arch {
|
||||
LLM_ARCH_RND1,
|
||||
LLM_ARCH_PANGU_EMBED,
|
||||
LLM_ARCH_MISTRAL3,
|
||||
LLM_ARCH_MISTRAL4,
|
||||
LLM_ARCH_PADDLEOCR,
|
||||
LLM_ARCH_MIMO2,
|
||||
LLM_ARCH_STEP35,
|
||||
|
||||
@@ -1953,6 +1953,12 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
||||
|
||||
cells.pos_set(i, pos);
|
||||
|
||||
if (hparams.n_pos_per_embd() > 1) {
|
||||
llama_kv_cell_ext ext;
|
||||
io.read_to(&ext, sizeof(ext));
|
||||
cells.ext_set(i, ext);
|
||||
}
|
||||
|
||||
for (uint32_t j = 0; j < n_seq_id; ++j) {
|
||||
llama_seq_id seq_id;
|
||||
io.read_to(&seq_id, sizeof(seq_id));
|
||||
|
||||
+34
-1
@@ -1587,6 +1587,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
{
|
||||
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
|
||||
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
|
||||
@@ -4883,6 +4884,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
{
|
||||
const bool is_mla = hparams.is_mla();
|
||||
|
||||
@@ -7462,6 +7464,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
if (!layer.wo_s && layer.wo) {
|
||||
layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wqkv_s && layer.wqkv) {
|
||||
layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wqkv_gate_s && layer.wqkv_gate) {
|
||||
layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// dense FFN weight scales (per-tensor, shape {1})
|
||||
if (!layer.ffn_gate_s && layer.ffn_gate) {
|
||||
@@ -7473,6 +7481,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
if (!layer.ffn_up_s && layer.ffn_up) {
|
||||
layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) {
|
||||
layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) {
|
||||
layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) {
|
||||
layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// MoE expert weight scales (per-expert, shape {n_expert})
|
||||
if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
|
||||
@@ -7484,6 +7501,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
|
||||
layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// recurrent / linear-attention weight scales (per-tensor, shape {1})
|
||||
if (!layer.ssm_in_s && layer.ssm_in) {
|
||||
layer.ssm_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_out_s && layer.ssm_out) {
|
||||
layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_alpha_s && layer.ssm_alpha) {
|
||||
layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_beta_s && layer.ssm_beta) {
|
||||
layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7821,7 +7852,7 @@ void llama_model::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
}
|
||||
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA) {
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
|
||||
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
||||
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
||||
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
||||
@@ -8399,6 +8430,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
} break;
|
||||
case LLM_ARCH_DEEPSEEK2:
|
||||
case LLM_ARCH_GLM_DSA:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
{
|
||||
llm = std::make_unique<llm_build_deepseek2>(*this, params);
|
||||
} break;
|
||||
@@ -8810,6 +8842,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_ERNIE4_5:
|
||||
case LLM_ARCH_ERNIE4_5_MOE:
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
case LLM_ARCH_LLAMA_EMBED:
|
||||
case LLM_ARCH_MAINCODER:
|
||||
case LLM_ARCH_GLM_DSA:
|
||||
|
||||
@@ -401,9 +401,18 @@ struct llama_layer {
|
||||
struct ggml_tensor * wk_s = nullptr;
|
||||
struct ggml_tensor * wv_s = nullptr;
|
||||
struct ggml_tensor * wo_s = nullptr;
|
||||
struct ggml_tensor * wqkv_s = nullptr;
|
||||
struct ggml_tensor * wqkv_gate_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_shexp_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_shexp_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_shexp_s = nullptr;
|
||||
struct ggml_tensor * ssm_in_s = nullptr;
|
||||
struct ggml_tensor * ssm_out_s = nullptr;
|
||||
struct ggml_tensor * ssm_alpha_s = nullptr;
|
||||
struct ggml_tensor * ssm_beta_s = nullptr;
|
||||
|
||||
// altup & laurel
|
||||
struct ggml_tensor * per_layer_inp_gate = nullptr;
|
||||
|
||||
@@ -42,7 +42,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
|
||||
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
|
||||
|
||||
// {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
|
||||
ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
|
||||
ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur, layer.ssm_in_s);
|
||||
// split the above in two
|
||||
// => {d_inner, n_seq_tokens, n_seqs}
|
||||
ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
|
||||
@@ -137,7 +137,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
|
||||
y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
|
||||
|
||||
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
|
||||
cur = build_lora_mm(layer.ssm_out, y);
|
||||
cur = build_lora_mm(layer.ssm_out, y, layer.ssm_out_s);
|
||||
}
|
||||
|
||||
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
|
||||
@@ -184,7 +184,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
|
||||
// d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
|
||||
|
||||
// {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
|
||||
ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
|
||||
ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur, model.layers[il].ssm_in_s);
|
||||
|
||||
// split the above in three
|
||||
ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
|
||||
@@ -278,7 +278,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
|
||||
y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
|
||||
|
||||
// {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
|
||||
cur = build_lora_mm(model.layers[il].ssm_out, y);
|
||||
cur = build_lora_mm(model.layers[il].ssm_out, y, model.layers[il].ssm_out_s);
|
||||
}
|
||||
|
||||
// {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
|
||||
|
||||
@@ -107,9 +107,9 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
|
||||
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il) {
|
||||
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
|
||||
NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
|
||||
NULL,
|
||||
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
@@ -136,7 +136,10 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
||||
il,
|
||||
router_logits);
|
||||
router_logits, nullptr,
|
||||
model.layers[il].ffn_up_exps_s,
|
||||
nullptr, // no gate
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
if (model.layers[il].ffn_latent_up) {
|
||||
@@ -144,9 +147,9 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
|
||||
}
|
||||
|
||||
ggml_tensor * ffn_shexp = build_ffn(inp_emb,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
NULL /* no gate */ , NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
|
||||
NULL /* no gate */ , NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
|
||||
NULL,
|
||||
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||
cb(ffn_shexp, "ffn_shexp", il);
|
||||
|
||||
+13
-13
@@ -90,11 +90,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35::build_qkvz(
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
|
||||
ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s);
|
||||
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
|
||||
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
|
||||
|
||||
ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
|
||||
ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s);
|
||||
cb(z, "z", il);
|
||||
|
||||
return { qkv_mixed, z };
|
||||
@@ -123,7 +123,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
|
||||
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
|
||||
|
||||
// Qwen3Next uses a single Q projection that outputs query + gate
|
||||
ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ]
|
||||
ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ]
|
||||
cb(Qcur_full, "Qcur_full", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
|
||||
@@ -135,10 +135,10 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
// Apply K normalization
|
||||
@@ -186,7 +186,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
|
||||
cur = ggml_mul(ctx0, cur, gate_sigmoid);
|
||||
cb(cur, "attn_gated", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].wo, cur);
|
||||
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
||||
cb(cur, "attn_output", il);
|
||||
|
||||
return cur;
|
||||
@@ -217,14 +217,14 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
ggml_tensor * qkv_mixed = qkvz.first;
|
||||
ggml_tensor * z = qkvz.second;
|
||||
|
||||
ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
|
||||
ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s);
|
||||
beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
|
||||
cb(beta, "beta", il);
|
||||
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
|
||||
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s);
|
||||
alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
|
||||
cb(alpha, "alpha", il);
|
||||
|
||||
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
|
||||
@@ -356,7 +356,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear(
|
||||
cb(final_output, "final_output", il);
|
||||
|
||||
// Output projection
|
||||
cur = build_lora_mm(model.layers[il].ssm_out, final_output);
|
||||
cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s);
|
||||
cb(cur, "linear_attn_out", il);
|
||||
|
||||
// Reshape back to original dimensions
|
||||
@@ -370,9 +370,9 @@ ggml_tensor * llm_build_qwen35::build_layer_ffn(ggml_tensor * cur, const int il)
|
||||
GGML_ASSERT(model.layers[il].ffn_gate_inp == nullptr);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
|
||||
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
|
||||
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
+17
-14
@@ -90,11 +90,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen35moe::build_qkvz(
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input);
|
||||
ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input, model.layers[il].wqkv_s);
|
||||
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs);
|
||||
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
|
||||
|
||||
ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input);
|
||||
ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input, model.layers[il].wqkv_gate_s);
|
||||
cb(z, "z", il);
|
||||
|
||||
return { qkv_mixed, z };
|
||||
@@ -123,7 +123,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
|
||||
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
|
||||
|
||||
// Qwen3Next uses a single Q projection that outputs query + gate
|
||||
ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ]
|
||||
ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); // [ (n_embd_head * 2) * n_head, n_tokens ]
|
||||
cb(Qcur_full, "Qcur_full", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens,
|
||||
@@ -135,10 +135,10 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
// Apply K normalization
|
||||
@@ -186,7 +186,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
|
||||
cur = ggml_mul(ctx0, cur, gate_sigmoid);
|
||||
cb(cur, "attn_gated", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].wo, cur);
|
||||
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
||||
cb(cur, "attn_output", il);
|
||||
|
||||
return cur;
|
||||
@@ -217,14 +217,14 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
ggml_tensor * qkv_mixed = qkvz.first;
|
||||
ggml_tensor * z = qkvz.second;
|
||||
|
||||
ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur);
|
||||
ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur, model.layers[il].ssm_beta_s);
|
||||
beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs);
|
||||
cb(beta, "beta", il);
|
||||
|
||||
beta = ggml_sigmoid(ctx0, beta);
|
||||
|
||||
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur);
|
||||
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * alpha = build_lora_mm(model.layers[il].ssm_alpha, cur, model.layers[il].ssm_alpha_s);
|
||||
alpha = ggml_reshape_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs);
|
||||
cb(alpha, "alpha", il);
|
||||
|
||||
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
|
||||
@@ -356,7 +356,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear(
|
||||
cb(final_output, "final_output", il);
|
||||
|
||||
// Output projection
|
||||
cur = build_lora_mm(model.layers[il].ssm_out, final_output);
|
||||
cur = build_lora_mm(model.layers[il].ssm_out, final_output, model.layers[il].ssm_out_s);
|
||||
cb(cur, "linear_attn_out", il);
|
||||
|
||||
// Reshape back to original dimensions
|
||||
@@ -380,16 +380,19 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int
|
||||
LLM_FFN_SILU, true,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
|
||||
nullptr, model.layers[il].ffn_gate_up_exps);
|
||||
nullptr, model.layers[il].ffn_gate_up_exps,
|
||||
model.layers[il].ffn_up_exps_s,
|
||||
model.layers[il].ffn_gate_exps_s,
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
// Add shared experts if present - following Qwen3Next reference implementation
|
||||
if (model.layers[il].ffn_up_shexp != nullptr) {
|
||||
ggml_tensor * ffn_shexp =
|
||||
build_ffn(cur,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp_s,
|
||||
model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp_s,
|
||||
model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp_s,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(ffn_shexp, "ffn_shexp", il);
|
||||
|
||||
@@ -8576,11 +8576,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
|
||||
for (int hsk : { 40, 64, 72, 80, 96, 128, 192, 256, 576 }) {
|
||||
for (int hsk : { 40, 64, 72, 80, 96, 128, 192, 256, 320, 576 }) {
|
||||
for (int hsv : { 40, 64, 72, 80, 96, 128, 192, 256, 512 }) {
|
||||
if (hsk != 192 && hsk != 576 && hsk != hsv) continue;
|
||||
if (hsk != 192 && hsk != 320 && hsk != 576 && hsk != hsv) continue;
|
||||
if (hsk == 192 && (hsv != 128 && hsv != 192)) continue;
|
||||
if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA
|
||||
if (hsk == 320 && hsv != 256) continue; // MLA
|
||||
|
||||
for (bool mask : { true, false } ) {
|
||||
for (bool sinks : { true, false } ) {
|
||||
@@ -8589,12 +8590,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
for (float logit_softcap : {0.0f, 10.0f}) {
|
||||
if (hsk != 128 && logit_softcap != 0.0f) continue;
|
||||
for (int nh : { 1, 4 }) {
|
||||
if (nh == 1 && hsk != 576) continue; // GLM 4.7 Flash
|
||||
if (nh == 1 && hsk != 320 && hsk != 576) continue; // GLM 4.7 Flash
|
||||
for (int nr3 : { 1, 3, }) {
|
||||
if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes
|
||||
for (int nr2 : { 1, 4, 12, 20 }) {
|
||||
for (int nr2 : { 1, 4, 12, 20, 32 }) {
|
||||
if (nr2 == 12 && hsk != 128) continue;
|
||||
if (nr2 == 20 && (nh != 1 || hsk != 576)) continue;
|
||||
if (nr2 == 32 && (nh != 1 || hsk != 320)) continue;
|
||||
//for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) {
|
||||
for (int kv : { 113, 512, 1024, }) {
|
||||
if (nr2 != 1 && kv != 512) continue;
|
||||
|
||||
@@ -1915,7 +1915,7 @@ env.globals["raise_exception"] = raise_exception
|
||||
|
||||
template = env.from_string(tmpl)
|
||||
result = template.render(**vars_json)
|
||||
print(result, end='')
|
||||
sys.stdout.buffer.write(result.encode())
|
||||
)";
|
||||
|
||||
static void test_template_py(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) {
|
||||
|
||||
@@ -90,7 +90,10 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
|
||||
n_embd = 64;
|
||||
n_head = 1;
|
||||
n_ff = 96;
|
||||
} else if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
|
||||
} else if (arch == LLM_ARCH_DEEPSEEK2
|
||||
|| arch == LLM_ARCH_GLM_DSA
|
||||
|| arch == LLM_ARCH_KIMI_LINEAR
|
||||
|| arch == LLM_ARCH_MISTRAL4) {
|
||||
n_embd = 128;
|
||||
n_head = 1;
|
||||
n_ff = 192;
|
||||
@@ -145,7 +148,10 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
|
||||
}
|
||||
|
||||
ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f);
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
|
||||
if (arch == LLM_ARCH_DEEPSEEK2
|
||||
|| arch == LLM_ARCH_GLM_DSA
|
||||
|| arch == LLM_ARCH_KIMI_LINEAR
|
||||
|| arch == LLM_ARCH_MISTRAL4) {
|
||||
ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH, uint32_t(576));
|
||||
ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, uint32_t(512));
|
||||
ms.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, uint32_t(64));
|
||||
@@ -319,6 +325,7 @@ static bool moe_mandatory(const llm_arch arch) {
|
||||
case LLM_ARCH_MIMO2:
|
||||
case LLM_ARCH_KIMI_LINEAR:
|
||||
case LLM_ARCH_STEP35:
|
||||
case LLM_ARCH_MISTRAL4:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
+1
-1
@@ -215,7 +215,7 @@ struct cli_context {
|
||||
inputs.parallel_tool_calls = false;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||
inputs.enable_thinking = common_chat_templates_support_enable_thinking(chat_params.tmpls.get());
|
||||
inputs.enable_thinking = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false;
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
|
||||
|
||||
Binary file not shown.
@@ -1273,17 +1273,27 @@ json convert_responses_to_chatcmpl(const json & response_body) {
|
||||
|
||||
for (const auto & output_text : item.at("content")) {
|
||||
const std::string type = json_value(output_text, "type", std::string());
|
||||
if (type != "output_text") {
|
||||
throw std::invalid_argument("'type' must be 'output_text'");
|
||||
if (type == "output_text") {
|
||||
if (!exists_and_is_string(output_text, "text")) {
|
||||
throw std::invalid_argument("'Output text' requires 'text'");
|
||||
// Ignore annotations and logprobs for now
|
||||
chatcmpl_content.push_back({
|
||||
{"text", output_text.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
}
|
||||
} else if (type == "refusal") {
|
||||
if (!exists_and_is_string(output_text, "refusal")) {
|
||||
throw std::invalid_argument("'Refusal' requires 'refusal'");
|
||||
// Ignore annotations and logprobs for now
|
||||
chatcmpl_content.push_back({
|
||||
{"refusal", output_text.at("refusal")},
|
||||
{"type", "refusal"},
|
||||
});
|
||||
}
|
||||
} else {
|
||||
throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
|
||||
}
|
||||
if (!exists_and_is_string(output_text, "text")) {
|
||||
throw std::invalid_argument("'Output text' requires 'text'");
|
||||
}
|
||||
// Ignore annotations and logprobs for now
|
||||
chatcmpl_content.push_back({
|
||||
{"text", output_text.at("text")},
|
||||
{"type", "text"},
|
||||
});
|
||||
}
|
||||
|
||||
if (merge_prev) {
|
||||
|
||||
@@ -563,7 +563,7 @@ def test_cancel_request():
|
||||
except requests.exceptions.ReadTimeout:
|
||||
pass # expected
|
||||
# make sure the slot is free
|
||||
time.sleep(1) # wait for HTTP_POLLING_SECONDS
|
||||
time.sleep(2)
|
||||
res = server.make_request("GET", "/slots")
|
||||
assert res.body[0]["is_processing"] == False
|
||||
|
||||
|
||||
Generated
+3
-30
@@ -939,7 +939,6 @@
|
||||
"integrity": "sha512-oJrXtQiAXLvT9clCf1K4kxp3eKsQhIaZqxEyowkBcsvZDdZkbWrVmnGknxs5flTD0VGsxrxKgBCZty1EzoiMzA==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@swc/helpers": "^0.5.0"
|
||||
}
|
||||
@@ -2161,7 +2160,6 @@
|
||||
"integrity": "sha512-W9R51zUCd2iHOQBg/D93+bdpYv6kbtFx+kft5X8lPKQl6yEu0aKs9i5N5GyCASOhIApgx/tkqZIJ7vgM4cqrHA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"ts-dedent": "^2.0.0",
|
||||
"type-fest": "~2.19"
|
||||
@@ -2245,7 +2243,6 @@
|
||||
"integrity": "sha512-875hTUkEbz+MyJIxWbQjfMaekqdmEKUUfR7JyKcpfMRZqcGyrO9Gd+iS1D/Dx8LpE5FEtutWGOtlAh4ReSAiOA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@standard-schema/spec": "^1.0.0",
|
||||
"@sveltejs/acorn-typescript": "^1.0.5",
|
||||
@@ -2289,7 +2286,6 @@
|
||||
"integrity": "sha512-YZs/OSKOQAQCnJvM/P+F1URotNnYNeU3P2s4oIpzm1uFaqUEqRxUB0g5ejMjEb5Gjb9/PiBI5Ktrq4rUUF8UVQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@sveltejs/vite-plugin-svelte-inspector": "^5.0.0",
|
||||
"debug": "^4.4.1",
|
||||
@@ -2705,7 +2701,6 @@
|
||||
"integrity": "sha512-pemlzrSESWbdAloYml3bAJMEfNh1Z7EduzqPKprCH5S341frlpYnUEW0H72dLxa6IsYr+mPno20GiSm+h9dEdQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@babel/code-frame": "^7.10.4",
|
||||
"@babel/runtime": "^7.12.5",
|
||||
@@ -2873,7 +2868,6 @@
|
||||
"integrity": "sha512-+0/4J266CBGPUq/ELg7QUHhN25WYjE0wYTPSQJn1xeu8DOlIOPxXxrNGiLmfAWl7HMMgWFWXpt9IDjMWrF5Iow==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"undici-types": "~7.16.0"
|
||||
}
|
||||
@@ -2940,7 +2934,6 @@
|
||||
"integrity": "sha512-IgSWvLobTDOjnaxAfDTIHaECbkNlAlKv2j5SjpB2v7QHKv1FIfjwMy8FsDbVfDX/KjmCmYICcw7uGaXLhtsLNg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@typescript-eslint/scope-manager": "8.56.0",
|
||||
"@typescript-eslint/types": "8.56.0",
|
||||
@@ -3177,7 +3170,6 @@
|
||||
"integrity": "sha512-tJxiPrWmzH8a+w9nLKlQMzAKX/7VjFs50MWgcAj7p9XQ7AQ9/35fByFYptgPELyLw+0aixTnC4pUWV+APcZ/kw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@testing-library/dom": "^10.4.0",
|
||||
"@testing-library/user-event": "^14.6.1",
|
||||
@@ -3305,7 +3297,6 @@
|
||||
"integrity": "sha512-oukfKT9Mk41LreEW09vt45f8wx7DordoWUZMYdY/cyAk7w5TWkTRCNZYF7sX7n2wB7jyGAl74OxgwhPgKaqDMQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@vitest/utils": "3.2.4",
|
||||
"pathe": "^2.0.3",
|
||||
@@ -3376,7 +3367,6 @@
|
||||
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
|
||||
"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"acorn": "bin/acorn"
|
||||
},
|
||||
@@ -4094,7 +4084,8 @@
|
||||
"resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz",
|
||||
"integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
"license": "MIT",
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/debug": {
|
||||
"version": "4.4.3",
|
||||
@@ -4404,7 +4395,6 @@
|
||||
"dev": true,
|
||||
"hasInstallScript": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"esbuild": "bin/esbuild"
|
||||
},
|
||||
@@ -4465,7 +4455,6 @@
|
||||
"integrity": "sha512-LEyamqS7W5HB3ujJyvi0HQK/dtVINZvd5mAAp9eT5S/ujByGjiZLCzPcHVzuXbpJDJF/cxwHlfceVUDZ2lnSTw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@eslint-community/eslint-utils": "^4.8.0",
|
||||
"@eslint-community/regexpp": "^4.12.1",
|
||||
@@ -5672,7 +5661,6 @@
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.11.7.tgz",
|
||||
"integrity": "sha512-l7qMiNee7t82bH3SeyUCt9UF15EVmaBvsppY2zQtrbIhl/yzBTny+YUxsVjSjQ6gaqaeVtZmGocom8TzBlA4Yw==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=16.9.0"
|
||||
}
|
||||
@@ -8097,7 +8085,6 @@
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"nanoid": "^3.3.11",
|
||||
"picocolors": "^1.1.1",
|
||||
@@ -8231,7 +8218,6 @@
|
||||
"integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"prettier": "bin/prettier.cjs"
|
||||
},
|
||||
@@ -8248,7 +8234,6 @@
|
||||
"integrity": "sha512-pn1ra/0mPObzqoIQn/vUTR3ZZI6UuZ0sHqMK5x2jMLGrs53h0sXhkVuDcrlssHwIMk7FYrMjHBPoUSyyEEDlBQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"peerDependencies": {
|
||||
"prettier": "^3.0.0",
|
||||
"svelte": "^3.2.0 || ^4.0.0-next.0 || ^5.0.0-next.0"
|
||||
@@ -8480,7 +8465,6 @@
|
||||
"integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
@@ -8491,7 +8475,6 @@
|
||||
"integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"scheduler": "^0.26.0"
|
||||
},
|
||||
@@ -8766,7 +8749,6 @@
|
||||
"integrity": "sha512-4iya7Jb76fVpQyLoiVpzUrsjQ12r3dM7fIVz+4NwoYvZOShknRmiv+iu9CClZml5ZLGb0XMcYLutK6w9tgxHDw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@types/estree": "1.0.8"
|
||||
},
|
||||
@@ -8877,7 +8859,6 @@
|
||||
"integrity": "sha512-elOcIZRTM76dvxNAjqYrucTSI0teAF/L2Lv0s6f6b7FOwcwIuA357bIE871580AjHJuSvLIRUosgV+lIWx6Rgg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"chokidar": "^4.0.0",
|
||||
"immutable": "^5.0.2",
|
||||
@@ -9172,7 +9153,6 @@
|
||||
"integrity": "sha512-LwF0VZsT4qkgx66Ad/q0QgZZrU2a5WftaADDEcJ3bGq3O2fHvwWPlSZjM1HiXD4vqP9U5JiMqQkV1gkyH0XJkw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@storybook/global": "^5.0.0",
|
||||
"@storybook/icons": "^2.0.1",
|
||||
@@ -9387,7 +9367,6 @@
|
||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.48.3.tgz",
|
||||
"integrity": "sha512-w7QZ398cdNherTdiQ/v3SYLLGOO4948Jgjh04PYqtTYVohmBvbmFwLmo7pp8gp4/1tceRWfSTjHgjtfpCVNJmQ==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@jridgewell/remapping": "^2.3.4",
|
||||
"@jridgewell/sourcemap-codec": "^1.5.0",
|
||||
@@ -9633,7 +9612,6 @@
|
||||
"integrity": "sha512-gBXpgUm/3rp1lMZZrM/w7D8GKqshif0zAymAhbCyIt8KMe+0v9DQ7cdYLR4FHH/cKpdTXb+A/tKKU3eolfsI+g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/dcastil"
|
||||
@@ -9664,8 +9642,7 @@
|
||||
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.11.tgz",
|
||||
"integrity": "sha512-2E9TBm6MDD/xKYe+dvJZAmg3yxIEDNRc0jwlNyDg/4Fil2QcSLjFKGVff0lAf1jjeaArlG/M75Ey/EYr/OJtBA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tapable": {
|
||||
"version": "2.2.2",
|
||||
@@ -9942,7 +9919,6 @@
|
||||
"integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
@@ -10336,7 +10312,6 @@
|
||||
"integrity": "sha512-BxAKBWmIbrDgrokdGZH1IgkIk/5mMHDreLDmCJ0qpyJaAteP8NvMhkwr/ZCQNqNH97bw/dANTE9PDzqwJghfMQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"esbuild": "^0.25.0",
|
||||
"fdir": "^6.5.0",
|
||||
@@ -10497,7 +10472,6 @@
|
||||
"integrity": "sha512-LUCP5ev3GURDysTWiP47wRRUpLKMOfPh+yKTx3kVIEiu5KOMeqzpnYNsKyOoVrULivR8tLcks4+lga33Whn90A==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@types/chai": "^5.2.2",
|
||||
"@vitest/expect": "3.2.4",
|
||||
@@ -10819,7 +10793,6 @@
|
||||
"resolved": "https://registry.npmjs.org/zod/-/zod-4.2.1.tgz",
|
||||
"integrity": "sha512-0wZ1IRqGGhMP76gLqz8EyfBXKk0J2qo2+H3fi4mcUP/KtTocoX08nmIAHl1Z2kJIZbZee8KOpBCSNPRgauucjw==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/colinhacks"
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
iconSize?: string;
|
||||
class?: string;
|
||||
disabled?: boolean;
|
||||
onclick: () => void;
|
||||
onclick: (e?: MouseEvent) => void;
|
||||
'aria-label'?: string;
|
||||
}
|
||||
|
||||
|
||||
+2
-1
@@ -65,7 +65,8 @@
|
||||
$effect(() => {
|
||||
if (conversationModel) {
|
||||
modelsStore.selectModelByName(conversationModel);
|
||||
} else if (isRouter && modelsStore.loadedModelIds.length > 0) {
|
||||
} else if (isRouter && !modelsStore.selectedModelId && modelsStore.loadedModelIds.length > 0) {
|
||||
// auto-select the first loaded model only when nothing is selected yet
|
||||
const first = modelOptions().find((m) => modelsStore.loadedModelIds.includes(m.model));
|
||||
if (first) modelsStore.selectModelById(first.id);
|
||||
}
|
||||
|
||||
+5
-12
@@ -3,6 +3,7 @@
|
||||
import { Button } from '$lib/components/ui/button';
|
||||
import { DialogConversationSelection, DialogConfirmation } from '$lib/components/app';
|
||||
import { createMessageCountMap } from '$lib/utils';
|
||||
import { ISO_DATE_TIME_SEPARATOR } from '$lib/constants';
|
||||
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
|
||||
import { toast } from 'svelte-sonner';
|
||||
|
||||
@@ -55,18 +56,10 @@
|
||||
})
|
||||
);
|
||||
|
||||
const blob = new Blob([JSON.stringify(allData, null, 2)], {
|
||||
type: 'application/json'
|
||||
});
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
|
||||
a.href = url;
|
||||
a.download = `conversations_${new Date().toISOString().split('T')[0]}.json`;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
conversationsStore.downloadConversationFile(
|
||||
allData,
|
||||
`${new Date().toISOString().split(ISO_DATE_TIME_SEPARATOR)[0]}_conversations.json`
|
||||
);
|
||||
|
||||
exportedConversations = selectedConversations;
|
||||
showExportSummary = true;
|
||||
|
||||
@@ -5,21 +5,38 @@
|
||||
import { serverStore } from '$lib/stores/server.svelte';
|
||||
import { modelsStore, modelOptions, modelsLoading } from '$lib/stores/models.svelte';
|
||||
import { formatFileSize, formatParameters, formatNumber } from '$lib/utils';
|
||||
import type { ApiLlamaCppServerProps } from '$lib/types';
|
||||
|
||||
interface Props {
|
||||
open?: boolean;
|
||||
onOpenChange?: (open: boolean) => void;
|
||||
// when set, fetch props from the child process (router mode)
|
||||
modelId?: string | null;
|
||||
}
|
||||
|
||||
let { open = $bindable(), onOpenChange }: Props = $props();
|
||||
let { open = $bindable(), onOpenChange, modelId = null }: Props = $props();
|
||||
|
||||
let serverProps = $derived(serverStore.props);
|
||||
let modelName = $derived(modelsStore.singleModelName);
|
||||
let isRouter = $derived(serverStore.isRouterMode);
|
||||
|
||||
// per-model props fetched from the child process
|
||||
let routerModelProps = $state<ApiLlamaCppServerProps | null>(null);
|
||||
let isLoadingRouterProps = $state(false);
|
||||
|
||||
// in router mode use per-model props, otherwise use global props
|
||||
let serverProps = $derived(isRouter && modelId ? routerModelProps : serverStore.props);
|
||||
|
||||
let modelName = $derived(isRouter && modelId ? modelId : modelsStore.singleModelName);
|
||||
let models = $derived(modelOptions());
|
||||
let isLoadingModels = $derived(modelsLoading());
|
||||
|
||||
// Get the first model for single-model mode display
|
||||
let firstModel = $derived(models[0] ?? null);
|
||||
// in router mode, find the model option matching modelId
|
||||
// in single mode, use the first model as before
|
||||
let firstModel = $derived.by(() => {
|
||||
if (isRouter && modelId) {
|
||||
return models.find((m) => m.model === modelId) ?? null;
|
||||
}
|
||||
return models[0] ?? null;
|
||||
});
|
||||
|
||||
// Get modalities from modelStore using the model ID from the first model
|
||||
let modalities = $derived.by(() => {
|
||||
@@ -33,10 +50,31 @@
|
||||
modelsStore.fetch();
|
||||
}
|
||||
});
|
||||
|
||||
// fetch per-model props from child process when dialog opens in router mode
|
||||
$effect(() => {
|
||||
if (open && isRouter && modelId) {
|
||||
isLoadingRouterProps = true;
|
||||
modelsStore
|
||||
.fetchModelProps(modelId)
|
||||
.then((props) => {
|
||||
routerModelProps = props;
|
||||
})
|
||||
.catch(() => {
|
||||
routerModelProps = null;
|
||||
})
|
||||
.finally(() => {
|
||||
isLoadingRouterProps = false;
|
||||
});
|
||||
}
|
||||
if (!open) {
|
||||
routerModelProps = null;
|
||||
}
|
||||
});
|
||||
</script>
|
||||
|
||||
<Dialog.Root bind:open {onOpenChange}>
|
||||
<Dialog.Content class="@container z-9999 !max-w-[60rem] max-w-full">
|
||||
<Dialog.Content class="@container z-9999 !max-h-[80dvh] !max-w-[60rem] max-w-full">
|
||||
<style>
|
||||
@container (max-width: 56rem) {
|
||||
.resizable-text-container {
|
||||
@@ -52,7 +90,7 @@
|
||||
</Dialog.Header>
|
||||
|
||||
<div class="space-y-6 py-4">
|
||||
{#if isLoadingModels}
|
||||
{#if isLoadingModels || isLoadingRouterProps}
|
||||
<div class="flex items-center justify-center py-8">
|
||||
<div class="text-sm text-muted-foreground">Loading model information...</div>
|
||||
</div>
|
||||
@@ -212,7 +250,7 @@
|
||||
<Table.Cell class="align-middle font-medium">Chat Template</Table.Cell>
|
||||
|
||||
<Table.Cell class="py-10">
|
||||
<div class="max-h-120 overflow-y-auto rounded-md bg-muted p-4">
|
||||
<div class="rounded-md bg-muted p-4">
|
||||
<pre
|
||||
class="font-mono text-xs whitespace-pre-wrap">{serverProps.chat_template}</pre>
|
||||
</div>
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
import { parseHeadersToArray, serializeHeaders } from '$lib/utils';
|
||||
import { UrlProtocol } from '$lib/enums';
|
||||
import { MCP_SERVER_URL_PLACEHOLDER } from '$lib/constants';
|
||||
import { mcpStore } from '$lib/stores/mcp.svelte';
|
||||
|
||||
interface Props {
|
||||
url: string;
|
||||
@@ -62,14 +63,33 @@
|
||||
{/if}
|
||||
|
||||
{#if !isWebSocket && onUseProxyChange}
|
||||
<label class="mt-3 flex cursor-pointer items-center gap-2">
|
||||
<label
|
||||
class="mt-3 flex items-start gap-2"
|
||||
class:cursor-pointer={mcpStore.isProxyAvailable}
|
||||
class:opacity-80={!mcpStore.isProxyAvailable}
|
||||
>
|
||||
<Switch
|
||||
class="mt-1"
|
||||
id="use-proxy-{id}"
|
||||
checked={useProxy}
|
||||
disabled={!mcpStore.isProxyAvailable}
|
||||
onCheckedChange={(checked) => onUseProxyChange?.(checked)}
|
||||
/>
|
||||
|
||||
<span class="text-xs text-muted-foreground">Use llama-server proxy</span>
|
||||
<span>
|
||||
<span class="text-xs text-muted-foreground">Use llama-server proxy</span>
|
||||
|
||||
<br />
|
||||
|
||||
{#if !mcpStore.isProxyAvailable}
|
||||
<span class="inline-flex gap-0.75 text-xs text-muted-foreground/60"
|
||||
>(Run <pre>llama-server</pre>
|
||||
with
|
||||
<pre>--webui-mcp-proxy</pre>
|
||||
flag)</span
|
||||
>
|
||||
{/if}
|
||||
</span>
|
||||
</label>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { onMount } from 'svelte';
|
||||
import { SvelteMap } from 'svelte/reactivity';
|
||||
import { ChevronDown, Loader2, Package } from '@lucide/svelte';
|
||||
import * as DropdownMenu from '$lib/components/ui/dropdown-menu';
|
||||
import * as Tooltip from '$lib/components/ui/tooltip';
|
||||
@@ -19,9 +18,11 @@
|
||||
DialogModelInformation,
|
||||
DropdownMenuSearchable,
|
||||
ModelId,
|
||||
ModelsSelectorList,
|
||||
ModelsSelectorOption
|
||||
} from '$lib/components/app';
|
||||
import type { ModelOption } from '$lib/types/models';
|
||||
import { filterModelOptions, groupModelOptions, type ModelItem } from './utils';
|
||||
|
||||
interface Props {
|
||||
class?: string;
|
||||
@@ -73,89 +74,13 @@
|
||||
let searchTerm = $state('');
|
||||
let highlightedIndex = $state<number>(-1);
|
||||
|
||||
let filteredOptions: ModelOption[] = $derived.by(() => {
|
||||
const term = searchTerm.trim().toLowerCase();
|
||||
if (!term) return options;
|
||||
let filteredOptions = $derived(filterModelOptions(options, searchTerm));
|
||||
|
||||
return options.filter(
|
||||
(option) =>
|
||||
option.model.toLowerCase().includes(term) ||
|
||||
option.name?.toLowerCase().includes(term) ||
|
||||
option.aliases?.some((alias: string) => alias.toLowerCase().includes(term)) ||
|
||||
option.tags?.some((tag: string) => tag.toLowerCase().includes(term))
|
||||
);
|
||||
});
|
||||
|
||||
let groupedFilteredOptions = $derived.by(() => {
|
||||
const favIds = modelsStore.favouriteModelIds;
|
||||
const result: {
|
||||
orgName: string | null;
|
||||
isFavouritesGroup: boolean;
|
||||
isLoadedGroup: boolean;
|
||||
items: { option: ModelOption; flatIndex: number }[];
|
||||
}[] = [];
|
||||
|
||||
// Loaded models group (top)
|
||||
const loadedItems: { option: ModelOption; flatIndex: number }[] = [];
|
||||
for (let i = 0; i < filteredOptions.length; i++) {
|
||||
if (modelsStore.isModelLoaded(filteredOptions[i].model)) {
|
||||
loadedItems.push({ option: filteredOptions[i], flatIndex: i });
|
||||
}
|
||||
}
|
||||
|
||||
if (loadedItems.length > 0) {
|
||||
result.push({
|
||||
orgName: null,
|
||||
isFavouritesGroup: false,
|
||||
isLoadedGroup: true,
|
||||
items: loadedItems
|
||||
});
|
||||
}
|
||||
|
||||
// Favourites group
|
||||
const loadedModelIds = new Set(loadedItems.map((item) => item.option.model));
|
||||
const favItems: { option: ModelOption; flatIndex: number }[] = [];
|
||||
for (let i = 0; i < filteredOptions.length; i++) {
|
||||
if (favIds.has(filteredOptions[i].model) && !loadedModelIds.has(filteredOptions[i].model)) {
|
||||
favItems.push({ option: filteredOptions[i], flatIndex: i });
|
||||
}
|
||||
}
|
||||
|
||||
if (favItems.length > 0) {
|
||||
result.push({
|
||||
orgName: null,
|
||||
isFavouritesGroup: true,
|
||||
isLoadedGroup: false,
|
||||
items: favItems
|
||||
});
|
||||
}
|
||||
|
||||
// Org groups (excluding loaded and favourites)
|
||||
const orgGroups = new SvelteMap<string, { option: ModelOption; flatIndex: number }[]>();
|
||||
for (let i = 0; i < filteredOptions.length; i++) {
|
||||
const option = filteredOptions[i];
|
||||
|
||||
if (loadedModelIds.has(option.model) || favIds.has(option.model)) continue;
|
||||
|
||||
const orgName = option.parsedId?.orgName ?? null;
|
||||
const key = orgName ?? '';
|
||||
|
||||
if (!orgGroups.has(key)) orgGroups.set(key, []);
|
||||
|
||||
orgGroups.get(key)!.push({ option, flatIndex: i });
|
||||
}
|
||||
|
||||
for (const [orgName, items] of orgGroups) {
|
||||
result.push({
|
||||
orgName: orgName || null,
|
||||
isFavouritesGroup: false,
|
||||
isLoadedGroup: false,
|
||||
items
|
||||
});
|
||||
}
|
||||
|
||||
return result;
|
||||
});
|
||||
let groupedFilteredOptions = $derived(
|
||||
groupModelOptions(filteredOptions, modelsStore.favouriteModelIds, (m) =>
|
||||
modelsStore.isModelLoaded(m)
|
||||
)
|
||||
);
|
||||
|
||||
$effect(() => {
|
||||
void searchTerm;
|
||||
@@ -164,6 +89,12 @@
|
||||
|
||||
let isOpen = $state(false);
|
||||
let showModelDialog = $state(false);
|
||||
let infoModelId = $state<string | null>(null);
|
||||
|
||||
function handleInfoClick(modelName: string) {
|
||||
infoModelId = modelName;
|
||||
showModelDialog = true;
|
||||
}
|
||||
|
||||
onMount(() => {
|
||||
modelsStore.fetch().catch((error) => {
|
||||
@@ -418,45 +349,39 @@
|
||||
<p class="px-4 py-3 text-sm text-muted-foreground">No models found.</p>
|
||||
{/if}
|
||||
|
||||
{#each groupedFilteredOptions as group (group.isLoadedGroup ? '__loaded__' : group.isFavouritesGroup ? '__favourites__' : group.orgName)}
|
||||
{#if group.isLoadedGroup}
|
||||
<p class="px-2 py-2 text-xs font-semibold text-muted-foreground/60 select-none">
|
||||
Loaded models
|
||||
</p>
|
||||
{:else if group.isFavouritesGroup}
|
||||
<p class="px-2 py-2 text-xs font-semibold text-muted-foreground/60 select-none">
|
||||
Favourite models
|
||||
</p>
|
||||
{:else if group.orgName}
|
||||
<p
|
||||
class="px-2 py-2 text-xs font-semibold text-muted-foreground/60 select-none [&:not(:first-child)]:mt-2"
|
||||
>
|
||||
{group.orgName}
|
||||
</p>
|
||||
{/if}
|
||||
{#snippet modelOption(item: ModelItem, showOrgName: boolean)}
|
||||
{@const { option, flatIndex } = item}
|
||||
{@const isSelected = currentModel === option.model || activeId === option.id}
|
||||
{@const isHighlighted = flatIndex === highlightedIndex}
|
||||
{@const isFav = modelsStore.favouriteModelIds.has(option.model)}
|
||||
|
||||
{#each group.items as { option, flatIndex } (group.isLoadedGroup ? `loaded-${option.id}` : group.isFavouritesGroup ? `fav-${option.id}` : option.id)}
|
||||
{@const isSelected = currentModel === option.model || activeId === option.id}
|
||||
{@const isHighlighted = flatIndex === highlightedIndex}
|
||||
{@const isFav = modelsStore.favouriteModelIds.has(option.model)}
|
||||
<ModelsSelectorOption
|
||||
{option}
|
||||
{isSelected}
|
||||
{isHighlighted}
|
||||
{isFav}
|
||||
{showOrgName}
|
||||
onSelect={handleSelect}
|
||||
onInfoClick={handleInfoClick}
|
||||
onMouseEnter={() => (highlightedIndex = flatIndex)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === KeyboardKey.ENTER || e.key === KeyboardKey.SPACE) {
|
||||
e.preventDefault();
|
||||
handleSelect(option.id);
|
||||
}
|
||||
}}
|
||||
/>
|
||||
{/snippet}
|
||||
|
||||
<ModelsSelectorOption
|
||||
{option}
|
||||
{isSelected}
|
||||
{isHighlighted}
|
||||
{isFav}
|
||||
showOrgName={group.isFavouritesGroup || group.isLoadedGroup}
|
||||
onSelect={handleSelect}
|
||||
onMouseEnter={() => (highlightedIndex = flatIndex)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === KeyboardKey.ENTER || e.key === KeyboardKey.SPACE) {
|
||||
e.preventDefault();
|
||||
handleSelect(option.id);
|
||||
}
|
||||
}}
|
||||
/>
|
||||
{/each}
|
||||
{/each}
|
||||
<ModelsSelectorList
|
||||
groups={groupedFilteredOptions}
|
||||
{currentModel}
|
||||
{activeId}
|
||||
sectionHeaderClass="my-1.5 px-2 py-2 text-[13px] font-semibold text-muted-foreground/70 select-none"
|
||||
onSelect={handleSelect}
|
||||
onInfoClick={handleInfoClick}
|
||||
renderOption={modelOption}
|
||||
/>
|
||||
</div>
|
||||
</DropdownMenuSearchable>
|
||||
</DropdownMenu.Content>
|
||||
@@ -500,6 +425,6 @@
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
{#if showModelDialog && !isRouter}
|
||||
<DialogModelInformation bind:open={showModelDialog} />
|
||||
{#if showModelDialog}
|
||||
<DialogModelInformation bind:open={showModelDialog} modelId={infoModelId} />
|
||||
{/if}
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
<script lang="ts">
|
||||
import { modelsStore } from '$lib/stores/models.svelte';
|
||||
import { ModelsSelectorOption } from '$lib/components/app';
|
||||
import type { GroupedModelOptions, ModelItem } from './utils';
|
||||
|
||||
interface Props {
|
||||
groups: GroupedModelOptions;
|
||||
currentModel: string | null;
|
||||
activeId: string | null;
|
||||
sectionHeaderClass?: string;
|
||||
orgHeaderClass?: string;
|
||||
onSelect: (modelId: string) => void;
|
||||
onInfoClick: (modelName: string) => void;
|
||||
renderOption?: import('svelte').Snippet<[ModelItem, boolean]>;
|
||||
}
|
||||
|
||||
let {
|
||||
groups,
|
||||
currentModel,
|
||||
activeId,
|
||||
sectionHeaderClass = 'my-1 px-2 py-2 text-[13px] font-semibold text-muted-foreground/70 select-none',
|
||||
orgHeaderClass = 'px-2 py-2 text-[11px] font-semibold text-muted-foreground/50 select-none [&:not(:first-child)]:mt-1',
|
||||
onSelect,
|
||||
onInfoClick,
|
||||
renderOption
|
||||
}: Props = $props();
|
||||
let render = $derived(renderOption ?? defaultOption);
|
||||
</script>
|
||||
|
||||
{#snippet defaultOption(item: ModelItem, showOrgName: boolean)}
|
||||
{@const { option } = item}
|
||||
{@const isSelected = currentModel === option.model || activeId === option.id}
|
||||
{@const isFav = modelsStore.favouriteModelIds.has(option.model)}
|
||||
|
||||
<ModelsSelectorOption
|
||||
{option}
|
||||
{isSelected}
|
||||
isHighlighted={false}
|
||||
{isFav}
|
||||
{showOrgName}
|
||||
{onSelect}
|
||||
{onInfoClick}
|
||||
onMouseEnter={() => {}}
|
||||
onKeyDown={() => {}}
|
||||
/>
|
||||
{/snippet}
|
||||
|
||||
{#if groups.loaded.length > 0}
|
||||
<p class={sectionHeaderClass}>Loaded models</p>
|
||||
{#each groups.loaded as item (`loaded-${item.option.id}`)}
|
||||
{@render render(item, true)}
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
{#if groups.favourites.length > 0}
|
||||
<p class={sectionHeaderClass}>Favourite models</p>
|
||||
{#each groups.favourites as item (`fav-${item.option.id}`)}
|
||||
{@render render(item, true)}
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
{#if groups.available.length > 0}
|
||||
<p class={sectionHeaderClass}>Available models</p>
|
||||
{#each groups.available as group (group.orgName)}
|
||||
{#if group.orgName}
|
||||
<p class={orgHeaderClass}>{group.orgName}</p>
|
||||
{/if}
|
||||
{#each group.items as item (item.option.id)}
|
||||
{@render render(item, false)}
|
||||
{/each}
|
||||
{/each}
|
||||
{/if}
|
||||
@@ -1,5 +1,14 @@
|
||||
<script lang="ts">
|
||||
import { CircleAlert, Heart, HeartOff, Loader2, Power, PowerOff, RotateCw } from '@lucide/svelte';
|
||||
import {
|
||||
CircleAlert,
|
||||
Heart,
|
||||
HeartOff,
|
||||
Info,
|
||||
Loader2,
|
||||
Power,
|
||||
PowerOff,
|
||||
RotateCw
|
||||
} from '@lucide/svelte';
|
||||
import { cn } from '$lib/components/ui/utils';
|
||||
import { ActionIcon, ModelId } from '$lib/components/app';
|
||||
import type { ModelOption } from '$lib/types/models';
|
||||
@@ -15,6 +24,7 @@
|
||||
onSelect: (modelId: string) => void;
|
||||
onMouseEnter: () => void;
|
||||
onKeyDown: (e: KeyboardEvent) => void;
|
||||
onInfoClick?: (modelName: string) => void;
|
||||
}
|
||||
|
||||
let {
|
||||
@@ -25,7 +35,8 @@
|
||||
showOrgName = false,
|
||||
onSelect,
|
||||
onMouseEnter,
|
||||
onKeyDown
|
||||
onKeyDown,
|
||||
onInfoClick
|
||||
}: Props = $props();
|
||||
|
||||
let currentRouterModels = $derived(routerModels());
|
||||
@@ -63,11 +74,11 @@
|
||||
class="flex-1"
|
||||
/>
|
||||
|
||||
<div class="flex shrink-0 items-center gap-2.5">
|
||||
<div class="flex shrink-0 items-center gap-1">
|
||||
<!-- svelte-ignore a11y_no_static_element_interactions -->
|
||||
<!-- svelte-ignore a11y_click_events_have_key_events -->
|
||||
<div
|
||||
class="pointer-events-none flex w-4 items-center justify-center pl-2 opacity-0 group-hover:pointer-events-auto group-hover:opacity-100"
|
||||
class="pointer-events-none flex items-center justify-center gap-0.75 pl-2 opacity-0 group-hover:pointer-events-auto group-hover:opacity-100"
|
||||
onclick={(e) => e.stopPropagation()}
|
||||
>
|
||||
{#if isFav}
|
||||
@@ -87,7 +98,19 @@
|
||||
onclick={() => modelsStore.toggleFavourite(option.model)}
|
||||
/>
|
||||
{/if}
|
||||
|
||||
<!-- info button: only shown when model is loaded and callback is provided -->
|
||||
{#if isLoaded && onInfoClick}
|
||||
<ActionIcon
|
||||
iconSize="h-2.5 w-2.5"
|
||||
icon={Info}
|
||||
tooltip="Model information"
|
||||
class="h-3 w-3 hover:text-foreground"
|
||||
onclick={() => onInfoClick(option.model)}
|
||||
/>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
{#if isLoading}
|
||||
<Loader2 class="h-4 w-4 animate-spin text-muted-foreground" />
|
||||
{:else if isFailed}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { onMount } from 'svelte';
|
||||
import { SvelteMap } from 'svelte/reactivity';
|
||||
import { ChevronDown, Loader2, Package } from '@lucide/svelte';
|
||||
import * as Sheet from '$lib/components/ui/sheet';
|
||||
import { cn } from '$lib/components/ui/utils';
|
||||
@@ -15,11 +14,12 @@
|
||||
import { isRouterMode } from '$lib/stores/server.svelte';
|
||||
import {
|
||||
DialogModelInformation,
|
||||
ModelsSelectorList,
|
||||
SearchInput,
|
||||
TruncatedText,
|
||||
ModelsSelectorOption
|
||||
TruncatedText
|
||||
} from '$lib/components/app';
|
||||
import type { ModelOption } from '$lib/types/models';
|
||||
import { filterModelOptions, groupModelOptions } from './utils';
|
||||
|
||||
interface Props {
|
||||
class?: string;
|
||||
@@ -73,85 +73,22 @@
|
||||
|
||||
let searchTerm = $state('');
|
||||
|
||||
let filteredOptions: ModelOption[] = $derived.by(() => {
|
||||
const term = searchTerm.trim().toLowerCase();
|
||||
if (!term) return options;
|
||||
let filteredOptions = $derived(filterModelOptions(options, searchTerm));
|
||||
|
||||
return options.filter(
|
||||
(option) =>
|
||||
option.model.toLowerCase().includes(term) ||
|
||||
option.name?.toLowerCase().includes(term) ||
|
||||
option.aliases?.some((alias: string) => alias.toLowerCase().includes(term)) ||
|
||||
option.tags?.some((tag: string) => tag.toLowerCase().includes(term))
|
||||
);
|
||||
});
|
||||
|
||||
let groupedFilteredOptions = $derived.by(() => {
|
||||
const favIds = modelsStore.favouriteModelIds;
|
||||
const result: {
|
||||
orgName: string | null;
|
||||
isFavouritesGroup: boolean;
|
||||
isLoadedGroup: boolean;
|
||||
items: { option: ModelOption; flatIndex: number }[];
|
||||
}[] = [];
|
||||
|
||||
// Loaded models group (top)
|
||||
const loadedItems: { option: ModelOption; flatIndex: number }[] = [];
|
||||
for (let i = 0; i < filteredOptions.length; i++) {
|
||||
if (modelsStore.isModelLoaded(filteredOptions[i].model)) {
|
||||
loadedItems.push({ option: filteredOptions[i], flatIndex: i });
|
||||
}
|
||||
}
|
||||
if (loadedItems.length > 0) {
|
||||
result.push({
|
||||
orgName: null,
|
||||
isFavouritesGroup: false,
|
||||
isLoadedGroup: true,
|
||||
items: loadedItems
|
||||
});
|
||||
}
|
||||
|
||||
// Favourites group
|
||||
const loadedModelIds = new Set(loadedItems.map((item) => item.option.model));
|
||||
const favItems: { option: ModelOption; flatIndex: number }[] = [];
|
||||
for (let i = 0; i < filteredOptions.length; i++) {
|
||||
if (favIds.has(filteredOptions[i].model) && !loadedModelIds.has(filteredOptions[i].model)) {
|
||||
favItems.push({ option: filteredOptions[i], flatIndex: i });
|
||||
}
|
||||
}
|
||||
if (favItems.length > 0) {
|
||||
result.push({
|
||||
orgName: null,
|
||||
isFavouritesGroup: true,
|
||||
isLoadedGroup: false,
|
||||
items: favItems
|
||||
});
|
||||
}
|
||||
|
||||
// Org groups (excluding loaded and favourites)
|
||||
const orgGroups = new SvelteMap<string, { option: ModelOption; flatIndex: number }[]>();
|
||||
for (let i = 0; i < filteredOptions.length; i++) {
|
||||
const option = filteredOptions[i];
|
||||
if (loadedModelIds.has(option.model) || favIds.has(option.model)) continue;
|
||||
const orgName = option.parsedId?.orgName ?? null;
|
||||
const key = orgName ?? '';
|
||||
if (!orgGroups.has(key)) orgGroups.set(key, []);
|
||||
orgGroups.get(key)!.push({ option, flatIndex: i });
|
||||
}
|
||||
for (const [orgName, items] of orgGroups) {
|
||||
result.push({
|
||||
orgName: orgName || null,
|
||||
isFavouritesGroup: false,
|
||||
isLoadedGroup: false,
|
||||
items
|
||||
});
|
||||
}
|
||||
|
||||
return result;
|
||||
});
|
||||
let groupedFilteredOptions = $derived(
|
||||
groupModelOptions(filteredOptions, modelsStore.favouriteModelIds, (m) =>
|
||||
modelsStore.isModelLoaded(m)
|
||||
)
|
||||
);
|
||||
|
||||
let sheetOpen = $state(false);
|
||||
let showModelDialog = $state(false);
|
||||
let infoModelId = $state<string | null>(null);
|
||||
|
||||
function handleInfoClick(modelName: string) {
|
||||
infoModelId = modelName;
|
||||
showModelDialog = true;
|
||||
}
|
||||
|
||||
onMount(() => {
|
||||
modelsStore.fetch().catch((error) => {
|
||||
@@ -339,38 +276,15 @@
|
||||
<p class="px-3 py-3 text-center text-sm text-muted-foreground">No models found.</p>
|
||||
{/if}
|
||||
|
||||
{#each groupedFilteredOptions as group (group.isLoadedGroup ? '__loaded__' : group.isFavouritesGroup ? '__favourites__' : group.orgName)}
|
||||
{#if group.isLoadedGroup}
|
||||
<p class="px-2 py-2 text-xs font-semibold text-muted-foreground/60 select-none">
|
||||
Loaded models
|
||||
</p>
|
||||
{:else if group.isFavouritesGroup}
|
||||
<p class="px-2 py-2 text-xs font-semibold text-muted-foreground/60 select-none">
|
||||
Favourite models
|
||||
</p>
|
||||
{:else if group.orgName}
|
||||
<p
|
||||
class="px-2 py-2 text-xs font-semibold text-muted-foreground/60 select-none [&:not(:first-child)]:mt-2"
|
||||
>
|
||||
{group.orgName}
|
||||
</p>
|
||||
{/if}
|
||||
|
||||
{#each group.items as { option } (group.isLoadedGroup ? `loaded-${option.id}` : group.isFavouritesGroup ? `fav-${option.id}` : option.id)}
|
||||
{@const isSelected = currentModel === option.model || activeId === option.id}
|
||||
{@const isFav = modelsStore.favouriteModelIds.has(option.model)}
|
||||
<ModelsSelectorOption
|
||||
{option}
|
||||
{isSelected}
|
||||
isHighlighted={false}
|
||||
{isFav}
|
||||
showOrgName={group.isFavouritesGroup || group.isLoadedGroup}
|
||||
onSelect={handleSelect}
|
||||
onMouseEnter={() => {}}
|
||||
onKeyDown={() => {}}
|
||||
/>
|
||||
{/each}
|
||||
{/each}
|
||||
<ModelsSelectorList
|
||||
groups={groupedFilteredOptions}
|
||||
{currentModel}
|
||||
{activeId}
|
||||
sectionHeaderClass="px-2 py-2 text-xs font-semibold text-muted-foreground/60 select-none"
|
||||
orgHeaderClass="px-2 py-2 text-xs font-semibold text-muted-foreground/60 select-none [&:not(:first-child)]:mt-2"
|
||||
onSelect={handleSelect}
|
||||
onInfoClick={handleInfoClick}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</Sheet.Content>
|
||||
@@ -403,6 +317,6 @@
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
{#if showModelDialog && !isRouter}
|
||||
<DialogModelInformation bind:open={showModelDialog} />
|
||||
{#if showModelDialog}
|
||||
<DialogModelInformation bind:open={showModelDialog} modelId={infoModelId} />
|
||||
{/if}
|
||||
|
||||
@@ -44,6 +44,27 @@
|
||||
*/
|
||||
export { default as ModelsSelector } from './ModelsSelector.svelte';
|
||||
|
||||
/**
|
||||
* **ModelsSelectorList** - Grouped model options list
|
||||
*
|
||||
* Renders grouped model options (loaded, favourites, available) with section
|
||||
* headers and org subgroups. Shared between ModelsSelector and ModelsSelectorSheet
|
||||
* to avoid template duplication.
|
||||
*
|
||||
* Accepts an optional `renderOption` snippet to customize how each option is
|
||||
* rendered (e.g., to add keyboard navigation or highlighting).
|
||||
*/
|
||||
export { default as ModelsSelectorList } from './ModelsSelectorList.svelte';
|
||||
|
||||
/**
|
||||
* **ModelsSelectorOption** - Single model option row
|
||||
*
|
||||
* Renders a single model option with selection state, favourite toggle,
|
||||
* load/unload actions, status indicators, and an info button.
|
||||
* Used inside ModelsSelectorList or directly in custom render snippets.
|
||||
*/
|
||||
export { default as ModelsSelectorOption } from './ModelsSelectorOption.svelte';
|
||||
|
||||
/**
|
||||
* **ModelsSelectorSheet** - Mobile model selection sheet
|
||||
*
|
||||
@@ -80,5 +101,12 @@ export { default as ModelsSelectorSheet } from './ModelsSelectorSheet.svelte';
|
||||
* ```
|
||||
*/
|
||||
export { default as ModelBadge } from './ModelBadge.svelte';
|
||||
|
||||
/**
|
||||
* **ModelId** - Parsed model identifier display
|
||||
*
|
||||
* Displays a model ID with optional org name, parameter badges, quantization,
|
||||
* aliases, and tags. Supports raw mode to show the unprocessed model name.
|
||||
* Respects the user's `showRawModelNames` setting.
|
||||
*/
|
||||
export { default as ModelId } from './ModelId.svelte';
|
||||
export { default as ModelsSelectorOption } from './ModelsSelectorOption.svelte';
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
import { SvelteMap } from 'svelte/reactivity';
|
||||
import type { ModelOption } from '$lib/types/models';
|
||||
|
||||
export interface ModelItem {
|
||||
option: ModelOption;
|
||||
flatIndex: number;
|
||||
}
|
||||
|
||||
export interface OrgGroup {
|
||||
orgName: string | null;
|
||||
items: ModelItem[];
|
||||
}
|
||||
|
||||
export interface GroupedModelOptions {
|
||||
loaded: ModelItem[];
|
||||
favourites: ModelItem[];
|
||||
available: OrgGroup[];
|
||||
}
|
||||
|
||||
export function filterModelOptions(options: ModelOption[], searchTerm: string): ModelOption[] {
|
||||
const term = searchTerm.trim().toLowerCase();
|
||||
if (!term) return options;
|
||||
|
||||
return options.filter(
|
||||
(option) =>
|
||||
option.model.toLowerCase().includes(term) ||
|
||||
option.name?.toLowerCase().includes(term) ||
|
||||
option.aliases?.some((alias: string) => alias.toLowerCase().includes(term)) ||
|
||||
option.tags?.some((tag: string) => tag.toLowerCase().includes(term))
|
||||
);
|
||||
}
|
||||
|
||||
export function groupModelOptions(
|
||||
filteredOptions: ModelOption[],
|
||||
favouriteIds: Set<string>,
|
||||
isModelLoaded: (model: string) => boolean
|
||||
): GroupedModelOptions {
|
||||
// Loaded models
|
||||
const loaded: ModelItem[] = [];
|
||||
for (let i = 0; i < filteredOptions.length; i++) {
|
||||
if (isModelLoaded(filteredOptions[i].model)) {
|
||||
loaded.push({ option: filteredOptions[i], flatIndex: i });
|
||||
}
|
||||
}
|
||||
|
||||
// Favourites (excluding loaded)
|
||||
const loadedModelIds = new Set(loaded.map((item) => item.option.model));
|
||||
const favourites: ModelItem[] = [];
|
||||
for (let i = 0; i < filteredOptions.length; i++) {
|
||||
if (
|
||||
favouriteIds.has(filteredOptions[i].model) &&
|
||||
!loadedModelIds.has(filteredOptions[i].model)
|
||||
) {
|
||||
favourites.push({ option: filteredOptions[i], flatIndex: i });
|
||||
}
|
||||
}
|
||||
|
||||
// Available models grouped by org (excluding loaded and favourites)
|
||||
const available: OrgGroup[] = [];
|
||||
const orgGroups = new SvelteMap<string, ModelItem[]>();
|
||||
for (let i = 0; i < filteredOptions.length; i++) {
|
||||
const option = filteredOptions[i];
|
||||
if (loadedModelIds.has(option.model) || favouriteIds.has(option.model)) continue;
|
||||
|
||||
const key = option.parsedId?.orgName ?? '';
|
||||
if (!orgGroups.has(key)) orgGroups.set(key, []);
|
||||
orgGroups.get(key)!.push({ option, flatIndex: i });
|
||||
}
|
||||
|
||||
for (const [orgName, items] of orgGroups) {
|
||||
available.push({ orgName: orgName || null, items });
|
||||
}
|
||||
|
||||
return { loaded, favourites, available };
|
||||
}
|
||||
@@ -24,6 +24,7 @@ export * from './max-bundle-size';
|
||||
export * from './mcp';
|
||||
export * from './mcp-form';
|
||||
export * from './mcp-resource';
|
||||
export * from './message-export';
|
||||
export * from './model-id';
|
||||
export * from './precision';
|
||||
export * from './processing-info';
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
// Conversation filename constants
|
||||
|
||||
// Length of the trimmed conversation ID in the filename
|
||||
export const EXPORT_CONV_ID_TRIM_LENGTH = 8;
|
||||
// Maximum length of the sanitized conversation name snippet
|
||||
export const EXPORT_CONV_NAME_SUFFIX_MAX_LENGTH = 20;
|
||||
// Characters to keep in the ISO timestamp. 19 keeps 2026-01-01T00:00:00
|
||||
export const ISO_TIMESTAMP_SLICE_LENGTH = 19;
|
||||
|
||||
// Replacements for making the conversation title filename-friendly
|
||||
export const NON_ALPHANUMERIC_REGEX = /[^a-z0-9]/gi;
|
||||
export const EXPORT_CONV_NONALNUM_REPLACEMENT = '_';
|
||||
export const MULTIPLE_UNDERSCORE_REGEX = /_+/g;
|
||||
|
||||
// Replacements to the ISO date for use in the export filename
|
||||
export const ISO_DATE_TIME_SEPARATOR = 'T';
|
||||
export const ISO_DATE_TIME_SEPARATOR_REPLACEMENT = '_';
|
||||
|
||||
export const ISO_TIME_SEPARATOR = ':';
|
||||
export const ISO_TIME_SEPARATOR_REPLACEMENT = '-';
|
||||
@@ -26,6 +26,18 @@ import { config } from '$lib/stores/settings.svelte';
|
||||
import { filterByLeafNodeId, findLeafNode } from '$lib/utils';
|
||||
import type { McpServerOverride } from '$lib/types/database';
|
||||
import { MessageRole } from '$lib/enums';
|
||||
import {
|
||||
ISO_DATE_TIME_SEPARATOR,
|
||||
ISO_DATE_TIME_SEPARATOR_REPLACEMENT,
|
||||
ISO_TIMESTAMP_SLICE_LENGTH,
|
||||
EXPORT_CONV_ID_TRIM_LENGTH,
|
||||
EXPORT_CONV_NONALNUM_REPLACEMENT,
|
||||
EXPORT_CONV_NAME_SUFFIX_MAX_LENGTH,
|
||||
ISO_TIME_SEPARATOR,
|
||||
ISO_TIME_SEPARATOR_REPLACEMENT,
|
||||
NON_ALPHANUMERIC_REGEX,
|
||||
MULTIPLE_UNDERSCORE_REGEX
|
||||
} from '$lib/constants';
|
||||
|
||||
class ConversationsStore {
|
||||
/**
|
||||
@@ -619,6 +631,66 @@ class ConversationsStore {
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* Generates a sanitized filename for a conversation export
|
||||
* @param conversation - The conversation metadata
|
||||
* @param msgs - Optional array of messages belonging to the conversation
|
||||
* @returns The generated filename string
|
||||
*/
|
||||
generateConversationFilename(
|
||||
conversation: { id?: string; name?: string },
|
||||
msgs?: DatabaseMessage[]
|
||||
): string {
|
||||
const conversationName = (conversation.name ?? '').trim().toLowerCase();
|
||||
|
||||
const sanitizedName = conversationName
|
||||
.replace(NON_ALPHANUMERIC_REGEX, EXPORT_CONV_NONALNUM_REPLACEMENT)
|
||||
.replace(MULTIPLE_UNDERSCORE_REGEX, '_')
|
||||
.substring(0, EXPORT_CONV_NAME_SUFFIX_MAX_LENGTH);
|
||||
|
||||
// If we have messages, use the timestamp of the newest message
|
||||
const referenceDate = msgs?.length
|
||||
? new Date(Math.max(...msgs.map((m) => m.timestamp)))
|
||||
: new Date();
|
||||
|
||||
const iso = referenceDate.toISOString().slice(0, ISO_TIMESTAMP_SLICE_LENGTH);
|
||||
const formattedDate = iso
|
||||
.replace(ISO_DATE_TIME_SEPARATOR, ISO_DATE_TIME_SEPARATOR_REPLACEMENT)
|
||||
.replaceAll(ISO_TIME_SEPARATOR, ISO_TIME_SEPARATOR_REPLACEMENT);
|
||||
const trimmedConvId = conversation.id?.slice(0, EXPORT_CONV_ID_TRIM_LENGTH) ?? '';
|
||||
return `${formattedDate}_conv_${trimmedConvId}_${sanitizedName}.json`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Triggers a browser download of the provided exported conversation data
|
||||
* @param data - The exported conversation payload (either a single conversation or array of them)
|
||||
* @param filename - Filename; if omitted, a deterministic name is generated
|
||||
*/
|
||||
downloadConversationFile(data: ExportedConversations, filename?: string): void {
|
||||
// Choose the first conversation or message
|
||||
const conversation =
|
||||
'conv' in data ? data.conv : Array.isArray(data) ? data[0]?.conv : undefined;
|
||||
const msgs =
|
||||
'messages' in data ? data.messages : Array.isArray(data) ? data[0]?.messages : undefined;
|
||||
|
||||
if (!conversation) {
|
||||
console.error('Invalid data: missing conversation');
|
||||
return;
|
||||
}
|
||||
|
||||
const downloadFilename = filename ?? this.generateConversationFilename(conversation, msgs);
|
||||
|
||||
const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = downloadFilename;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
}
|
||||
|
||||
/**
|
||||
* Downloads a conversation as JSON file.
|
||||
* @param convId - The conversation ID to download
|
||||
@@ -636,40 +708,7 @@ class ConversationsStore {
|
||||
messages = await DatabaseService.getConversationMessages(convId);
|
||||
}
|
||||
|
||||
this.triggerDownload({ conv: conversation, messages });
|
||||
}
|
||||
|
||||
/**
|
||||
* Exports all conversations with their messages as a JSON file
|
||||
* @returns The list of exported conversations
|
||||
*/
|
||||
async exportAllConversations(): Promise<DatabaseConversation[]> {
|
||||
const allConversations = await DatabaseService.getAllConversations();
|
||||
|
||||
if (allConversations.length === 0) {
|
||||
throw new Error('No conversations to export');
|
||||
}
|
||||
|
||||
const allData = await Promise.all(
|
||||
allConversations.map(async (conv) => {
|
||||
const messages = await DatabaseService.getConversationMessages(conv.id);
|
||||
return { conv, messages };
|
||||
})
|
||||
);
|
||||
|
||||
const blob = new Blob([JSON.stringify(allData, null, 2)], { type: 'application/json' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = `all_conversations_${new Date().toISOString().split('T')[0]}.json`;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
|
||||
toast.success(`All conversations (${allConversations.length}) prepared for download`);
|
||||
|
||||
return allConversations;
|
||||
this.downloadConversationFile({ conv: conversation, messages });
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -743,37 +782,6 @@ class ConversationsStore {
|
||||
await this.loadConversations();
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Triggers file download in browser
|
||||
*/
|
||||
private triggerDownload(data: ExportedConversations, filename?: string): void {
|
||||
const conversation =
|
||||
'conv' in data ? data.conv : Array.isArray(data) ? data[0]?.conv : undefined;
|
||||
|
||||
if (!conversation) {
|
||||
console.error('Invalid data: missing conversation');
|
||||
return;
|
||||
}
|
||||
|
||||
const conversationName = conversation.name?.trim() || '';
|
||||
const truncatedSuffix = conversationName
|
||||
.toLowerCase()
|
||||
.replace(/[^a-z0-9]/gi, '_')
|
||||
.replace(/_+/g, '_')
|
||||
.substring(0, 20);
|
||||
const downloadFilename = filename || `conversation_${conversation.id}_${truncatedSuffix}.json`;
|
||||
|
||||
const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = downloadFilename;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
}
|
||||
}
|
||||
|
||||
export const conversationsStore = new ConversationsStore();
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
*/
|
||||
|
||||
import { browser } from '$app/environment';
|
||||
import { base } from '$app/paths';
|
||||
import { MCPService } from '$lib/services/mcp.service';
|
||||
import { config, settingsStore } from '$lib/stores/settings.svelte';
|
||||
import { mcpResourceStore } from '$lib/stores/mcp-resources.svelte';
|
||||
@@ -42,6 +43,7 @@ import {
|
||||
ToolCallType
|
||||
} from '$lib/enums';
|
||||
import {
|
||||
CORS_PROXY_ENDPOINT,
|
||||
DEFAULT_CACHE_TTL_MS,
|
||||
DEFAULT_MCP_CONFIG,
|
||||
EXPECTED_THEMED_ICON_PAIR_COUNT,
|
||||
@@ -78,165 +80,13 @@ import type { ListChangedHandlers } from '@modelcontextprotocol/sdk/types.js';
|
||||
import type { DatabaseMessageExtraMcpResource, McpServerOverride } from '$lib/types/database';
|
||||
import type { SettingsConfigType } from '$lib/types/settings';
|
||||
|
||||
export function buildMcpClientConfig(
|
||||
cfg: SettingsConfigType,
|
||||
perChatOverrides?: McpServerOverride[]
|
||||
): MCPClientConfig | undefined {
|
||||
return buildMcpClientConfigInternal(cfg, perChatOverrides);
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal helper to build MCP client config.
|
||||
* Kept as standalone function for external use and tests.
|
||||
*/
|
||||
export function buildMcpClientConfigInternal(
|
||||
cfg: SettingsConfigType,
|
||||
perChatOverrides?: McpServerOverride[]
|
||||
): MCPClientConfig | undefined {
|
||||
const rawServers = parseServerSettings(cfg.mcpServers);
|
||||
if (!rawServers.length) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const servers: Record<string, MCPServerConfig> = {};
|
||||
|
||||
for (const [index, entry] of rawServers.entries()) {
|
||||
if (!checkServerEnabled(entry, perChatOverrides)) continue;
|
||||
const normalized = buildServerConfig(entry);
|
||||
if (normalized) servers[generateMcpServerId(entry.id, index)] = normalized;
|
||||
}
|
||||
|
||||
if (Object.keys(servers).length === 0) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
return {
|
||||
protocolVersion: DEFAULT_MCP_CONFIG.protocolVersion,
|
||||
capabilities: DEFAULT_MCP_CONFIG.capabilities,
|
||||
clientInfo: DEFAULT_MCP_CONFIG.clientInfo,
|
||||
requestTimeoutMs: Math.round(DEFAULT_MCP_CONFIG.requestTimeoutSeconds * 1000),
|
||||
servers
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a unique server ID from an optional ID string or index.
|
||||
* @deprecated Use MCPStore.#generateServerId instead
|
||||
*/
|
||||
function generateMcpServerId(id: unknown, index: number): string {
|
||||
if (typeof id === 'string' && id.trim()) {
|
||||
return id.trim();
|
||||
}
|
||||
|
||||
return `${MCP_SERVER_ID_PREFIX}-${index + 1}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses raw server settings from config into MCPServerSettingsEntry array.
|
||||
* @deprecated Use MCPStore.#parseServerSettings instead
|
||||
*/
|
||||
function parseServerSettings(rawServers: unknown): MCPServerSettingsEntry[] {
|
||||
if (!rawServers) {
|
||||
return [];
|
||||
}
|
||||
|
||||
let parsed: unknown;
|
||||
if (typeof rawServers === 'string') {
|
||||
const trimmed = rawServers.trim();
|
||||
if (!trimmed) {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
parsed = JSON.parse(trimmed);
|
||||
} catch (error) {
|
||||
console.warn('[MCP] Failed to parse mcpServers JSON:', error);
|
||||
|
||||
return [];
|
||||
}
|
||||
} else {
|
||||
parsed = rawServers;
|
||||
}
|
||||
if (!Array.isArray(parsed)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return parsed.map((entry, index) => {
|
||||
const url = typeof entry?.url === 'string' ? entry.url.trim() : '';
|
||||
const headers = typeof entry?.headers === 'string' ? entry.headers.trim() : undefined;
|
||||
|
||||
return {
|
||||
id: generateMcpServerId((entry as { id?: unknown })?.id, index),
|
||||
enabled: Boolean((entry as { enabled?: unknown })?.enabled),
|
||||
url,
|
||||
name: (entry as { name?: string })?.name,
|
||||
requestTimeoutSeconds: DEFAULT_MCP_CONFIG.requestTimeoutSeconds,
|
||||
headers: headers || undefined,
|
||||
useProxy: Boolean((entry as { useProxy?: unknown })?.useProxy)
|
||||
} satisfies MCPServerSettingsEntry;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds server configuration from a settings entry.
|
||||
* @deprecated Use MCPStore.#buildServerConfig instead
|
||||
*/
|
||||
function buildServerConfig(
|
||||
entry: MCPServerSettingsEntry,
|
||||
connectionTimeoutMs = DEFAULT_MCP_CONFIG.connectionTimeoutMs
|
||||
): MCPServerConfig | undefined {
|
||||
if (!entry?.url) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
let headers: Record<string, string> | undefined;
|
||||
if (entry.headers) {
|
||||
try {
|
||||
const parsed = JSON.parse(entry.headers);
|
||||
if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed))
|
||||
headers = parsed as Record<string, string>;
|
||||
} catch {
|
||||
console.warn('[MCP] Failed to parse custom headers JSON:', entry.headers);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
url: entry.url,
|
||||
transport: detectMcpTransportFromUrl(entry.url),
|
||||
handshakeTimeoutMs: connectionTimeoutMs,
|
||||
requestTimeoutMs: Math.round(entry.requestTimeoutSeconds * 1000),
|
||||
headers,
|
||||
useProxy: entry.useProxy
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a server is enabled, considering per-chat overrides.
|
||||
* @deprecated Use MCPStore.#checkServerEnabled instead
|
||||
*/
|
||||
function checkServerEnabled(
|
||||
server: MCPServerSettingsEntry,
|
||||
perChatOverrides?: McpServerOverride[]
|
||||
): boolean {
|
||||
if (!server.enabled) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (perChatOverrides) {
|
||||
const override = perChatOverrides.find((o) => o.serverId === server.id);
|
||||
|
||||
return override?.enabled ?? false;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
class MCPStore {
|
||||
private _isInitializing = $state(false);
|
||||
private _error = $state<string | null>(null);
|
||||
private _toolCount = $state(0);
|
||||
private _connectedServers = $state<string[]>([]);
|
||||
private _healthChecks = $state<Record<string, HealthCheckState>>({});
|
||||
private _proxyAvailable = $state(false);
|
||||
|
||||
private connections = new Map<string, MCPConnection>();
|
||||
private toolsIndex = new Map<string, string>();
|
||||
@@ -246,6 +96,29 @@ class MCPStore {
|
||||
private initPromise: Promise<boolean> | null = null;
|
||||
private activeFlowCount = 0;
|
||||
|
||||
constructor() {
|
||||
if (browser) {
|
||||
this.probeProxy();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Probes the CORS proxy endpoint to determine availability.
|
||||
* The endpoint is only registered when llama-server runs with --webui-mcp-proxy.
|
||||
*/
|
||||
async probeProxy(): Promise<void> {
|
||||
try {
|
||||
const response = await fetch(`${base}${CORS_PROXY_ENDPOINT}`, { method: 'HEAD' });
|
||||
this._proxyAvailable = response.status !== 404;
|
||||
} catch {
|
||||
this._proxyAvailable = false;
|
||||
}
|
||||
}
|
||||
|
||||
get isProxyAvailable(): boolean {
|
||||
return this._proxyAvailable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a unique server ID from an optional ID string or index.
|
||||
*/
|
||||
@@ -520,6 +393,7 @@ class MCPStore {
|
||||
|
||||
getServerLabel(server: MCPServerSettingsEntry): string {
|
||||
const healthState = this.getHealthCheckState(server.id);
|
||||
|
||||
if (healthState?.status === HealthCheckStatus.SUCCESS)
|
||||
return (
|
||||
healthState.serverInfo?.title || healthState.serverInfo?.name || server.name || server.url
|
||||
@@ -603,6 +477,7 @@ class MCPStore {
|
||||
*/
|
||||
#proxyIconSrc(src: string): string {
|
||||
if (src.startsWith('data:')) return src;
|
||||
if (!this._proxyAvailable) return src;
|
||||
|
||||
return getProxiedUrlString(src);
|
||||
}
|
||||
@@ -629,7 +504,7 @@ class MCPStore {
|
||||
}
|
||||
}
|
||||
|
||||
return getFaviconUrl(server.url);
|
||||
return getFaviconUrl(server.url, this._proxyAvailable);
|
||||
}
|
||||
|
||||
isAnyServerLoading(): boolean {
|
||||
@@ -2072,6 +1947,7 @@ export const mcpIsInitializing = () => mcpStore.isInitializing;
|
||||
export const mcpIsInitialized = () => mcpStore.isInitialized;
|
||||
export const mcpError = () => mcpStore.error;
|
||||
export const mcpIsEnabled = () => mcpStore.isEnabled;
|
||||
export const mcpIsProxyAvailable = () => mcpStore.isProxyAvailable;
|
||||
export const mcpAvailableTools = () => mcpStore.availableTools;
|
||||
export const mcpConnectedServerCount = () => mcpStore.connectedServerCount;
|
||||
export const mcpConnectedServerNames = () => mcpStore.connectedServerNames;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
/**
|
||||
* Utility functions for conversation data manipulation
|
||||
*/
|
||||
import type { DatabaseMessage } from '$lib/types';
|
||||
|
||||
/**
|
||||
* Creates a map of conversation IDs to their message counts from exported conversation data
|
||||
|
||||
@@ -17,7 +17,7 @@ import {
|
||||
* @param urlString - The URL to get the favicon for
|
||||
* @returns The favicon URL or null if invalid
|
||||
*/
|
||||
export function getFaviconUrl(urlString: string): string | null {
|
||||
export function getFaviconUrl(urlString: string, useProxy = true): string | null {
|
||||
try {
|
||||
const url = new URL(urlString);
|
||||
const hostnameParts = url.hostname.split(DOMAIN_SEPARATOR);
|
||||
@@ -27,7 +27,7 @@ export function getFaviconUrl(urlString: string): string | null {
|
||||
: url.hostname;
|
||||
|
||||
const googleFaviconUrl = `${GOOGLE_FAVICON_BASE_URL}?domain=${rootDomain}&sz=${DEFAULT_FAVICON_SIZE}`;
|
||||
return getProxiedUrlString(googleFaviconUrl);
|
||||
return useProxy ? getProxiedUrlString(googleFaviconUrl) : googleFaviconUrl;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -231,7 +231,7 @@
|
||||
<Sidebar.Trigger
|
||||
class="transition-left absolute left-0 z-[900] duration-200 ease-linear {sidebarOpen
|
||||
? 'md:left-[var(--sidebar-width)]'
|
||||
: ''}"
|
||||
: 'md:left-0!'}"
|
||||
style="translate: 1rem 1rem;"
|
||||
/>
|
||||
{/if}
|
||||
|
||||
Vendored
+271
-116
@@ -1025,6 +1025,30 @@ bool is_valid_path(const std::string &path) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool canonicalize_path(const char *path, std::string &resolved) {
|
||||
#if defined(_WIN32)
|
||||
char buf[_MAX_PATH];
|
||||
if (_fullpath(buf, path, _MAX_PATH) == nullptr) { return false; }
|
||||
resolved = buf;
|
||||
#else
|
||||
char buf[PATH_MAX];
|
||||
if (realpath(path, buf) == nullptr) { return false; }
|
||||
resolved = buf;
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
bool is_path_within_base(const std::string &resolved_path,
|
||||
const std::string &resolved_base) {
|
||||
#if defined(_WIN32)
|
||||
return _strnicmp(resolved_path.c_str(), resolved_base.c_str(),
|
||||
resolved_base.size()) == 0;
|
||||
#else
|
||||
return strncmp(resolved_path.c_str(), resolved_base.c_str(),
|
||||
resolved_base.size()) == 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
FileStat::FileStat(const std::string &path) {
|
||||
#if defined(_WIN32)
|
||||
auto wpath = u8string_to_wstring(path.c_str());
|
||||
@@ -2627,33 +2651,114 @@ bool can_compress_content_type(const std::string &content_type) {
|
||||
}
|
||||
}
|
||||
|
||||
bool parse_quality(const char *b, const char *e, std::string &token,
|
||||
double &quality) {
|
||||
quality = 1.0;
|
||||
token.clear();
|
||||
|
||||
// Split on first ';': left = token name, right = parameters
|
||||
const char *params_b = nullptr;
|
||||
std::size_t params_len = 0;
|
||||
|
||||
divide(
|
||||
b, static_cast<std::size_t>(e - b), ';',
|
||||
[&](const char *lb, std::size_t llen, const char *rb, std::size_t rlen) {
|
||||
auto r = trim(lb, lb + llen, 0, llen);
|
||||
if (r.first < r.second) { token.assign(lb + r.first, lb + r.second); }
|
||||
params_b = rb;
|
||||
params_len = rlen;
|
||||
});
|
||||
|
||||
if (token.empty()) { return false; }
|
||||
if (params_len == 0) { return true; }
|
||||
|
||||
// Scan parameters for q= (stops on first match)
|
||||
bool invalid = false;
|
||||
split_find(params_b, params_b + params_len, ';',
|
||||
(std::numeric_limits<size_t>::max)(),
|
||||
[&](const char *pb, const char *pe) -> bool {
|
||||
// Match exactly "q=" or "Q=" (not "query=" etc.)
|
||||
auto len = static_cast<size_t>(pe - pb);
|
||||
if (len < 2) { return false; }
|
||||
if ((pb[0] != 'q' && pb[0] != 'Q') || pb[1] != '=') {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Trim the value portion
|
||||
auto r = trim(pb, pe, 2, len);
|
||||
if (r.first >= r.second) {
|
||||
invalid = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
double v = 0.0;
|
||||
auto res = from_chars(pb + r.first, pb + r.second, v);
|
||||
if (res.ec != std::errc{} || v < 0.0 || v > 1.0) {
|
||||
invalid = true;
|
||||
return true;
|
||||
}
|
||||
quality = v;
|
||||
return true;
|
||||
});
|
||||
|
||||
return !invalid;
|
||||
}
|
||||
|
||||
EncodingType encoding_type(const Request &req, const Response &res) {
|
||||
auto ret =
|
||||
detail::can_compress_content_type(res.get_header_value("Content-Type"));
|
||||
if (!ret) { return EncodingType::None; }
|
||||
if (!can_compress_content_type(res.get_header_value("Content-Type"))) {
|
||||
return EncodingType::None;
|
||||
}
|
||||
|
||||
const auto &s = req.get_header_value("Accept-Encoding");
|
||||
(void)(s);
|
||||
if (s.empty()) { return EncodingType::None; }
|
||||
|
||||
// Single-pass: iterate tokens and track the best supported encoding.
|
||||
// Server preference breaks ties (br > gzip > zstd).
|
||||
EncodingType best = EncodingType::None;
|
||||
double best_q = 0.0; // q=0 means "not acceptable"
|
||||
|
||||
// Server preference: Brotli > Gzip > Zstd (lower = more preferred)
|
||||
auto priority = [](EncodingType t) -> int {
|
||||
switch (t) {
|
||||
case EncodingType::Brotli: return 0;
|
||||
case EncodingType::Gzip: return 1;
|
||||
case EncodingType::Zstd: return 2;
|
||||
default: return 3;
|
||||
}
|
||||
};
|
||||
|
||||
std::string name;
|
||||
split(s.data(), s.data() + s.size(), ',', [&](const char *b, const char *e) {
|
||||
double quality = 1.0;
|
||||
if (!parse_quality(b, e, name, quality)) { return; }
|
||||
if (quality <= 0.0) { return; }
|
||||
|
||||
EncodingType type = EncodingType::None;
|
||||
#ifdef CPPHTTPLIB_BROTLI_SUPPORT
|
||||
// TODO: 'Accept-Encoding' has br, not br;q=0
|
||||
ret = s.find("br") != std::string::npos;
|
||||
if (ret) { return EncodingType::Brotli; }
|
||||
if (case_ignore::equal(name, "br")) { type = EncodingType::Brotli; }
|
||||
#endif
|
||||
|
||||
#ifdef CPPHTTPLIB_ZLIB_SUPPORT
|
||||
// TODO: 'Accept-Encoding' has gzip, not gzip;q=0
|
||||
ret = s.find("gzip") != std::string::npos;
|
||||
if (ret) { return EncodingType::Gzip; }
|
||||
if (type == EncodingType::None && case_ignore::equal(name, "gzip")) {
|
||||
type = EncodingType::Gzip;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CPPHTTPLIB_ZSTD_SUPPORT
|
||||
// TODO: 'Accept-Encoding' has zstd, not zstd;q=0
|
||||
ret = s.find("zstd") != std::string::npos;
|
||||
if (ret) { return EncodingType::Zstd; }
|
||||
if (type == EncodingType::None && case_ignore::equal(name, "zstd")) {
|
||||
type = EncodingType::Zstd;
|
||||
}
|
||||
#endif
|
||||
|
||||
return EncodingType::None;
|
||||
if (type == EncodingType::None) { return; }
|
||||
|
||||
// Higher q-value wins; for equal q, server preference breaks ties
|
||||
if (quality > best_q ||
|
||||
(quality == best_q && priority(type) < priority(best))) {
|
||||
best_q = quality;
|
||||
best = type;
|
||||
}
|
||||
});
|
||||
|
||||
return best;
|
||||
}
|
||||
|
||||
bool nocompressor::compress(const char *data, size_t data_length,
|
||||
@@ -2937,6 +3042,21 @@ create_decompressor(const std::string &encoding) {
|
||||
return decompressor;
|
||||
}
|
||||
|
||||
// Returns the best available compressor and its Content-Encoding name.
|
||||
// Priority: Brotli > Gzip > Zstd (matches server-side preference).
|
||||
std::pair<std::unique_ptr<compressor>, const char *>
|
||||
create_compressor() {
|
||||
#ifdef CPPHTTPLIB_BROTLI_SUPPORT
|
||||
return {detail::make_unique<brotli_compressor>(), "br"};
|
||||
#elif defined(CPPHTTPLIB_ZLIB_SUPPORT)
|
||||
return {detail::make_unique<gzip_compressor>(), "gzip"};
|
||||
#elif defined(CPPHTTPLIB_ZSTD_SUPPORT)
|
||||
return {detail::make_unique<zstd_compressor>(), "zstd"};
|
||||
#else
|
||||
return {nullptr, nullptr};
|
||||
#endif
|
||||
}
|
||||
|
||||
bool is_prohibited_header_name(const std::string &name) {
|
||||
using udl::operator""_t;
|
||||
|
||||
@@ -3769,7 +3889,7 @@ bool parse_accept_header(const std::string &s,
|
||||
struct AcceptEntry {
|
||||
std::string media_type;
|
||||
double quality;
|
||||
int order; // Original order in header
|
||||
int order;
|
||||
};
|
||||
|
||||
std::vector<AcceptEntry> entries;
|
||||
@@ -3787,48 +3907,12 @@ bool parse_accept_header(const std::string &s,
|
||||
}
|
||||
|
||||
AcceptEntry accept_entry;
|
||||
accept_entry.quality = 1.0; // Default quality
|
||||
accept_entry.order = order++;
|
||||
|
||||
// Find q= parameter
|
||||
auto q_pos = entry.find(";q=");
|
||||
if (q_pos == std::string::npos) { q_pos = entry.find("; q="); }
|
||||
|
||||
if (q_pos != std::string::npos) {
|
||||
// Extract media type (before q parameter)
|
||||
accept_entry.media_type = trim_copy(entry.substr(0, q_pos));
|
||||
|
||||
// Extract quality value
|
||||
auto q_start = entry.find('=', q_pos) + 1;
|
||||
auto q_end = entry.find(';', q_start);
|
||||
if (q_end == std::string::npos) { q_end = entry.length(); }
|
||||
|
||||
std::string quality_str =
|
||||
trim_copy(entry.substr(q_start, q_end - q_start));
|
||||
if (quality_str.empty()) {
|
||||
has_invalid_entry = true;
|
||||
return;
|
||||
}
|
||||
|
||||
{
|
||||
double v = 0.0;
|
||||
auto res = detail::from_chars(
|
||||
quality_str.data(), quality_str.data() + quality_str.size(), v);
|
||||
if (res.ec == std::errc{}) {
|
||||
accept_entry.quality = v;
|
||||
} else {
|
||||
has_invalid_entry = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Check if quality is in valid range [0.0, 1.0]
|
||||
if (accept_entry.quality < 0.0 || accept_entry.quality > 1.0) {
|
||||
has_invalid_entry = true;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
// No quality parameter, use entire entry as media type
|
||||
accept_entry.media_type = entry;
|
||||
if (!parse_quality(entry.data(), entry.data() + entry.size(),
|
||||
accept_entry.media_type, accept_entry.quality)) {
|
||||
has_invalid_entry = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Remove additional parameters from media type
|
||||
@@ -5481,7 +5565,8 @@ std::string decode_path_component(const std::string &component) {
|
||||
// Unicode %uXXXX encoding
|
||||
auto val = 0;
|
||||
if (detail::from_hex_to_i(component, i + 2, 4, val)) {
|
||||
// 4 digits Unicode codes
|
||||
// 4 digits Unicode codes: val is 0x0000-0xFFFF (from 4 hex digits),
|
||||
// so to_utf8 writes at most 3 bytes. buff[4] is safe.
|
||||
char buff[4];
|
||||
size_t len = detail::to_utf8(val, buff);
|
||||
if (len > 0) { result.append(buff, len); }
|
||||
@@ -5586,6 +5671,30 @@ std::string decode_query_component(const std::string &component,
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string sanitize_filename(const std::string &filename) {
|
||||
// Extract basename: find the last path separator (/ or \)
|
||||
auto pos = filename.find_last_of("/\\");
|
||||
auto result =
|
||||
(pos != std::string::npos) ? filename.substr(pos + 1) : filename;
|
||||
|
||||
// Strip null bytes
|
||||
result.erase(std::remove(result.begin(), result.end(), '\0'), result.end());
|
||||
|
||||
// Trim whitespace
|
||||
{
|
||||
auto start = result.find_first_not_of(" \t");
|
||||
auto end = result.find_last_not_of(" \t");
|
||||
result = (start == std::string::npos)
|
||||
? ""
|
||||
: result.substr(start, end - start + 1);
|
||||
}
|
||||
|
||||
// Reject . and ..
|
||||
if (result == "." || result == "..") { return ""; }
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string append_query_params(const std::string &path,
|
||||
const Params ¶ms) {
|
||||
std::string path_with_query = path;
|
||||
@@ -6714,7 +6823,18 @@ bool Server::set_mount_point(const std::string &mount_point,
|
||||
if (stat.is_dir()) {
|
||||
std::string mnt = !mount_point.empty() ? mount_point : "/";
|
||||
if (!mnt.empty() && mnt[0] == '/') {
|
||||
base_dirs_.push_back({std::move(mnt), dir, std::move(headers)});
|
||||
std::string resolved_base;
|
||||
if (detail::canonicalize_path(dir.c_str(), resolved_base)) {
|
||||
#if defined(_WIN32)
|
||||
if (resolved_base.back() != '\\' && resolved_base.back() != '/') {
|
||||
resolved_base += '\\';
|
||||
}
|
||||
#else
|
||||
if (resolved_base.back() != '/') { resolved_base += '/'; }
|
||||
#endif
|
||||
}
|
||||
base_dirs_.push_back(
|
||||
{std::move(mnt), dir, std::move(resolved_base), std::move(headers)});
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -6874,6 +6994,20 @@ Server &Server::set_payload_max_length(size_t length) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
Server &Server::set_websocket_ping_interval(time_t sec) {
|
||||
websocket_ping_interval_sec_ = sec;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <class Rep, class Period>
|
||||
Server &Server::set_websocket_ping_interval(
|
||||
const std::chrono::duration<Rep, Period> &duration) {
|
||||
detail::duration_to_sec_and_usec(duration, [&](time_t sec, time_t /*usec*/) {
|
||||
set_websocket_ping_interval(sec);
|
||||
});
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool Server::bind_to_port(const std::string &host, int port,
|
||||
int socket_flags) {
|
||||
auto ret = bind_internal(host, port, socket_flags);
|
||||
@@ -7294,6 +7428,18 @@ bool Server::handle_file_request(Request &req, Response &res) {
|
||||
auto path = entry.base_dir + sub_path;
|
||||
if (path.back() == '/') { path += "index.html"; }
|
||||
|
||||
// Defense-in-depth: is_valid_path blocks ".." traversal in the URL,
|
||||
// but symlinks/junctions can still escape the base directory.
|
||||
if (!entry.resolved_base_dir.empty()) {
|
||||
std::string resolved_path;
|
||||
if (detail::canonicalize_path(path.c_str(), resolved_path) &&
|
||||
!detail::is_path_within_base(resolved_path,
|
||||
entry.resolved_base_dir)) {
|
||||
res.status = StatusCode::Forbidden_403;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
detail::FileStat stat(path);
|
||||
|
||||
if (stat.is_dir()) {
|
||||
@@ -8012,7 +8158,7 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
|
||||
{
|
||||
// Use WebSocket-specific read timeout instead of HTTP timeout
|
||||
strm.set_read_timeout(CPPHTTPLIB_WEBSOCKET_READ_TIMEOUT_SECOND, 0);
|
||||
ws::WebSocket ws(strm, req, true);
|
||||
ws::WebSocket ws(strm, req, true, websocket_ping_interval_sec_);
|
||||
entry.handler(req, ws);
|
||||
}
|
||||
return true;
|
||||
@@ -8256,6 +8402,13 @@ bool ClientImpl::ensure_socket_connection(Socket &socket, Error &error) {
|
||||
return create_and_connect_socket(socket, error);
|
||||
}
|
||||
|
||||
bool ClientImpl::setup_proxy_connection(
|
||||
Socket & /*socket*/,
|
||||
std::chrono::time_point<std::chrono::steady_clock> /*start_time*/,
|
||||
Response & /*res*/, bool & /*success*/, Error & /*error*/) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void ClientImpl::shutdown_ssl(Socket & /*socket*/,
|
||||
bool /*shutdown_gracefully*/) {
|
||||
// If there are any requests in flight from threads other than us, then it's
|
||||
@@ -8377,27 +8530,14 @@ bool ClientImpl::send_(Request &req, Response &res, Error &error) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
// TODO: refactoring
|
||||
if (is_ssl()) {
|
||||
auto &scli = static_cast<SSLClient &>(*this);
|
||||
if (!proxy_host_.empty() && proxy_port_ != -1) {
|
||||
auto success = false;
|
||||
if (!scli.connect_with_proxy(socket_, req.start_time_, res, success,
|
||||
error)) {
|
||||
if (!success) { output_error_log(error, &req); }
|
||||
return success;
|
||||
}
|
||||
}
|
||||
|
||||
if (!proxy_host_.empty() && proxy_port_ != -1) {
|
||||
if (!scli.initialize_ssl(socket_, error)) {
|
||||
output_error_log(error, &req);
|
||||
return false;
|
||||
}
|
||||
{
|
||||
auto success = true;
|
||||
if (!setup_proxy_connection(socket_, req.start_time_, res, success,
|
||||
error)) {
|
||||
if (!success) { output_error_log(error, &req); }
|
||||
return success;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Mark the current socket as being in use so that it cannot be closed by
|
||||
@@ -8558,17 +8698,15 @@ ClientImpl::open_stream(const std::string &method, const std::string &path,
|
||||
return handle;
|
||||
}
|
||||
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
if (is_ssl()) {
|
||||
auto &scli = static_cast<SSLClient &>(*this);
|
||||
if (!proxy_host_.empty() && proxy_port_ != -1) {
|
||||
if (!scli.initialize_ssl(socket_, handle.error)) {
|
||||
handle.response.reset();
|
||||
return handle;
|
||||
}
|
||||
{
|
||||
auto success = true;
|
||||
auto start_time = std::chrono::steady_clock::now();
|
||||
if (!setup_proxy_connection(socket_, start_time, *handle.response,
|
||||
success, handle.error)) {
|
||||
if (!success) { handle.response.reset(); }
|
||||
return handle;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
transfer_socket_ownership_to_handle(handle);
|
||||
@@ -8847,7 +8985,7 @@ bool ClientImpl::handle_request(Stream &strm, Request &req,
|
||||
|
||||
if (res.get_header_value("Connection") == "close" ||
|
||||
(res.version == "HTTP/1.0" && res.reason != "Connection established")) {
|
||||
// TODO this requires a not-entirely-obvious chain of calls to be correct
|
||||
// NOTE: this requires a not-entirely-obvious chain of calls to be correct
|
||||
// for this to be safe.
|
||||
|
||||
// This is safe to call because handle_request is only called by send_
|
||||
@@ -9086,14 +9224,9 @@ bool ClientImpl::write_content_with_provider(Stream &strm,
|
||||
auto is_shutting_down = []() { return false; };
|
||||
|
||||
if (req.is_chunked_content_provider_) {
|
||||
// TODO: Brotli support
|
||||
std::unique_ptr<detail::compressor> compressor;
|
||||
#ifdef CPPHTTPLIB_ZLIB_SUPPORT
|
||||
if (compress_) {
|
||||
compressor = detail::make_unique<detail::gzip_compressor>();
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
auto compressor = compress_ ? detail::create_compressor().first
|
||||
: std::unique_ptr<detail::compressor>();
|
||||
if (!compressor) {
|
||||
compressor = detail::make_unique<detail::nocompressor>();
|
||||
}
|
||||
|
||||
@@ -9324,14 +9457,15 @@ ClientImpl::send_with_content_provider_and_receiver(
|
||||
Error &error) {
|
||||
if (!content_type.empty()) { req.set_header("Content-Type", content_type); }
|
||||
|
||||
#ifdef CPPHTTPLIB_ZLIB_SUPPORT
|
||||
if (compress_) { req.set_header("Content-Encoding", "gzip"); }
|
||||
#endif
|
||||
auto enc = compress_
|
||||
? detail::create_compressor()
|
||||
: std::pair<std::unique_ptr<detail::compressor>, const char *>(
|
||||
nullptr, nullptr);
|
||||
|
||||
#ifdef CPPHTTPLIB_ZLIB_SUPPORT
|
||||
if (compress_ && !content_provider_without_length) {
|
||||
// TODO: Brotli support
|
||||
detail::gzip_compressor compressor;
|
||||
if (enc.second) { req.set_header("Content-Encoding", enc.second); }
|
||||
|
||||
if (enc.first && !content_provider_without_length) {
|
||||
auto &compressor = enc.first;
|
||||
|
||||
if (content_provider) {
|
||||
auto ok = true;
|
||||
@@ -9342,7 +9476,7 @@ ClientImpl::send_with_content_provider_and_receiver(
|
||||
if (ok) {
|
||||
auto last = offset + data_len == content_length;
|
||||
|
||||
auto ret = compressor.compress(
|
||||
auto ret = compressor->compress(
|
||||
data, data_len, last,
|
||||
[&](const char *compressed_data, size_t compressed_data_len) {
|
||||
req.body.append(compressed_data, compressed_data_len);
|
||||
@@ -9366,19 +9500,17 @@ ClientImpl::send_with_content_provider_and_receiver(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (!compressor.compress(body, content_length, true,
|
||||
[&](const char *data, size_t data_len) {
|
||||
req.body.append(data, data_len);
|
||||
return true;
|
||||
})) {
|
||||
if (!compressor->compress(body, content_length, true,
|
||||
[&](const char *data, size_t data_len) {
|
||||
req.body.append(data, data_len);
|
||||
return true;
|
||||
})) {
|
||||
error = Error::Compression;
|
||||
output_error_log(error, &req);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
} else {
|
||||
if (content_provider) {
|
||||
req.content_length_ = content_length;
|
||||
req.content_provider_ = std::move(content_provider);
|
||||
@@ -11545,6 +11677,24 @@ bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) {
|
||||
return ClientImpl::create_and_connect_socket(socket, error);
|
||||
}
|
||||
|
||||
bool SSLClient::setup_proxy_connection(
|
||||
Socket &socket,
|
||||
std::chrono::time_point<std::chrono::steady_clock> start_time,
|
||||
Response &res, bool &success, Error &error) {
|
||||
if (proxy_host_.empty() || proxy_port_ == -1) { return true; }
|
||||
|
||||
if (!connect_with_proxy(socket, start_time, res, success, error)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!initialize_ssl(socket, error)) {
|
||||
success = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Assumes that socket_mutex_ is locked and that there are no requests in
|
||||
// flight
|
||||
bool SSLClient::connect_with_proxy(
|
||||
@@ -16061,11 +16211,11 @@ WebSocket::~WebSocket() {
|
||||
}
|
||||
|
||||
void WebSocket::start_heartbeat() {
|
||||
if (ping_interval_sec_ == 0) { return; }
|
||||
ping_thread_ = std::thread([this]() {
|
||||
std::unique_lock<std::mutex> lock(ping_mutex_);
|
||||
while (!closed_) {
|
||||
ping_cv_.wait_for(lock, std::chrono::seconds(
|
||||
CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND));
|
||||
ping_cv_.wait_for(lock, std::chrono::seconds(ping_interval_sec_));
|
||||
if (closed_) { break; }
|
||||
lock.unlock();
|
||||
if (!send_frame(Opcode::Ping, nullptr, 0)) {
|
||||
@@ -16203,7 +16353,8 @@ bool WebSocketClient::connect() {
|
||||
Request req;
|
||||
req.method = "GET";
|
||||
req.path = path_;
|
||||
ws_ = std::unique_ptr<WebSocket>(new WebSocket(std::move(strm), req, false));
|
||||
ws_ = std::unique_ptr<WebSocket>(
|
||||
new WebSocket(std::move(strm), req, false, websocket_ping_interval_sec_));
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -16243,6 +16394,10 @@ void WebSocketClient::set_write_timeout(time_t sec, time_t usec) {
|
||||
write_timeout_usec_ = usec;
|
||||
}
|
||||
|
||||
void WebSocketClient::set_websocket_ping_interval(time_t sec) {
|
||||
websocket_ping_interval_sec_ = sec;
|
||||
}
|
||||
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
|
||||
void WebSocketClient::set_ca_cert_path(const std::string &path) {
|
||||
|
||||
Vendored
+33
-7
@@ -8,8 +8,8 @@
|
||||
#ifndef CPPHTTPLIB_HTTPLIB_H
|
||||
#define CPPHTTPLIB_HTTPLIB_H
|
||||
|
||||
#define CPPHTTPLIB_VERSION "0.37.2"
|
||||
#define CPPHTTPLIB_VERSION_NUM "0x002502"
|
||||
#define CPPHTTPLIB_VERSION "0.38.0"
|
||||
#define CPPHTTPLIB_VERSION_NUM "0x002600"
|
||||
|
||||
#ifdef _WIN32
|
||||
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
|
||||
@@ -1666,6 +1666,11 @@ public:
|
||||
|
||||
Server &set_payload_max_length(size_t length);
|
||||
|
||||
Server &set_websocket_ping_interval(time_t sec);
|
||||
template <class Rep, class Period>
|
||||
Server &set_websocket_ping_interval(
|
||||
const std::chrono::duration<Rep, Period> &duration);
|
||||
|
||||
bool bind_to_port(const std::string &host, int port, int socket_flags = 0);
|
||||
int bind_to_any_port(const std::string &host, int socket_flags = 0);
|
||||
bool listen_after_bind();
|
||||
@@ -1700,6 +1705,8 @@ protected:
|
||||
time_t idle_interval_sec_ = CPPHTTPLIB_IDLE_INTERVAL_SECOND;
|
||||
time_t idle_interval_usec_ = CPPHTTPLIB_IDLE_INTERVAL_USECOND;
|
||||
size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH;
|
||||
time_t websocket_ping_interval_sec_ =
|
||||
CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND;
|
||||
|
||||
private:
|
||||
using Handlers =
|
||||
@@ -1769,6 +1776,7 @@ private:
|
||||
struct MountPointEntry {
|
||||
std::string mount_point;
|
||||
std::string base_dir;
|
||||
std::string resolved_base_dir;
|
||||
Headers headers;
|
||||
};
|
||||
std::vector<MountPointEntry> base_dirs_;
|
||||
@@ -2186,6 +2194,10 @@ protected:
|
||||
|
||||
virtual bool create_and_connect_socket(Socket &socket, Error &error);
|
||||
virtual bool ensure_socket_connection(Socket &socket, Error &error);
|
||||
virtual bool setup_proxy_connection(
|
||||
Socket &socket,
|
||||
std::chrono::time_point<std::chrono::steady_clock> start_time,
|
||||
Response &res, bool &success, Error &error);
|
||||
|
||||
// All of:
|
||||
// shutdown_ssl
|
||||
@@ -2712,6 +2724,10 @@ private:
|
||||
std::function<bool(Stream &strm)> callback) override;
|
||||
bool is_ssl() const override;
|
||||
|
||||
bool setup_proxy_connection(
|
||||
Socket &socket,
|
||||
std::chrono::time_point<std::chrono::steady_clock> start_time,
|
||||
Response &res, bool &success, Error &error) override;
|
||||
bool connect_with_proxy(
|
||||
Socket &sock,
|
||||
std::chrono::time_point<std::chrono::steady_clock> start_time,
|
||||
@@ -2911,6 +2927,8 @@ std::string encode_query_component(const std::string &component,
|
||||
std::string decode_query_component(const std::string &component,
|
||||
bool plus_as_space = true);
|
||||
|
||||
std::string sanitize_filename(const std::string &filename);
|
||||
|
||||
std::string append_query_params(const std::string &path, const Params ¶ms);
|
||||
|
||||
std::pair<std::string, std::string> make_range_header(const Ranges &ranges);
|
||||
@@ -3714,15 +3732,19 @@ private:
|
||||
friend class httplib::Server;
|
||||
friend class WebSocketClient;
|
||||
|
||||
WebSocket(Stream &strm, const Request &req, bool is_server)
|
||||
: strm_(strm), req_(req), is_server_(is_server) {
|
||||
WebSocket(
|
||||
Stream &strm, const Request &req, bool is_server,
|
||||
time_t ping_interval_sec = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND)
|
||||
: strm_(strm), req_(req), is_server_(is_server),
|
||||
ping_interval_sec_(ping_interval_sec) {
|
||||
start_heartbeat();
|
||||
}
|
||||
|
||||
WebSocket(std::unique_ptr<Stream> &&owned_strm, const Request &req,
|
||||
bool is_server)
|
||||
WebSocket(
|
||||
std::unique_ptr<Stream> &&owned_strm, const Request &req, bool is_server,
|
||||
time_t ping_interval_sec = CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND)
|
||||
: strm_(*owned_strm), owned_strm_(std::move(owned_strm)), req_(req),
|
||||
is_server_(is_server) {
|
||||
is_server_(is_server), ping_interval_sec_(ping_interval_sec) {
|
||||
start_heartbeat();
|
||||
}
|
||||
|
||||
@@ -3733,6 +3755,7 @@ private:
|
||||
std::unique_ptr<Stream> owned_strm_;
|
||||
Request req_;
|
||||
bool is_server_;
|
||||
time_t ping_interval_sec_;
|
||||
std::atomic<bool> closed_{false};
|
||||
std::mutex write_mutex_;
|
||||
std::thread ping_thread_;
|
||||
@@ -3761,6 +3784,7 @@ public:
|
||||
const std::string &subprotocol() const;
|
||||
void set_read_timeout(time_t sec, time_t usec = 0);
|
||||
void set_write_timeout(time_t sec, time_t usec = 0);
|
||||
void set_websocket_ping_interval(time_t sec);
|
||||
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
void set_ca_cert_path(const std::string &path);
|
||||
@@ -3784,6 +3808,8 @@ private:
|
||||
time_t read_timeout_usec_ = 0;
|
||||
time_t write_timeout_sec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND;
|
||||
time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND;
|
||||
time_t websocket_ping_interval_sec_ =
|
||||
CPPHTTPLIB_WEBSOCKET_PING_INTERVAL_SECOND;
|
||||
|
||||
#ifdef CPPHTTPLIB_SSL_ENABLED
|
||||
bool is_ssl_ = false;
|
||||
|
||||
Reference in New Issue
Block a user