mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 10:07:44 +02:00
Compare commits
18 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 37adc9c6ba | |||
| 16cc3c606e | |||
| 13628d8bdb | |||
| a96283adc4 | |||
| 4eba8d9451 | |||
| 61bde8e21f | |||
| e251e5ebbe | |||
| c4357dcc35 | |||
| e148380c7c | |||
| a2b0fe8d37 | |||
| 7f3a72a8ed | |||
| b9a37717b0 | |||
| f3a9674ae8 | |||
| 2c453c6c77 | |||
| 5d6bd842ea | |||
| fd3abe849e | |||
| 682e6658bb | |||
| 4574f2949e |
@@ -1,120 +0,0 @@
|
||||
name: Build on RISCV Linux Machine by Cloud-V
|
||||
on:
|
||||
pull_request:
|
||||
workflow_dispatch:
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
debian-13-riscv64-native: # Bianbu 2.2
|
||||
runs-on: [self-hosted, RISCV64]
|
||||
|
||||
steps:
|
||||
- name: Install prerequisites
|
||||
run: |
|
||||
sudo apt-get update || true
|
||||
sudo apt-get install -y libatomic1
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup Riscv
|
||||
run: |
|
||||
sudo apt-get update || true
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gcc-14-riscv64-linux-gnu \
|
||||
g++-14-riscv64-linux-gnu \
|
||||
ccache \
|
||||
cmake
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
mkdir -p $HOME/.ccache
|
||||
ccache -M 5G -d $HOME/.ccache
|
||||
export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
|
||||
export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
|
||||
echo "$GITHUB_WORKSPACE"
|
||||
echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
|
||||
echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
|
||||
echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
|
||||
echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
# debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
|
||||
# runs-on: [self-hosted, RISCV64]
|
||||
|
||||
# steps:
|
||||
# - name: Install prerequisites
|
||||
# run: |
|
||||
# sudo apt-get update || true
|
||||
# sudo apt-get install -y libatomic1
|
||||
# - uses: actions/checkout@v4
|
||||
# - name: Setup Riscv
|
||||
# run: |
|
||||
# sudo apt-get update || true
|
||||
# sudo apt-get install -y --no-install-recommends \
|
||||
# build-essential \
|
||||
# gcc-14-riscv64-linux-gnu \
|
||||
# g++-14-riscv64-linux-gnu \
|
||||
# ccache \
|
||||
# cmake
|
||||
# sudo apt-get upgrade binutils -y
|
||||
|
||||
# - name: Setup ccache
|
||||
# run: |
|
||||
# mkdir -p $HOME/.ccache
|
||||
# ccache -M 5G -d $HOME/.ccache
|
||||
# export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
|
||||
# export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
|
||||
# echo "$GITHUB_WORKSPACE"
|
||||
# echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
|
||||
# echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
|
||||
# echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
|
||||
# echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
|
||||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build \
|
||||
# -DLLAMA_CURL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
# -DLLAMA_BUILD_EXAMPLES=ON \
|
||||
# -DLLAMA_BUILD_TOOLS=ON \
|
||||
# -DLLAMA_BUILD_TESTS=OFF \
|
||||
# -DCMAKE_SYSTEM_NAME=Linux \
|
||||
# -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
# -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
# -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||
# -DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
# -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
|
||||
# -DGGML_RVV=ON \
|
||||
# -DGGML_RV_ZFH=ON \
|
||||
# -DGGML_RV_ZICBOP=ON \
|
||||
# -DGGML_CPU_RISCV64_SPACEMIT=ON \
|
||||
# -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
|
||||
|
||||
# cmake --build build --config Release -j $(nproc)
|
||||
@@ -1642,6 +1642,337 @@ jobs:
|
||||
run: |
|
||||
GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ubuntu-cpu-cmake-riscv64-native:
|
||||
runs-on: RISCV64
|
||||
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
|
||||
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
|
||||
|
||||
# Install Rust stable version
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Check environment
|
||||
run: |
|
||||
uname -a
|
||||
gcc --version
|
||||
g++ --version
|
||||
ldd --version
|
||||
cmake --version
|
||||
rustc --version
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
# Set unique cache directory for this job
|
||||
export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
|
||||
mkdir -p "$CCACHE_DIR"
|
||||
|
||||
# Configure ccache for optimal performance
|
||||
ccache --set-config=max_size=5G
|
||||
ccache --set-config=compression=true
|
||||
ccache --set-config=compression_level=6
|
||||
ccache --set-config=cache_dir="$CCACHE_DIR"
|
||||
|
||||
# Enable more aggressive caching
|
||||
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
|
||||
ccache --set-config=hash_dir=false
|
||||
|
||||
# Export for subsequent steps
|
||||
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
|
||||
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=ON \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-cmake-sanitizer-riscv64-native:
|
||||
runs-on: RISCV64
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
|
||||
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
|
||||
|
||||
# Install Rust stable version
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
|
||||
- name: GCC version check
|
||||
run: |
|
||||
gcc --version
|
||||
g++ --version
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
# Unique cache directory per matrix combination
|
||||
export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
|
||||
mkdir -p "$CCACHE_DIR"
|
||||
|
||||
# Configure ccache
|
||||
ccache --set-config=max_size=5G
|
||||
ccache --set-config=compression=true
|
||||
ccache --set-config=compression_level=6
|
||||
ccache --set-config=cache_dir="$CCACHE_DIR"
|
||||
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
|
||||
ccache --set-config=hash_dir=false
|
||||
|
||||
# Export for subsequent steps
|
||||
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
|
||||
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
|
||||
ubuntu-llguidance-riscv64-native:
|
||||
runs-on: RISCV64
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
|
||||
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
|
||||
|
||||
# Install Rust stable version
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
|
||||
- name: GCC version check
|
||||
run: |
|
||||
gcc --version
|
||||
g++ --version
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
export CCACHE_DIR="$HOME/.ccache/llguidance-riscv64"
|
||||
mkdir -p "$CCACHE_DIR"
|
||||
|
||||
ccache --set-config=max_size=5G
|
||||
ccache --set-config=compression=true
|
||||
ccache --set-config=compression_level=6
|
||||
ccache --set-config=cache_dir="$CCACHE_DIR"
|
||||
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
|
||||
ccache --set-config=hash_dir=false
|
||||
|
||||
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
|
||||
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DLLAMA_LLGUIDANCE=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
|
||||
ubuntu-cmake-rpc-riscv64-native:
|
||||
runs-on: RISCV64
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
|
||||
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
|
||||
|
||||
# Install Rust stable version
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
|
||||
- name: GCC version check
|
||||
run: |
|
||||
gcc --version
|
||||
g++ --version
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
export CCACHE_DIR="$HOME/.ccache/rpc-riscv64"
|
||||
mkdir -p "$CCACHE_DIR"
|
||||
|
||||
ccache --set-config=max_size=5G
|
||||
ccache --set-config=compression=true
|
||||
ccache --set-config=compression_level=6
|
||||
ccache --set-config=cache_dir="$CCACHE_DIR"
|
||||
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
|
||||
ccache --set-config=hash_dir=false
|
||||
|
||||
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
|
||||
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_OPENSSL=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=ON \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||
-DGGML_RPC=ON
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose
|
||||
|
||||
ggml-ci-arm64-graviton4-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ jobs:
|
||||
update:
|
||||
name: Update Winget Package
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ github.repository.owner.login == 'ggml-org' }}
|
||||
|
||||
steps:
|
||||
- name: Install cargo binstall
|
||||
|
||||
+2
-3
@@ -7,7 +7,7 @@
|
||||
/ci/ @ggerganov
|
||||
/cmake/ @ggerganov
|
||||
/common/CMakeLists.txt @ggerganov
|
||||
/common/arg.* @ggerganov @ericcurtin
|
||||
/common/arg.* @ggerganov
|
||||
/common/base64.hpp.* @ggerganov
|
||||
/common/build-info.* @ggerganov
|
||||
/common/common.* @ggerganov
|
||||
@@ -87,8 +87,7 @@
|
||||
/tools/perplexity/ @ggerganov
|
||||
/tools/quantize/ @ggerganov
|
||||
/tools/rpc/ @rgerganov
|
||||
/tools/run/ @ericcurtin
|
||||
/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
|
||||
/tools/server/* @ngxson @ggerganov # no subdir
|
||||
/tools/server/webui/ @allozaur
|
||||
/tools/tokenize/ @ggerganov
|
||||
/tools/tts/ @ggerganov
|
||||
|
||||
@@ -45,7 +45,7 @@ sd=`dirname $0`
|
||||
cd $sd/../
|
||||
SRC=`pwd`
|
||||
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
|
||||
+18
-1
@@ -1226,7 +1226,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.warmup = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
|
||||
add_opt(common_arg(
|
||||
{"--spm-infill"},
|
||||
string_format(
|
||||
@@ -2488,12 +2488,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"path to save slot kv cache (default: disabled)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.slot_save_path = value;
|
||||
if (!fs_is_directory(params.slot_save_path)) {
|
||||
throw std::invalid_argument("not a directory: " + value);
|
||||
}
|
||||
// if doesn't end with DIRECTORY_SEPARATOR, add it
|
||||
if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
|
||||
params.slot_save_path += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--media-path"}, "PATH",
|
||||
"directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.media_path = value;
|
||||
if (!fs_is_directory(params.media_path)) {
|
||||
throw std::invalid_argument("not a directory: " + value);
|
||||
}
|
||||
// if doesn't end with DIRECTORY_SEPARATOR, add it
|
||||
if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
|
||||
params.media_path += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--models-dir"}, "PATH",
|
||||
"directory containing models for the router server (default: disabled)",
|
||||
|
||||
+16
-16
@@ -163,7 +163,7 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
|
||||
if (tool_choice == "required") {
|
||||
return COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
}
|
||||
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
|
||||
throw std::invalid_argument("Invalid tool_choice: " + tool_choice);
|
||||
}
|
||||
|
||||
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
|
||||
@@ -186,17 +186,17 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
try {
|
||||
|
||||
if (!messages.is_array()) {
|
||||
throw std::runtime_error("Expected 'messages' to be an array, got " + messages.dump());
|
||||
throw std::invalid_argument("Expected 'messages' to be an array, got " + messages.dump());
|
||||
}
|
||||
|
||||
for (const auto & message : messages) {
|
||||
if (!message.is_object()) {
|
||||
throw std::runtime_error("Expected 'message' to be an object, got " + message.dump());
|
||||
throw std::invalid_argument("Expected 'message' to be an object, got " + message.dump());
|
||||
}
|
||||
|
||||
common_chat_msg msg;
|
||||
if (!message.contains("role")) {
|
||||
throw std::runtime_error("Missing 'role' in message: " + message.dump());
|
||||
throw std::invalid_argument("Missing 'role' in message: " + message.dump());
|
||||
}
|
||||
msg.role = message.at("role");
|
||||
|
||||
@@ -209,11 +209,11 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
} else if (content.is_array()) {
|
||||
for (const auto & part : content) {
|
||||
if (!part.contains("type")) {
|
||||
throw std::runtime_error("Missing content part type: " + part.dump());
|
||||
throw std::invalid_argument("Missing content part type: " + part.dump());
|
||||
}
|
||||
const auto & type = part.at("type");
|
||||
if (type != "text") {
|
||||
throw std::runtime_error("Unsupported content part type: " + type.dump());
|
||||
throw std::invalid_argument("Unsupported content part type: " + type.dump());
|
||||
}
|
||||
common_chat_msg_content_part msg_part;
|
||||
msg_part.type = type;
|
||||
@@ -221,25 +221,25 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
msg.content_parts.push_back(msg_part);
|
||||
}
|
||||
} else if (!content.is_null()) {
|
||||
throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
|
||||
throw std::invalid_argument("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
|
||||
}
|
||||
}
|
||||
if (has_tool_calls) {
|
||||
for (const auto & tool_call : message.at("tool_calls")) {
|
||||
common_chat_tool_call tc;
|
||||
if (!tool_call.contains("type")) {
|
||||
throw std::runtime_error("Missing tool call type: " + tool_call.dump());
|
||||
throw std::invalid_argument("Missing tool call type: " + tool_call.dump());
|
||||
}
|
||||
const auto & type = tool_call.at("type");
|
||||
if (type != "function") {
|
||||
throw std::runtime_error("Unsupported tool call type: " + tool_call.dump());
|
||||
throw std::invalid_argument("Unsupported tool call type: " + tool_call.dump());
|
||||
}
|
||||
if (!tool_call.contains("function")) {
|
||||
throw std::runtime_error("Missing tool call function: " + tool_call.dump());
|
||||
throw std::invalid_argument("Missing tool call function: " + tool_call.dump());
|
||||
}
|
||||
const auto & fc = tool_call.at("function");
|
||||
if (!fc.contains("name")) {
|
||||
throw std::runtime_error("Missing tool call name: " + tool_call.dump());
|
||||
throw std::invalid_argument("Missing tool call name: " + tool_call.dump());
|
||||
}
|
||||
tc.name = fc.at("name");
|
||||
tc.arguments = fc.at("arguments");
|
||||
@@ -250,7 +250,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
}
|
||||
}
|
||||
if (!has_content && !has_tool_calls) {
|
||||
throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
|
||||
throw std::invalid_argument("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
|
||||
}
|
||||
if (message.contains("reasoning_content")) {
|
||||
msg.reasoning_content = message.at("reasoning_content");
|
||||
@@ -353,18 +353,18 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
|
||||
try {
|
||||
if (!tools.is_null()) {
|
||||
if (!tools.is_array()) {
|
||||
throw std::runtime_error("Expected 'tools' to be an array, got " + tools.dump());
|
||||
throw std::invalid_argument("Expected 'tools' to be an array, got " + tools.dump());
|
||||
}
|
||||
for (const auto & tool : tools) {
|
||||
if (!tool.contains("type")) {
|
||||
throw std::runtime_error("Missing tool type: " + tool.dump());
|
||||
throw std::invalid_argument("Missing tool type: " + tool.dump());
|
||||
}
|
||||
const auto & type = tool.at("type");
|
||||
if (!type.is_string() || type != "function") {
|
||||
throw std::runtime_error("Unsupported tool type: " + tool.dump());
|
||||
throw std::invalid_argument("Unsupported tool type: " + tool.dump());
|
||||
}
|
||||
if (!tool.contains("function")) {
|
||||
throw std::runtime_error("Missing tool function: " + tool.dump());
|
||||
throw std::invalid_argument("Missing tool function: " + tool.dump());
|
||||
}
|
||||
|
||||
const auto & function = tool.at("function");
|
||||
|
||||
+11
-2
@@ -694,7 +694,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
||||
|
||||
// Validate if a filename is safe to use
|
||||
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
|
||||
bool fs_validate_filename(const std::string & filename) {
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
||||
if (!filename.length()) {
|
||||
// Empty filename invalid
|
||||
return false;
|
||||
@@ -754,10 +754,14 @@ bool fs_validate_filename(const std::string & filename) {
|
||||
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|
||||
|| c == 0xFFFD // Replacement Character (UTF-8)
|
||||
|| c == 0xFEFF // Byte Order Mark (BOM)
|
||||
|| c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
|
||||
|| c == ':' || c == '*' // Illegal characters
|
||||
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
|
||||
return false;
|
||||
}
|
||||
if (!allow_subdirs && (c == '/' || c == '\\')) {
|
||||
// Subdirectories not allowed, reject path separators
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
|
||||
@@ -859,6 +863,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
bool fs_is_directory(const std::string & path) {
|
||||
std::filesystem::path dir(path);
|
||||
return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
|
||||
}
|
||||
|
||||
std::string fs_get_cache_directory() {
|
||||
std::string cache_directory = "";
|
||||
auto ensure_trailing_slash = [](std::string p) {
|
||||
|
||||
+3
-1
@@ -485,6 +485,7 @@ struct common_params {
|
||||
bool log_json = false;
|
||||
|
||||
std::string slot_save_path;
|
||||
std::string media_path; // path to directory for loading media files
|
||||
|
||||
float slot_prompt_similarity = 0.1f;
|
||||
|
||||
@@ -635,8 +636,9 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
|
||||
// Filesystem utils
|
||||
//
|
||||
|
||||
bool fs_validate_filename(const std::string & filename);
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
|
||||
bool fs_create_directory_with_parents(const std::string & path);
|
||||
bool fs_is_directory(const std::string & path);
|
||||
|
||||
std::string fs_get_cache_directory();
|
||||
std::string fs_get_cache_file(const std::string & filename);
|
||||
|
||||
@@ -974,7 +974,7 @@ public:
|
||||
|
||||
void check_errors() {
|
||||
if (!_errors.empty()) {
|
||||
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
}
|
||||
if (!_warnings.empty()) {
|
||||
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
|
||||
|
||||
@@ -2842,6 +2842,10 @@ class Mistral3Model(LlamaModel):
|
||||
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
# TODO: probably not worth supporting quantized weight, as official BF16 is also available
|
||||
if name.endswith("weight_scale_inv"):
|
||||
raise ValueError("This is a quantized weight, please use BF16 weight instead")
|
||||
|
||||
name = name.replace("language_model.", "")
|
||||
if "multi_modal_projector" in name or "vision_tower" in name:
|
||||
return []
|
||||
|
||||
@@ -431,11 +431,22 @@ docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/ren
|
||||
|
||||
### For Linux users:
|
||||
|
||||
#### Using the LunarG Vulkan SDK
|
||||
|
||||
First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.
|
||||
|
||||
#### Using system packages
|
||||
|
||||
On Debian / Ubuntu, you can install the required dependencies using:
|
||||
```sh
|
||||
sudo apt-get install libvulkan-dev glslc
|
||||
```
|
||||
|
||||
#### Common steps
|
||||
|
||||
Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
|
||||
```bash
|
||||
vulkaninfo
|
||||
|
||||
+47
-42
@@ -408,62 +408,67 @@ if (MSVC)
|
||||
/wd4996 # Disable POSIX deprecation warnings
|
||||
/wd4702 # Unreachable code warnings
|
||||
)
|
||||
function(disable_msvc_warnings target_name)
|
||||
set(MSVC_COMPILE_OPTIONS
|
||||
"$<$<COMPILE_LANGUAGE:C>:/utf-8>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:/utf-8>"
|
||||
)
|
||||
function(configure_msvc_target target_name)
|
||||
if(TARGET ${target_name})
|
||||
target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
|
||||
target_compile_options(${target_name} PRIVATE ${MSVC_COMPILE_OPTIONS})
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
disable_msvc_warnings(ggml-base)
|
||||
disable_msvc_warnings(ggml)
|
||||
disable_msvc_warnings(ggml-cpu)
|
||||
disable_msvc_warnings(ggml-cpu-x64)
|
||||
disable_msvc_warnings(ggml-cpu-sse42)
|
||||
disable_msvc_warnings(ggml-cpu-sandybridge)
|
||||
disable_msvc_warnings(ggml-cpu-haswell)
|
||||
disable_msvc_warnings(ggml-cpu-skylakex)
|
||||
disable_msvc_warnings(ggml-cpu-icelake)
|
||||
disable_msvc_warnings(ggml-cpu-alderlake)
|
||||
configure_msvc_target(ggml-base)
|
||||
configure_msvc_target(ggml)
|
||||
configure_msvc_target(ggml-cpu)
|
||||
configure_msvc_target(ggml-cpu-x64)
|
||||
configure_msvc_target(ggml-cpu-sse42)
|
||||
configure_msvc_target(ggml-cpu-sandybridge)
|
||||
configure_msvc_target(ggml-cpu-haswell)
|
||||
configure_msvc_target(ggml-cpu-skylakex)
|
||||
configure_msvc_target(ggml-cpu-icelake)
|
||||
configure_msvc_target(ggml-cpu-alderlake)
|
||||
|
||||
if (GGML_BUILD_EXAMPLES)
|
||||
disable_msvc_warnings(common-ggml)
|
||||
disable_msvc_warnings(common)
|
||||
configure_msvc_target(common-ggml)
|
||||
configure_msvc_target(common)
|
||||
|
||||
disable_msvc_warnings(mnist-common)
|
||||
disable_msvc_warnings(mnist-eval)
|
||||
disable_msvc_warnings(mnist-train)
|
||||
configure_msvc_target(mnist-common)
|
||||
configure_msvc_target(mnist-eval)
|
||||
configure_msvc_target(mnist-train)
|
||||
|
||||
disable_msvc_warnings(gpt-2-ctx)
|
||||
disable_msvc_warnings(gpt-2-alloc)
|
||||
disable_msvc_warnings(gpt-2-backend)
|
||||
disable_msvc_warnings(gpt-2-sched)
|
||||
disable_msvc_warnings(gpt-2-quantize)
|
||||
disable_msvc_warnings(gpt-2-batched)
|
||||
configure_msvc_target(gpt-2-ctx)
|
||||
configure_msvc_target(gpt-2-alloc)
|
||||
configure_msvc_target(gpt-2-backend)
|
||||
configure_msvc_target(gpt-2-sched)
|
||||
configure_msvc_target(gpt-2-quantize)
|
||||
configure_msvc_target(gpt-2-batched)
|
||||
|
||||
disable_msvc_warnings(gpt-j)
|
||||
disable_msvc_warnings(gpt-j-quantize)
|
||||
configure_msvc_target(gpt-j)
|
||||
configure_msvc_target(gpt-j-quantize)
|
||||
|
||||
disable_msvc_warnings(magika)
|
||||
disable_msvc_warnings(yolov3-tiny)
|
||||
disable_msvc_warnings(sam)
|
||||
configure_msvc_target(magika)
|
||||
configure_msvc_target(yolov3-tiny)
|
||||
configure_msvc_target(sam)
|
||||
|
||||
disable_msvc_warnings(simple-ctx)
|
||||
disable_msvc_warnings(simple-backend)
|
||||
configure_msvc_target(simple-ctx)
|
||||
configure_msvc_target(simple-backend)
|
||||
endif()
|
||||
|
||||
if (GGML_BUILD_TESTS)
|
||||
disable_msvc_warnings(test-mul-mat)
|
||||
disable_msvc_warnings(test-arange)
|
||||
disable_msvc_warnings(test-backend-ops)
|
||||
disable_msvc_warnings(test-cont)
|
||||
disable_msvc_warnings(test-conv-transpose)
|
||||
disable_msvc_warnings(test-conv-transpose-1d)
|
||||
disable_msvc_warnings(test-conv1d)
|
||||
disable_msvc_warnings(test-conv2d)
|
||||
disable_msvc_warnings(test-conv2d-dw)
|
||||
disable_msvc_warnings(test-customop)
|
||||
disable_msvc_warnings(test-dup)
|
||||
disable_msvc_warnings(test-opt)
|
||||
disable_msvc_warnings(test-pool)
|
||||
configure_msvc_target(test-mul-mat)
|
||||
configure_msvc_target(test-arange)
|
||||
configure_msvc_target(test-backend-ops)
|
||||
configure_msvc_target(test-cont)
|
||||
configure_msvc_target(test-conv-transpose)
|
||||
configure_msvc_target(test-conv-transpose-1d)
|
||||
configure_msvc_target(test-conv1d)
|
||||
configure_msvc_target(test-conv2d)
|
||||
configure_msvc_target(test-conv2d-dw)
|
||||
configure_msvc_target(test-customop)
|
||||
configure_msvc_target(test-dup)
|
||||
configure_msvc_target(test-opt)
|
||||
configure_msvc_target(test-pool)
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
@@ -1240,10 +1240,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
||||
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
||||
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
|
||||
}
|
||||
if (sched->n_copies > 1) {
|
||||
ggml_set_input(tensor_copy);
|
||||
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
||||
}
|
||||
ggml_set_input(tensor_copy);
|
||||
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
||||
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
||||
SET_CAUSE(tensor_copy, "4.cpy");
|
||||
}
|
||||
|
||||
@@ -2564,6 +2564,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
return true;
|
||||
case GGML_OP_OUT_PROD:
|
||||
{
|
||||
#ifdef ASCEND_310P
|
||||
// Ger is not supported on 310p device
|
||||
return false;
|
||||
#endif
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
|
||||
@@ -683,22 +683,14 @@ bool ggml_is_numa(void) {
|
||||
}
|
||||
|
||||
#if defined(__ARM_ARCH)
|
||||
|
||||
#if defined(__linux__) && defined(__aarch64__)
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
static void ggml_init_arm_arch_features(void) {
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
|
||||
#if defined(__linux__)
|
||||
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
|
||||
#else
|
||||
// TODO: add support of SVE for non-linux systems
|
||||
#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
|
||||
#endif
|
||||
#endif
|
||||
#include <arm_sve.h>
|
||||
static void ggml_init_arm_arch_features(void) {
|
||||
ggml_arm_arch_features.sve_cnt = svcntb();
|
||||
}
|
||||
|
||||
#else
|
||||
static void ggml_init_arm_arch_features(void) {}
|
||||
#endif
|
||||
#endif // __ARM_ARCH
|
||||
|
||||
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
|
||||
|
||||
@@ -1227,6 +1227,7 @@ struct vk_op_topk_push_constants {
|
||||
uint32_t orig_ncols;
|
||||
uint32_t ncols_input;
|
||||
uint32_t ncols_output;
|
||||
uint32_t k;
|
||||
uint32_t nrows;
|
||||
uint32_t first_pass;
|
||||
uint32_t last_pass;
|
||||
@@ -1673,6 +1674,14 @@ class vk_perf_logger {
|
||||
timings[name.str()].push_back(time);
|
||||
return;
|
||||
}
|
||||
if (node->op == GGML_OP_TOP_K) {
|
||||
std::stringstream name;
|
||||
name << ggml_op_name(node->op) <<
|
||||
" K=" << node->ne[0] <<
|
||||
" (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
|
||||
timings[name.str()].push_back(time);
|
||||
return;
|
||||
}
|
||||
timings[ggml_op_name(node->op)].push_back(time);
|
||||
}
|
||||
private:
|
||||
@@ -10345,17 +10354,8 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
|
||||
uint32_t nrows = ggml_nrows(src0);
|
||||
uint32_t k = dst->ne[0];
|
||||
|
||||
vk_op_topk_push_constants pc { ncols, ncols, k, nrows, 0, 0 };
|
||||
vk_op_topk_push_constants pc { ncols, ncols, ncols, k, nrows, 0, 0 };
|
||||
|
||||
// Reserve space for ivec2 per element, double buffered
|
||||
const size_t dbl_buf_size = size_t{ncols} * nrows * 2 * sizeof(int);
|
||||
const size_t x_sz = dbl_buf_size * 2;
|
||||
uint32_t dbl_buf_index = 0;
|
||||
|
||||
if (ctx->prealloc_size_x < x_sz) {
|
||||
ctx->prealloc_size_x = x_sz;
|
||||
ggml_vk_preallocate_buffers(ctx, subctx);
|
||||
}
|
||||
if (ctx->prealloc_x_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
@@ -10370,8 +10370,9 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
|
||||
// largest elements. Repeat until we have the top K elements.
|
||||
// Need to do at least one iteration to write out the results.
|
||||
bool done_one_iter = false;
|
||||
uint32_t dbl_buf_index = 0;
|
||||
size_t dbl_buf_size;
|
||||
while (num_elements > k || !done_one_iter) {
|
||||
done_one_iter = true;
|
||||
|
||||
// Prefer going as small as num_topk_pipelines - 3 for perf reasons.
|
||||
// But if K is larger, then we need a larger workgroup
|
||||
@@ -10411,6 +10412,21 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
|
||||
// Number of elements remaining after this pass
|
||||
uint32_t num_dst_elements = (num_elements / pipeline->wg_denoms[0]) * k + std::min(k, num_elements % pipeline->wg_denoms[0]);
|
||||
|
||||
pc2.ncols_output = num_dst_elements;
|
||||
|
||||
if (!done_one_iter) {
|
||||
// Reserve space for ivec2 per element, double buffered
|
||||
// K per workgroup per row
|
||||
dbl_buf_size = num_dst_elements * nrows * 2 * sizeof(int);
|
||||
dbl_buf_size = ROUNDUP_POW2(dbl_buf_size, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
||||
const size_t x_sz = dbl_buf_size * 2;
|
||||
|
||||
if (ctx->prealloc_size_x < x_sz) {
|
||||
ctx->prealloc_size_x = x_sz;
|
||||
ggml_vk_preallocate_buffers(ctx, subctx);
|
||||
}
|
||||
}
|
||||
|
||||
vk_subbuffer src_buf;
|
||||
vk_subbuffer dst_buf;
|
||||
|
||||
@@ -10436,6 +10452,7 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
|
||||
if (num_elements > k) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
done_one_iter = true;
|
||||
}
|
||||
ctx->prealloc_x_need_sync = true;
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ layout (push_constant) uniform parameter {
|
||||
uint orig_ncols;
|
||||
uint ncols_input;
|
||||
uint ncols_output;
|
||||
uint k;
|
||||
uint nrows;
|
||||
uint first_pass;
|
||||
uint last_pass;
|
||||
@@ -36,7 +37,7 @@ void topk(bool needs_bounds_check, const uint row) {
|
||||
const uint row_offset = row * p.ncols_input;
|
||||
dst_row[col] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
|
||||
} else {
|
||||
const uint row_offset = row * p.orig_ncols;
|
||||
const uint row_offset = row * p.ncols_input;
|
||||
dst_row[col] = data_s[row_offset + gl_GlobalInvocationID.x];
|
||||
}
|
||||
} else {
|
||||
@@ -44,7 +45,7 @@ void topk(bool needs_bounds_check, const uint row) {
|
||||
}
|
||||
barrier();
|
||||
|
||||
if (p.ncols_output == 1) {
|
||||
if (p.k == 1) {
|
||||
// Fast path for single output - just do a max reduction
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
|
||||
if (col < s) {
|
||||
@@ -84,13 +85,17 @@ void topk(bool needs_bounds_check, const uint row) {
|
||||
}
|
||||
}
|
||||
|
||||
if (col < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
|
||||
if (col < p.k) {
|
||||
if (p.last_pass != 0) {
|
||||
const uint row_offset = row * p.ncols_output;
|
||||
data_d[row_offset + col] = dst_row[col].x;
|
||||
if (gl_GlobalInvocationID.x < p.ncols_input) {
|
||||
const uint row_offset = row * p.k;
|
||||
data_d[row_offset + col] = dst_row[col].x;
|
||||
}
|
||||
} else {
|
||||
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
|
||||
data_t[row_offset + col] = dst_row[col];
|
||||
if (gl_WorkGroupID.x * p.k + col < p.ncols_output) {
|
||||
const uint row_offset = row * p.ncols_output + gl_WorkGroupID.x * p.k;
|
||||
data_t[row_offset + col] = dst_row[col];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ layout (push_constant) uniform parameter {
|
||||
uint orig_ncols;
|
||||
uint ncols_input;
|
||||
uint ncols_output;
|
||||
uint k;
|
||||
uint nrows;
|
||||
uint first_pass;
|
||||
uint last_pass;
|
||||
@@ -60,7 +61,7 @@ void topk(const uint row) {
|
||||
const uint row_offset = row * p.ncols_input;
|
||||
dst_row[tid] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
|
||||
} else {
|
||||
const uint row_offset = row * p.orig_ncols;
|
||||
const uint row_offset = row * p.ncols_input;
|
||||
dst_row[tid] = data_s[row_offset + gl_GlobalInvocationID.x];
|
||||
}
|
||||
} else {
|
||||
@@ -68,7 +69,7 @@ void topk(const uint row) {
|
||||
}
|
||||
barrier();
|
||||
|
||||
if (p.ncols_output == 1) {
|
||||
if (p.k == 1) {
|
||||
// Fast path for single output - just do a max reduction
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
|
||||
if (tid < s) {
|
||||
@@ -98,7 +99,7 @@ void topk(const uint row) {
|
||||
uint range_max = 0xFF800000;
|
||||
// How many are above the current range, and how many we need to find.
|
||||
uint total = 0;
|
||||
uint limit = min(p.ncols_output, p.ncols_input - gl_WorkGroupID.x * BLOCK_SIZE);
|
||||
uint limit = min(p.k, p.ncols_input - gl_WorkGroupID.x * BLOCK_SIZE);
|
||||
|
||||
while (mask != 0) {
|
||||
barrier();
|
||||
@@ -139,7 +140,7 @@ void topk(const uint row) {
|
||||
range_max = range_min + ((min_idx + 1) << shift);
|
||||
range_min = range_min + (min_idx << shift);
|
||||
|
||||
if (total == p.ncols_output) {
|
||||
if (total == p.k) {
|
||||
break;
|
||||
}
|
||||
total -= counts[min_idx];
|
||||
@@ -179,13 +180,17 @@ void topk(const uint row) {
|
||||
barrier();
|
||||
}
|
||||
|
||||
if (tid < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
|
||||
if (tid < p.k) {
|
||||
if (p.last_pass != 0) {
|
||||
const uint row_offset = row * p.ncols_output;
|
||||
data_d[row_offset + tid] = dst_row[tid].x;
|
||||
if (gl_GlobalInvocationID.x < p.ncols_input) {
|
||||
const uint row_offset = row * p.k;
|
||||
data_d[row_offset + tid] = dst_row[tid].x;
|
||||
}
|
||||
} else {
|
||||
const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
|
||||
data_t[row_offset + tid] = dst_row[tid];
|
||||
if (gl_WorkGroupID.x * p.k + tid < p.ncols_output) {
|
||||
const uint row_offset = row * p.ncols_output + gl_WorkGroupID.x * p.k;
|
||||
data_t[row_offset + tid] = dst_row[tid];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+1
-1
@@ -1169,7 +1169,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
|
||||
struct gguf_writer_base {
|
||||
size_t written_bytes {0u};
|
||||
|
||||
~gguf_writer_base(void) {}
|
||||
~gguf_writer_base(void) = default;
|
||||
|
||||
// we bet on devirtualization
|
||||
virtual void write(int8_t val) = 0;
|
||||
|
||||
+1
-1
@@ -37,7 +37,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
|
||||
template <typename T>
|
||||
struct no_init {
|
||||
T value;
|
||||
no_init() { /* do nothing */ }
|
||||
no_init() = default;
|
||||
};
|
||||
|
||||
struct time_meas {
|
||||
|
||||
+1
-1
@@ -485,7 +485,7 @@ struct llama_mlock::impl {
|
||||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
|
||||
suggest = false;
|
||||
}
|
||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
|
||||
if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
|
||||
suggest = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
+3
-3
@@ -423,8 +423,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
|
||||
}
|
||||
|
||||
struct llama_model::impl {
|
||||
impl() {}
|
||||
~impl() {}
|
||||
impl() = default;
|
||||
~impl() = default;
|
||||
|
||||
uint64_t n_elements = 0;
|
||||
|
||||
@@ -461,7 +461,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
|
||||
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
|
||||
}
|
||||
|
||||
llama_model::~llama_model() {}
|
||||
llama_model::~llama_model() = default;
|
||||
|
||||
void llama_model::load_stats(llama_model_loader & ml) {
|
||||
pimpl->n_elements = ml.n_elements;
|
||||
|
||||
+1
-2
@@ -3253,8 +3253,7 @@ void llama_vocab::impl::print_info() const {
|
||||
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
|
||||
}
|
||||
|
||||
llama_vocab::~llama_vocab() {
|
||||
}
|
||||
llama_vocab::~llama_vocab() = default;
|
||||
|
||||
void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pimpl->load(ml, kv);
|
||||
|
||||
@@ -1375,7 +1375,7 @@ int main() {
|
||||
try {
|
||||
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
|
||||
tc.verify_status(SUCCESS);
|
||||
} catch (const std::runtime_error & ex) {
|
||||
} catch (const std::invalid_argument & ex) {
|
||||
fprintf(stderr, "Error: %s\n", ex.what());
|
||||
tc.verify_status(FAILURE);
|
||||
}
|
||||
|
||||
+30
-19
@@ -428,6 +428,7 @@ struct clip_ctx {
|
||||
int max_nodes = 8192;
|
||||
ggml_backend_sched_ptr sched;
|
||||
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
|
||||
bool is_allocated = false;
|
||||
|
||||
// for debugging
|
||||
bool debug_graph = false;
|
||||
@@ -3305,12 +3306,30 @@ struct clip_model_loader {
|
||||
};
|
||||
|
||||
static void warmup(clip_ctx & ctx_clip) {
|
||||
// create a fake batch
|
||||
const auto & hparams = ctx_clip.model.hparams;
|
||||
clip_image_f32_batch batch;
|
||||
clip_image_f32_ptr img(clip_image_f32_init());
|
||||
if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
|
||||
img->nx = hparams.warmup_image_size;
|
||||
img->ny = hparams.warmup_image_size;
|
||||
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
|
||||
} else {
|
||||
img->nx = hparams.warmup_audio_size;
|
||||
img->ny = hparams.n_mel_bins;
|
||||
LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
|
||||
}
|
||||
batch.entries.push_back(std::move(img));
|
||||
warmup(ctx_clip, batch);
|
||||
}
|
||||
|
||||
static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
|
||||
support_info_graph info;
|
||||
|
||||
if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
|
||||
// try to enable flash attention to see if it's supported
|
||||
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
|
||||
info = alloc_compute_meta(ctx_clip);
|
||||
info = alloc_compute_meta(ctx_clip, batch);
|
||||
if (!info.fattn && info.fattn_op) {
|
||||
auto op = info.fattn_op;
|
||||
LOG_WRN("%s: *****************************************************************\n", __func__);
|
||||
@@ -3329,15 +3348,17 @@ struct clip_model_loader {
|
||||
LOG_WRN("%s: please report this on github as an issue\n", __func__);
|
||||
LOG_WRN("%s: *****************************************************************\n", __func__);
|
||||
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
|
||||
alloc_compute_meta(ctx_clip);
|
||||
alloc_compute_meta(ctx_clip, batch);
|
||||
}
|
||||
} else {
|
||||
info = alloc_compute_meta(ctx_clip);
|
||||
info = alloc_compute_meta(ctx_clip, batch);
|
||||
if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
|
||||
LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
ctx_clip.is_allocated = true; // mark buffers as allocated
|
||||
|
||||
LOG_INF("%s: flash attention is %s\n", __func__,
|
||||
(ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
|
||||
|
||||
@@ -3369,24 +3390,9 @@ struct clip_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip) {
|
||||
const auto & hparams = ctx_clip.model.hparams;
|
||||
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
|
||||
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
|
||||
// create a fake batch
|
||||
clip_image_f32_batch batch;
|
||||
clip_image_f32_ptr img(clip_image_f32_init());
|
||||
if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
|
||||
img->nx = hparams.warmup_image_size;
|
||||
img->ny = hparams.warmup_image_size;
|
||||
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
|
||||
} else {
|
||||
img->nx = hparams.warmup_audio_size;
|
||||
img->ny = hparams.n_mel_bins;
|
||||
LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
|
||||
}
|
||||
batch.entries.push_back(std::move(img));
|
||||
|
||||
ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
|
||||
ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
|
||||
|
||||
@@ -4630,6 +4636,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
return false; // only support batch size of 1
|
||||
}
|
||||
|
||||
// if buffers are not allocated, we need to do a warmup run to allocate them
|
||||
if (!ctx->is_allocated) {
|
||||
clip_model_loader::warmup(*ctx, *imgs_c_ptr);
|
||||
}
|
||||
|
||||
// build the inference graph
|
||||
ctx->debug_print_tensors.clear();
|
||||
ggml_backend_sched_reset(ctx->sched.get());
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
json format_error_response(const std::string & message, const enum error_type type) {
|
||||
std::string type_str;
|
||||
@@ -774,6 +775,65 @@ json oaicompat_completion_params_parse(const json & body) {
|
||||
return llama_params;
|
||||
}
|
||||
|
||||
// media_path always end with '/', see arg.cpp
|
||||
static void handle_media(
|
||||
std::vector<raw_buffer> & out_files,
|
||||
json & media_obj,
|
||||
const std::string & media_path) {
|
||||
std::string url = json_value(media_obj, "url", std::string());
|
||||
if (string_starts_with(url, "http")) {
|
||||
// download remote image
|
||||
// TODO @ngxson : maybe make these params configurable
|
||||
common_remote_params params;
|
||||
params.headers.push_back("User-Agent: llama.cpp/" + build_info);
|
||||
params.max_size = 1024 * 1024 * 10; // 10MB
|
||||
params.timeout = 10; // seconds
|
||||
SRV_INF("downloading image from '%s'\n", url.c_str());
|
||||
auto res = common_remote_get_content(url, params);
|
||||
if (200 <= res.first && res.first < 300) {
|
||||
SRV_INF("downloaded %ld bytes\n", res.second.size());
|
||||
raw_buffer data;
|
||||
data.insert(data.end(), res.second.begin(), res.second.end());
|
||||
out_files.push_back(data);
|
||||
} else {
|
||||
throw std::runtime_error("Failed to download image");
|
||||
}
|
||||
|
||||
} else if (string_starts_with(url, "file://")) {
|
||||
if (media_path.empty()) {
|
||||
throw std::invalid_argument("file:// URLs are not allowed unless --media-path is specified");
|
||||
}
|
||||
// load local image file
|
||||
std::string file_path = url.substr(7); // remove "file://"
|
||||
raw_buffer data;
|
||||
if (!fs_validate_filename(file_path, true)) {
|
||||
throw std::invalid_argument("file path is not allowed: " + file_path);
|
||||
}
|
||||
SRV_INF("loading image from local file '%s'\n", (media_path + file_path).c_str());
|
||||
std::ifstream file(media_path + file_path, std::ios::binary);
|
||||
if (!file) {
|
||||
throw std::invalid_argument("file does not exist or cannot be opened: " + file_path);
|
||||
}
|
||||
data.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
out_files.push_back(data);
|
||||
|
||||
} else {
|
||||
// try to decode base64 image
|
||||
std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
|
||||
if (parts.size() != 2) {
|
||||
throw std::runtime_error("Invalid url value");
|
||||
} else if (!string_starts_with(parts[0], "data:image/")) {
|
||||
throw std::runtime_error("Invalid url format: " + parts[0]);
|
||||
} else if (!string_ends_with(parts[0], "base64")) {
|
||||
throw std::runtime_error("url must be base64 encoded");
|
||||
} else {
|
||||
auto base64_data = parts[1];
|
||||
auto decoded_data = base64_decode(base64_data);
|
||||
out_files.push_back(decoded_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// used by /chat/completions endpoint
|
||||
json oaicompat_chat_params_parse(
|
||||
json & body, /* openai api json semantics */
|
||||
@@ -819,26 +879,26 @@ json oaicompat_chat_params_parse(
|
||||
auto schema_wrapper = json_value(response_format, "json_schema", json::object());
|
||||
json_schema = json_value(schema_wrapper, "schema", json::object());
|
||||
} else if (!response_type.empty() && response_type != "text") {
|
||||
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
|
||||
throw std::invalid_argument("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
|
||||
}
|
||||
}
|
||||
|
||||
// get input files
|
||||
if (!body.contains("messages")) {
|
||||
throw std::runtime_error("'messages' is required");
|
||||
throw std::invalid_argument("'messages' is required");
|
||||
}
|
||||
json & messages = body.at("messages");
|
||||
if (!messages.is_array()) {
|
||||
throw std::runtime_error("Expected 'messages' to be an array");
|
||||
throw std::invalid_argument("Expected 'messages' to be an array");
|
||||
}
|
||||
for (auto & msg : messages) {
|
||||
std::string role = json_value(msg, "role", std::string());
|
||||
if (role != "assistant" && !msg.contains("content")) {
|
||||
throw std::runtime_error("All non-assistant messages must contain 'content'");
|
||||
throw std::invalid_argument("All non-assistant messages must contain 'content'");
|
||||
}
|
||||
if (role == "assistant") {
|
||||
if (!msg.contains("content") && !msg.contains("tool_calls")) {
|
||||
throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
|
||||
throw std::invalid_argument("Assistant message must contain either 'content' or 'tool_calls'!");
|
||||
}
|
||||
if (!msg.contains("content")) {
|
||||
continue; // avoid errors with no content
|
||||
@@ -850,7 +910,7 @@ json oaicompat_chat_params_parse(
|
||||
}
|
||||
|
||||
if (!content.is_array()) {
|
||||
throw std::runtime_error("Expected 'content' to be a string or an array");
|
||||
throw std::invalid_argument("Expected 'content' to be a string or an array");
|
||||
}
|
||||
|
||||
for (auto & p : content) {
|
||||
@@ -860,41 +920,8 @@ json oaicompat_chat_params_parse(
|
||||
throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
|
||||
}
|
||||
|
||||
json image_url = json_value(p, "image_url", json::object());
|
||||
std::string url = json_value(image_url, "url", std::string());
|
||||
if (string_starts_with(url, "http")) {
|
||||
// download remote image
|
||||
// TODO @ngxson : maybe make these params configurable
|
||||
common_remote_params params;
|
||||
params.headers.push_back("User-Agent: llama.cpp/" + build_info);
|
||||
params.max_size = 1024 * 1024 * 10; // 10MB
|
||||
params.timeout = 10; // seconds
|
||||
SRV_INF("downloading image from '%s'\n", url.c_str());
|
||||
auto res = common_remote_get_content(url, params);
|
||||
if (200 <= res.first && res.first < 300) {
|
||||
SRV_INF("downloaded %ld bytes\n", res.second.size());
|
||||
raw_buffer data;
|
||||
data.insert(data.end(), res.second.begin(), res.second.end());
|
||||
out_files.push_back(data);
|
||||
} else {
|
||||
throw std::runtime_error("Failed to download image");
|
||||
}
|
||||
|
||||
} else {
|
||||
// try to decode base64 image
|
||||
std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
|
||||
if (parts.size() != 2) {
|
||||
throw std::runtime_error("Invalid image_url.url value");
|
||||
} else if (!string_starts_with(parts[0], "data:image/")) {
|
||||
throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
|
||||
} else if (!string_ends_with(parts[0], "base64")) {
|
||||
throw std::runtime_error("image_url.url must be base64 encoded");
|
||||
} else {
|
||||
auto base64_data = parts[1];
|
||||
auto decoded_data = base64_decode(base64_data);
|
||||
out_files.push_back(decoded_data);
|
||||
}
|
||||
}
|
||||
json image_url = json_value(p, "image_url", json::object());
|
||||
handle_media(out_files, image_url, opt.media_path);
|
||||
|
||||
// replace this chunk with a marker
|
||||
p["type"] = "text";
|
||||
@@ -911,18 +938,20 @@ json oaicompat_chat_params_parse(
|
||||
std::string format = json_value(input_audio, "format", std::string());
|
||||
// while we also support flac, we don't allow it here so we matches the OAI spec
|
||||
if (format != "wav" && format != "mp3") {
|
||||
throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
|
||||
throw std::invalid_argument("input_audio.format must be either 'wav' or 'mp3'");
|
||||
}
|
||||
auto decoded_data = base64_decode(data); // expected to be base64 encoded
|
||||
out_files.push_back(decoded_data);
|
||||
|
||||
// TODO: add audio_url support by reusing handle_media()
|
||||
|
||||
// replace this chunk with a marker
|
||||
p["type"] = "text";
|
||||
p["text"] = mtmd_default_marker();
|
||||
p.erase("input_audio");
|
||||
|
||||
} else if (type != "text") {
|
||||
throw std::runtime_error("unsupported content[].type");
|
||||
throw std::invalid_argument("unsupported content[].type");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -940,7 +969,7 @@ json oaicompat_chat_params_parse(
|
||||
inputs.enable_thinking = opt.enable_thinking;
|
||||
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
if (body.contains("grammar")) {
|
||||
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
|
||||
throw std::invalid_argument("Cannot use custom grammar constraints with tools.");
|
||||
}
|
||||
llama_params["parse_tool_calls"] = true;
|
||||
}
|
||||
@@ -959,7 +988,7 @@ json oaicompat_chat_params_parse(
|
||||
} else if (enable_thinking_kwarg == "false") {
|
||||
inputs.enable_thinking = false;
|
||||
} else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
|
||||
throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
|
||||
throw std::invalid_argument("invalid type for \"enable_thinking\" (expected boolean, got string)");
|
||||
}
|
||||
|
||||
// if the assistant message appears at the end of list, we do not add end-of-turn token
|
||||
@@ -972,14 +1001,14 @@ json oaicompat_chat_params_parse(
|
||||
|
||||
/* sanity check, max one assistant message at the end of the list */
|
||||
if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
|
||||
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
|
||||
throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list.");
|
||||
}
|
||||
|
||||
/* TODO: test this properly */
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
||||
|
||||
if ( inputs.enable_thinking ) {
|
||||
throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
|
||||
throw std::invalid_argument("Assistant response prefill is incompatible with enable_thinking.");
|
||||
}
|
||||
|
||||
inputs.add_generation_prompt = true;
|
||||
@@ -1020,18 +1049,18 @@ json oaicompat_chat_params_parse(
|
||||
// Handle "n" field
|
||||
int n_choices = json_value(body, "n", 1);
|
||||
if (n_choices != 1) {
|
||||
throw std::runtime_error("Only one completion choice is allowed");
|
||||
throw std::invalid_argument("Only one completion choice is allowed");
|
||||
}
|
||||
|
||||
// Handle "logprobs" field
|
||||
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
|
||||
if (json_value(body, "logprobs", false)) {
|
||||
if (has_tools && stream) {
|
||||
throw std::runtime_error("logprobs is not supported with tools + stream");
|
||||
throw std::invalid_argument("logprobs is not supported with tools + stream");
|
||||
}
|
||||
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
|
||||
} else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
|
||||
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
|
||||
throw std::invalid_argument("top_logprobs requires logprobs to be set to true");
|
||||
}
|
||||
|
||||
// Copy remaining properties to llama_params
|
||||
@@ -1263,7 +1292,11 @@ json convert_anthropic_to_oai(const json & body) {
|
||||
return oai_body;
|
||||
}
|
||||
|
||||
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) {
|
||||
json format_embeddings_response_oaicompat(
|
||||
const json & request,
|
||||
const std::string & model_name,
|
||||
const json & embeddings,
|
||||
bool use_base64) {
|
||||
json data = json::array();
|
||||
int32_t n_tokens = 0;
|
||||
int i = 0;
|
||||
@@ -1293,7 +1326,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
|
||||
}
|
||||
|
||||
json res = json {
|
||||
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||
{"model", json_value(request, "model", model_name)},
|
||||
{"object", "list"},
|
||||
{"usage", json {
|
||||
{"prompt_tokens", n_tokens},
|
||||
@@ -1307,6 +1340,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
|
||||
|
||||
json format_response_rerank(
|
||||
const json & request,
|
||||
const std::string & model_name,
|
||||
const json & ranks,
|
||||
bool is_tei_format,
|
||||
std::vector<std::string> & texts,
|
||||
@@ -1338,7 +1372,7 @@ json format_response_rerank(
|
||||
if (is_tei_format) return results;
|
||||
|
||||
json res = json{
|
||||
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||
{"model", json_value(request, "model", model_name)},
|
||||
{"object", "list"},
|
||||
{"usage", json{
|
||||
{"prompt_tokens", n_tokens},
|
||||
|
||||
@@ -13,8 +13,6 @@
|
||||
#include <vector>
|
||||
#include <cinttypes>
|
||||
|
||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
|
||||
|
||||
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
@@ -286,6 +284,7 @@ struct oaicompat_parser_options {
|
||||
bool allow_image;
|
||||
bool allow_audio;
|
||||
bool enable_thinking = true;
|
||||
std::string media_path;
|
||||
};
|
||||
|
||||
// used by /chat/completions endpoint
|
||||
@@ -298,11 +297,16 @@ json oaicompat_chat_params_parse(
|
||||
json convert_anthropic_to_oai(const json & body);
|
||||
|
||||
// TODO: move it to server-task.cpp
|
||||
json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
|
||||
json format_embeddings_response_oaicompat(
|
||||
const json & request,
|
||||
const std::string & model_name,
|
||||
const json & embeddings,
|
||||
bool use_base64 = false);
|
||||
|
||||
// TODO: move it to server-task.cpp
|
||||
json format_response_rerank(
|
||||
const json & request,
|
||||
const std::string & model_name,
|
||||
const json & ranks,
|
||||
bool is_tei_format,
|
||||
std::vector<std::string> & texts,
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <cinttypes>
|
||||
#include <memory>
|
||||
#include <unordered_set>
|
||||
#include <filesystem>
|
||||
|
||||
// fix problem with std::min and std::max
|
||||
#if defined(_WIN32)
|
||||
@@ -518,6 +519,8 @@ struct server_context_impl {
|
||||
// Necessary similarity of prompt for slot selection
|
||||
float slot_prompt_similarity = 0.0f;
|
||||
|
||||
std::string model_name; // name of the loaded model, to be used by API
|
||||
|
||||
common_chat_templates_ptr chat_templates;
|
||||
oaicompat_parser_options oai_parser_opt;
|
||||
|
||||
@@ -758,6 +761,18 @@ struct server_context_impl {
|
||||
}
|
||||
SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
|
||||
|
||||
if (!params_base.model_alias.empty()) {
|
||||
// user explicitly specified model name
|
||||
model_name = params_base.model_alias;
|
||||
} else if (!params_base.model.name.empty()) {
|
||||
// use model name in registry format (for models in cache)
|
||||
model_name = params_base.model.name;
|
||||
} else {
|
||||
// fallback: derive model name from file name
|
||||
auto model_path = std::filesystem::path(params_base.model.path);
|
||||
model_name = model_path.filename().string();
|
||||
}
|
||||
|
||||
// thinking is enabled if:
|
||||
// 1. It's not explicitly disabled (reasoning_budget == 0)
|
||||
// 2. The chat template supports it
|
||||
@@ -773,6 +788,7 @@ struct server_context_impl {
|
||||
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
|
||||
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
|
||||
/* enable_thinking */ enable_thinking,
|
||||
/* media_path */ params_base.media_path,
|
||||
};
|
||||
|
||||
// print sample chat example to make it clear which template is used
|
||||
@@ -2611,7 +2627,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
|
||||
// OAI-compat
|
||||
task.params.res_type = res_type;
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
// oaicompat_model is already populated by params_from_json_cmpl
|
||||
task.params.oaicompat_model = ctx_server.model_name;
|
||||
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
@@ -2939,7 +2955,7 @@ void server_routes::init_routes() {
|
||||
json data = {
|
||||
{ "default_generation_settings", default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||
{ "model_alias", ctx_server.params_base.model_alias },
|
||||
{ "model_alias", ctx_server.model_name },
|
||||
{ "model_path", ctx_server.params_base.model.path },
|
||||
{ "modalities", json {
|
||||
{"vision", ctx_server.oai_parser_opt.allow_image},
|
||||
@@ -3181,8 +3197,8 @@ void server_routes::init_routes() {
|
||||
json models = {
|
||||
{"models", {
|
||||
{
|
||||
{"name", params.model_alias.empty() ? params.model.path : params.model_alias},
|
||||
{"model", params.model_alias.empty() ? params.model.path : params.model_alias},
|
||||
{"name", ctx_server.model_name},
|
||||
{"model", ctx_server.model_name},
|
||||
{"modified_at", ""},
|
||||
{"size", ""},
|
||||
{"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
|
||||
@@ -3204,7 +3220,7 @@ void server_routes::init_routes() {
|
||||
{"object", "list"},
|
||||
{"data", {
|
||||
{
|
||||
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
|
||||
{"id", ctx_server.model_name},
|
||||
{"object", "model"},
|
||||
{"created", std::time(0)},
|
||||
{"owned_by", "llamacpp"},
|
||||
@@ -3351,6 +3367,7 @@ void server_routes::init_routes() {
|
||||
// write JSON response
|
||||
json root = format_response_rerank(
|
||||
body,
|
||||
ctx_server.model_name,
|
||||
responses,
|
||||
is_tei_format,
|
||||
documents,
|
||||
@@ -3613,7 +3630,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
|
||||
|
||||
// write JSON response
|
||||
json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD
|
||||
? format_embeddings_response_oaicompat(body, responses, use_base64)
|
||||
? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
|
||||
: json(responses);
|
||||
res->ok(root);
|
||||
return res;
|
||||
|
||||
@@ -24,8 +24,55 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__)
|
||||
// macOS: use _NSGetExecutablePath to get the executable path
|
||||
#include <mach-o/dyld.h>
|
||||
#include <limits.h>
|
||||
#endif
|
||||
|
||||
#define CMD_EXIT "exit"
|
||||
|
||||
static std::filesystem::path get_server_exec_path() {
|
||||
#if defined(_WIN32)
|
||||
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
|
||||
DWORD len = GetModuleFileNameW(nullptr, buf, _countof(buf));
|
||||
if (len == 0 || len >= _countof(buf)) {
|
||||
throw std::runtime_error("GetModuleFileNameW failed or path too long");
|
||||
}
|
||||
return std::filesystem::path(buf);
|
||||
#elif defined(__APPLE__) && defined(__MACH__)
|
||||
char small_path[PATH_MAX];
|
||||
uint32_t size = sizeof(small_path);
|
||||
|
||||
if (_NSGetExecutablePath(small_path, &size) == 0) {
|
||||
// resolve any symlinks to get absolute path
|
||||
try {
|
||||
return std::filesystem::canonical(std::filesystem::path(small_path));
|
||||
} catch (...) {
|
||||
return std::filesystem::path(small_path);
|
||||
}
|
||||
} else {
|
||||
// buffer was too small, allocate required size and call again
|
||||
std::vector<char> buf(size);
|
||||
if (_NSGetExecutablePath(buf.data(), &size) == 0) {
|
||||
try {
|
||||
return std::filesystem::canonical(std::filesystem::path(buf.data()));
|
||||
} catch (...) {
|
||||
return std::filesystem::path(buf.data());
|
||||
}
|
||||
}
|
||||
throw std::runtime_error("_NSGetExecutablePath failed after buffer resize");
|
||||
}
|
||||
#else
|
||||
char path[FILENAME_MAX];
|
||||
ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX);
|
||||
if (count <= 0) {
|
||||
throw std::runtime_error("failed to resolve /proc/self/exe");
|
||||
}
|
||||
return std::filesystem::path(std::string(path, count));
|
||||
#endif
|
||||
}
|
||||
|
||||
struct local_model {
|
||||
std::string name;
|
||||
std::string path;
|
||||
@@ -99,6 +146,14 @@ server_models::server_models(
|
||||
for (char ** env = envp; *env != nullptr; env++) {
|
||||
base_env.push_back(std::string(*env));
|
||||
}
|
||||
GGML_ASSERT(!base_args.empty());
|
||||
// set binary path
|
||||
try {
|
||||
base_args[0] = get_server_exec_path().string();
|
||||
} catch (const std::exception & e) {
|
||||
LOG_WRN("failed to get server executable path: %s\n", e.what());
|
||||
LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
|
||||
}
|
||||
// TODO: allow refreshing cached model list
|
||||
// add cached models
|
||||
auto cached_models = common_list_cached_models();
|
||||
@@ -587,26 +642,26 @@ static void res_ok(std::unique_ptr<server_http_res> & res, const json & response
|
||||
res->data = safe_json_to_str(response_data);
|
||||
}
|
||||
|
||||
static void res_error(std::unique_ptr<server_http_res> & res, const json & error_data) {
|
||||
static void res_err(std::unique_ptr<server_http_res> & res, const json & error_data) {
|
||||
res->status = json_value(error_data, "code", 500);
|
||||
res->data = safe_json_to_str({{ "error", error_data }});
|
||||
}
|
||||
|
||||
static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
|
||||
if (name.empty()) {
|
||||
res_error(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
|
||||
res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
|
||||
return false;
|
||||
}
|
||||
auto meta = models.get_meta(name);
|
||||
if (!meta.has_value()) {
|
||||
res_error(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
|
||||
res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
|
||||
return false;
|
||||
}
|
||||
if (models_autoload) {
|
||||
models.ensure_model_loaded(name);
|
||||
} else {
|
||||
if (meta->status != SERVER_MODEL_STATUS_LOADED) {
|
||||
res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||
res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -706,11 +761,11 @@ void server_models_routes::init_routes() {
|
||||
std::string name = json_value(body, "model", std::string());
|
||||
auto model = models.get_meta(name);
|
||||
if (!model.has_value()) {
|
||||
res_error(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
|
||||
res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
|
||||
return res;
|
||||
}
|
||||
if (model->status == SERVER_MODEL_STATUS_LOADED) {
|
||||
res_error(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
models.load(name, false);
|
||||
@@ -768,11 +823,11 @@ void server_models_routes::init_routes() {
|
||||
std::string name = json_value(body, "model", std::string());
|
||||
auto model = models.get_meta(name);
|
||||
if (!model.has_value()) {
|
||||
res_error(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
|
||||
res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
if (model->status != SERVER_MODEL_STATUS_LOADED) {
|
||||
res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||
res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
models.unload(name);
|
||||
|
||||
@@ -450,9 +450,6 @@ task_params server_task::params_from_json_cmpl(
|
||||
}
|
||||
}
|
||||
|
||||
std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
|
||||
params.oaicompat_model = json_value(data, "model", model_name);
|
||||
|
||||
return params;
|
||||
}
|
||||
|
||||
|
||||
@@ -34,18 +34,26 @@ static inline void signal_handler(int signal) {
|
||||
static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
|
||||
return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr {
|
||||
std::string message;
|
||||
error_type error;
|
||||
try {
|
||||
return func(req);
|
||||
} catch (const std::invalid_argument & e) {
|
||||
// treat invalid_argument as invalid request (400)
|
||||
error = ERROR_TYPE_INVALID_REQUEST;
|
||||
message = e.what();
|
||||
} catch (const std::exception & e) {
|
||||
// treat other exceptions as server error (500)
|
||||
error = ERROR_TYPE_SERVER;
|
||||
message = e.what();
|
||||
} catch (...) {
|
||||
error = ERROR_TYPE_SERVER;
|
||||
message = "unknown error";
|
||||
}
|
||||
|
||||
auto res = std::make_unique<server_http_res>();
|
||||
res->status = 500;
|
||||
try {
|
||||
json error_data = format_error_response(message, ERROR_TYPE_SERVER);
|
||||
json error_data = format_error_response(message, error);
|
||||
res->status = json_value(error_data, "code", 500);
|
||||
res->data = safe_json_to_str({{ "error", error_data }});
|
||||
SRV_WRN("got exception: %s\n", res->data.c_str());
|
||||
|
||||
@@ -41,7 +41,8 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
|
||||
assert res.status_code == 200
|
||||
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
|
||||
assert res.body["system_fingerprint"].startswith("b")
|
||||
assert res.body["model"] == model if model is not None else server.model_alias
|
||||
# we no longer reflect back the model name, see https://github.com/ggml-org/llama.cpp/pull/17668
|
||||
# assert res.body["model"] == model if model is not None else server.model_alias
|
||||
assert res.body["usage"]["prompt_tokens"] == n_prompt
|
||||
assert res.body["usage"]["completion_tokens"] == n_predicted
|
||||
choice = res.body["choices"][0]
|
||||
@@ -59,7 +60,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
|
||||
)
|
||||
def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
|
||||
global server
|
||||
server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL
|
||||
server.model_alias = "llama-test-model"
|
||||
server.start()
|
||||
res = server.make_stream_request("POST", "/chat/completions", data={
|
||||
"max_tokens": max_tokens,
|
||||
@@ -81,7 +82,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
|
||||
else:
|
||||
assert "role" not in choice["delta"]
|
||||
assert data["system_fingerprint"].startswith("b")
|
||||
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
|
||||
assert data["model"] == "llama-test-model"
|
||||
if last_cmpl_id is None:
|
||||
last_cmpl_id = data["id"]
|
||||
assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
|
||||
@@ -198,7 +199,7 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
|
||||
choice = res.body["choices"][0]
|
||||
assert match_regex(re_content, choice["message"]["content"])
|
||||
else:
|
||||
assert res.status_code != 200
|
||||
assert res.status_code == 400
|
||||
assert "error" in res.body
|
||||
|
||||
|
||||
|
||||
@@ -94,3 +94,34 @@ def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
|
||||
assert res.status_code == 200
|
||||
assert cors_header in res.headers
|
||||
assert res.headers[cors_header] == cors_header_value
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"media_path, image_url, success",
|
||||
[
|
||||
(None, "file://mtmd/test-1.jpeg", False), # disabled media path, should fail
|
||||
("../../../tools", "file://mtmd/test-1.jpeg", True),
|
||||
("../../../tools", "file:////mtmd//test-1.jpeg", True), # should be the same file as above
|
||||
("../../../tools", "file://mtmd/notfound.jpeg", False), # non-existent file
|
||||
("../../../tools", "file://../mtmd/test-1.jpeg", False), # no directory traversal
|
||||
]
|
||||
)
|
||||
def test_local_media_file(media_path, image_url, success,):
|
||||
server = ServerPreset.tinygemma3()
|
||||
server.media_path = media_path
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": 1,
|
||||
"messages": [
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "test"},
|
||||
{"type": "image_url", "image_url": {
|
||||
"url": image_url,
|
||||
}},
|
||||
]},
|
||||
],
|
||||
})
|
||||
if success:
|
||||
assert res.status_code == 200
|
||||
else:
|
||||
assert res.status_code == 400
|
||||
|
||||
@@ -95,6 +95,7 @@ class ServerProcess:
|
||||
chat_template_file: str | None = None
|
||||
server_path: str | None = None
|
||||
mmproj_url: str | None = None
|
||||
media_path: str | None = None
|
||||
|
||||
# session variables
|
||||
process: subprocess.Popen | None = None
|
||||
@@ -217,6 +218,8 @@ class ServerProcess:
|
||||
server_args.extend(["--chat-template-file", self.chat_template_file])
|
||||
if self.mmproj_url:
|
||||
server_args.extend(["--mmproj-url", self.mmproj_url])
|
||||
if self.media_path:
|
||||
server_args.extend(["--media-path", self.media_path])
|
||||
|
||||
args = [str(arg) for arg in [server_path, *server_args]]
|
||||
print(f"tests: starting server with: {' '.join(args)}")
|
||||
|
||||
Reference in New Issue
Block a user