mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 01:57:43 +02:00
Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| af76639f72 | |||
| 761797ffdf | |||
| 5d3a4a7da5 | |||
| c08d28d088 | |||
| 661e9acb36 | |||
| b8635075ff | |||
| 9c699074c9 | |||
| d01f6274c0 | |||
| 650bf14eb9 | |||
| b7ad48ebda | |||
| d006858316 | |||
| e439700992 | |||
| 50e0ad08fb | |||
| f1f793ad06 | |||
| af5c13841f | |||
| 277ff5fff7 | |||
| 384c0076bc |
@@ -1,97 +0,0 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=13.1.1
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
|
||||
|
||||
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
@@ -35,7 +35,7 @@ env:
|
||||
|
||||
jobs:
|
||||
ubuntu-riscv64-native-sanitizer:
|
||||
runs-on: RISCV64
|
||||
runs-on: ubuntu-24.04-riscv
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
@@ -50,17 +50,18 @@ jobs:
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential wget git-lfs
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
|
||||
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
|
||||
|
||||
# Install Rust stable version
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
if ! which rustc; then
|
||||
# Install Rust stable version
|
||||
sudo apt-get install -y rustup
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
fi
|
||||
|
||||
git lfs install
|
||||
|
||||
@@ -73,23 +74,12 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
# Unique cache directory per matrix combination
|
||||
export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
|
||||
mkdir -p "$CCACHE_DIR"
|
||||
|
||||
# Configure ccache
|
||||
ccache --set-config=max_size=5G
|
||||
ccache --set-config=compression=true
|
||||
ccache --set-config=compression_level=6
|
||||
ccache --set-config=cache_dir="$CCACHE_DIR"
|
||||
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
|
||||
ccache --set-config=hash_dir=false
|
||||
|
||||
# Export for subsequent steps
|
||||
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
|
||||
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
|
||||
# FIXME: Enable when ggml-org/ccache-action works on riscv64
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanytizer }}-${{ matrix.build_type }}
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
@@ -213,6 +213,27 @@ jobs:
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-win-intel-vulkan:
|
||||
runs-on: [self-hosted, Windows, X64, Intel]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
shell: C:\msys64\usr\bin\bash.exe --noprofile --norc -eo pipefail "{0}"
|
||||
env:
|
||||
MSYSTEM: UCRT64
|
||||
CHERE_INVOKING: 1
|
||||
PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
# Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
|
||||
# a valid python environment for testing
|
||||
LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
|
||||
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@ jobs:
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-vulkan-llvmpipe
|
||||
uses: ./.github/actions/linux-setup-vulkan
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
version: ${{ env.VULKAN_SDK_VERSION }}
|
||||
|
||||
+18
-29
@@ -996,7 +996,7 @@ jobs:
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
ubuntu-cpu-riscv64-native:
|
||||
runs-on: RISCV64
|
||||
runs-on: ubuntu-24.04-riscv
|
||||
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
@@ -1004,24 +1004,21 @@ jobs:
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential libssl-dev wget git-lfs
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
|
||||
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
|
||||
|
||||
# Install Rust stable version
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
if ! which rustc; then
|
||||
# Install Rust stable version
|
||||
sudo apt-get install -y rustup
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
fi
|
||||
|
||||
git lfs install
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Check environment
|
||||
run: |
|
||||
uname -a
|
||||
@@ -1031,25 +1028,17 @@ jobs:
|
||||
cmake --version
|
||||
rustc --version
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
# Set unique cache directory for this job
|
||||
export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
|
||||
mkdir -p "$CCACHE_DIR"
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# Configure ccache for optimal performance
|
||||
ccache --set-config=max_size=5G
|
||||
ccache --set-config=compression=true
|
||||
ccache --set-config=compression_level=6
|
||||
ccache --set-config=cache_dir="$CCACHE_DIR"
|
||||
|
||||
# Enable more aggressive caching
|
||||
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
|
||||
ccache --set-config=hash_dir=false
|
||||
|
||||
# Export for subsequent steps
|
||||
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
|
||||
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
|
||||
# FIXME: Enable when ggml-org/ccache-action works on riscv64
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ubuntu-cpu-riscv64-native
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
@@ -73,10 +73,10 @@ jobs:
|
||||
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
|
||||
|
||||
@@ -119,6 +119,11 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
fi
|
||||
|
||||
# Build shared libs on Windows
|
||||
# to reduce binary size and avoid errors in library loading unit tests
|
||||
if uname -s | grep -qi nt; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DBUILD_SHARED_LIBS=ON"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
|
||||
|
||||
@@ -1311,6 +1311,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.kv_unified = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
|
||||
add_opt(common_arg(
|
||||
{"--clear-idle"},
|
||||
{"--no-clear-idle"},
|
||||
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
|
||||
[](common_params & params, bool value) {
|
||||
params.clear_idle = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--context-shift"},
|
||||
{"--no-context-shift"},
|
||||
|
||||
@@ -6,110 +6,13 @@
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "log.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
namespace {
|
||||
|
||||
// Gemma4-specific PEG builder extending the standard chat builder.
|
||||
// Adds value type parsers that use <|\"|> as string delimiters
|
||||
// instead of JSON's double quotes, and disables json-to-schema
|
||||
// conversion for these types.
|
||||
class common_peg_gemma4_builder {
|
||||
common_chat_peg_builder & p_;
|
||||
static constexpr const char * QUOTE = "<|\"|>";
|
||||
|
||||
public:
|
||||
explicit common_peg_gemma4_builder(common_chat_peg_builder & p) : p_(p) {}
|
||||
|
||||
common_peg_parser gemma4_string() {
|
||||
return p_.rule("gemma4-string", [&]() {
|
||||
return p_.literal(QUOTE) + p_.until(QUOTE) + p_.literal(QUOTE);
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_number() {
|
||||
return p_.rule("gemma4-number", [&]() {
|
||||
auto digit1_9 = p_.chars("[1-9]", 1, 1);
|
||||
auto digits = p_.chars("[0-9]");
|
||||
auto int_part = p_.choice({p_.literal("0"), p_.sequence({digit1_9, p_.chars("[0-9]", 0, -1)})});
|
||||
auto frac = p_.sequence({p_.literal("."), digits});
|
||||
auto exp = p_.sequence({p_.choice({p_.literal("e"), p_.literal("E")}),
|
||||
p_.optional(p_.chars("[+-]", 1, 1)), digits});
|
||||
auto not_number_continuation = p_.negate(p_.chars("[0-9.eE+-]", 1, 1));
|
||||
return p_.sequence({p_.optional(p_.literal("-")), int_part, p_.optional(frac),
|
||||
p_.optional(exp), not_number_continuation});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_bool() {
|
||||
return p_.rule("gemma4-bool", [&]() {
|
||||
return p_.choice({p_.literal("true"), p_.literal("false")});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_null() {
|
||||
return p_.rule("gemma4-null", [&]() {
|
||||
return p_.literal("null");
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_dict() {
|
||||
return p_.rule("gemma4-dict", [&]() {
|
||||
auto ws = p_.space();
|
||||
auto key = p_.until(":");
|
||||
auto member = p_.sequence({key, p_.literal(":"), ws, gemma4_value()});
|
||||
auto members = p_.sequence({member, p_.zero_or_more(p_.sequence({p_.literal(","), ws, member}))});
|
||||
return p_.sequence({
|
||||
p_.literal("{"), ws,
|
||||
p_.choice({p_.literal("}"), p_.sequence({members, ws, p_.literal("}")})})
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_array() {
|
||||
return p_.rule("gemma4-array", [&]() {
|
||||
auto ws = p_.space();
|
||||
auto elements = p_.sequence({gemma4_value(), p_.zero_or_more(p_.sequence({p_.literal(","), ws, gemma4_value()}))});
|
||||
return p_.sequence({
|
||||
p_.literal("["), ws,
|
||||
p_.choice({p_.literal("]"), p_.sequence({elements, ws, p_.literal("]")})})
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser gemma4_value() {
|
||||
return p_.rule("gemma4-value", [&]() {
|
||||
return p_.choice({gemma4_string(), gemma4_dict(), gemma4_array(),
|
||||
gemma4_number(), gemma4_bool(), gemma4_null()});
|
||||
});
|
||||
}
|
||||
|
||||
// Select the appropriate value parser based on JSON schema type.
|
||||
// Does NOT use schema() - the gemma4 types are pure PEG without
|
||||
// JSON schema metadata, so GBNF is generated directly from the
|
||||
// PEG structure.
|
||||
common_peg_parser gemma4_value_for_type(const json & schema) {
|
||||
if (!schema.contains("type") || !schema.at("type").is_string()) {
|
||||
return gemma4_value();
|
||||
}
|
||||
std::string type = schema.at("type").get<std::string>();
|
||||
if (type == "string") { return gemma4_string(); }
|
||||
if (type == "number") { return gemma4_number(); }
|
||||
if (type == "integer") { return gemma4_number(); }
|
||||
if (type == "boolean") { return gemma4_bool(); }
|
||||
if (type == "object") { return gemma4_dict(); }
|
||||
if (type == "array") { return gemma4_array(); }
|
||||
return gemma4_value();
|
||||
}
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// Helper to iterate over tools/functions
|
||||
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
|
||||
for (const auto & tool : tools) {
|
||||
@@ -141,9 +44,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
|
||||
// Create the result structure
|
||||
common_chat_params data;
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.format = (autoparser.tools.format.mode == tool_format::TAG_WITH_GEMMA4_DICT)
|
||||
? COMMON_CHAT_FORMAT_PEG_GEMMA4
|
||||
: COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = autoparser.preserved_tokens;
|
||||
|
||||
auto parser = autoparser.build_parser(inputs);
|
||||
@@ -270,8 +171,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
|
||||
return build_tool_parser_tag_json(ctx);
|
||||
case tool_format::TAG_WITH_TAGGED:
|
||||
return build_tool_parser_tag_tagged(ctx);
|
||||
case tool_format::TAG_WITH_GEMMA4_DICT:
|
||||
return build_tool_parser_tag_gemma4_dict(ctx);
|
||||
default:
|
||||
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
|
||||
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
|
||||
@@ -317,6 +216,44 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
|
||||
p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
|
||||
const common_peg_parser & call_id_section, bool have_call_id,
|
||||
const common_peg_parser & args,
|
||||
std::optional<common_peg_parser> atomic_peek) const {
|
||||
auto open = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix);
|
||||
bool matched_atomic = false;
|
||||
common_peg_parser func_parser = p.eps();
|
||||
|
||||
if (!function.name_suffix.empty()) {
|
||||
func_parser = open + call_id_section + p.space() + args;
|
||||
matched_atomic = true;
|
||||
} else if (have_call_id) {
|
||||
func_parser = p.atomic(open + call_id_section) + p.space() + args;
|
||||
matched_atomic = true;
|
||||
} else if (atomic_peek.has_value()) {
|
||||
func_parser = p.atomic(open + call_id_section + p.space() + *atomic_peek) + args;
|
||||
matched_atomic = true;
|
||||
} else {
|
||||
func_parser = open + call_id_section + p.space() + args;
|
||||
}
|
||||
|
||||
if (!function.close.empty()) {
|
||||
func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
|
||||
} else if (!format.per_call_end.empty()) {
|
||||
// When there's no func_close but there is a per_call_end marker, use peek() to ensure
|
||||
// we only emit tool_close when we can actually see the closing marker. This prevents
|
||||
// premature closing during partial parsing when we've seen e.g. "</" which could be
|
||||
// either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
|
||||
func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
|
||||
} else {
|
||||
func_parser = func_parser + p.tool_close(p.space()); // force this to process tool closing callbacks in mapper
|
||||
}
|
||||
if (!matched_atomic) {
|
||||
func_parser = p.atomic(func_parser);
|
||||
}
|
||||
return func_parser;
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
@@ -330,17 +267,27 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
|
||||
const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();
|
||||
|
||||
// Build call_id parser based on position (if supported)
|
||||
bool have_call_id = false;
|
||||
common_peg_parser call_id_section = p.eps();
|
||||
if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
|
||||
!call_id.suffix.empty()) {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
|
||||
(!call_id.suffix.empty() || !arguments.start.empty())) {
|
||||
if (!call_id.suffix.empty()) {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
|
||||
} else {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
|
||||
}
|
||||
have_call_id = true;
|
||||
}
|
||||
auto args_parser = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
|
||||
if (!arguments.start.empty()) {
|
||||
args_parser = p.literal(arguments.start) + args_parser;
|
||||
}
|
||||
if (!arguments.end.empty()) {
|
||||
args_parser = args_parser + p.literal(arguments.end);
|
||||
}
|
||||
|
||||
auto func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
|
||||
if (!function.close.empty()) {
|
||||
func_parser = func_parser + function.close;
|
||||
}
|
||||
auto atomic_peek = !arguments.start.empty() ? std::optional(p.peek(p.literal(arguments.start))) : std::nullopt;
|
||||
auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_parser, atomic_peek);
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
});
|
||||
|
||||
@@ -400,12 +347,34 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
for (const auto & [param_name, param_schema] : properties.items()) {
|
||||
bool is_required = required.find(param_name) != required.end();
|
||||
std::string type = "object";
|
||||
auto type_obj = param_schema.contains("type") ? param_schema.at("type") : json::object();
|
||||
if (type_obj.is_string()) {
|
||||
type_obj.get_to(type);
|
||||
} else if (type_obj.is_object()) {
|
||||
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
|
||||
type_obj.at("type").get_to(type);
|
||||
if (param_schema.contains("type")) {
|
||||
const auto & type_obj = param_schema.at("type");
|
||||
if (type_obj.is_string()) {
|
||||
type_obj.get_to(type);
|
||||
} else if (type_obj.is_array()) {
|
||||
// Handle nullable types like ["string", "null"]
|
||||
for (const auto & t : type_obj) {
|
||||
if (t.is_string() && t.get<std::string>() != "null") {
|
||||
type = t.get<std::string>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (type_obj.is_object()) {
|
||||
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
|
||||
type_obj.at("type").get_to(type);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Infer string type from enum values when type is unspecified
|
||||
if (type == "object" && param_schema.contains("enum")) {
|
||||
const auto & enum_vals = param_schema.at("enum");
|
||||
if (enum_vals.is_array()) {
|
||||
for (const auto & v : enum_vals) {
|
||||
if (v.is_string()) {
|
||||
type = "string";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -448,52 +417,31 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
|
||||
}
|
||||
|
||||
if (!arguments.start.empty()) {
|
||||
args_seq = p.literal(arguments.start) + args_seq;
|
||||
}
|
||||
if (!arguments.end.empty()) {
|
||||
args_seq = args_seq + p.literal(arguments.end);
|
||||
}
|
||||
|
||||
// Build call_id parser based on position (if supported)
|
||||
common_peg_parser call_id_section = p.eps();
|
||||
bool have_call_id = false;
|
||||
if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
|
||||
!call_id.suffix.empty()) {
|
||||
(!call_id.suffix.empty() || !arguments.start.empty())) {
|
||||
have_call_id = true;
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
|
||||
}
|
||||
|
||||
bool matched_atomic = false;
|
||||
common_peg_parser func_parser = p.eps();
|
||||
if (!function.name_suffix.empty()) {
|
||||
func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section + p.space() + args_seq;
|
||||
matched_atomic = true;
|
||||
} else if (have_call_id) {
|
||||
func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section) + p.space() + args_seq;
|
||||
matched_atomic = true;
|
||||
} else if (!arguments.name_prefix.empty() && !required_parsers.empty()) {
|
||||
// Only peek for an arg tag when there are required args that must follow.
|
||||
// When all args are optional, the model may emit no arg tags at all (#20650).
|
||||
func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section + p.space() + p.peek(p.literal(arguments.name_prefix))) + args_seq;
|
||||
matched_atomic = true;
|
||||
} else {
|
||||
func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
|
||||
call_id_section + p.space() + args_seq;
|
||||
}
|
||||
|
||||
if (!function.close.empty()) {
|
||||
func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
|
||||
} else if (!format.per_call_end.empty()) {
|
||||
// When there's no func_close but there is a per_call_end marker, use peek() to ensure
|
||||
// we only emit tool_close when we can actually see the closing marker. This prevents
|
||||
// premature closing during partial parsing when we've seen e.g. "</" which could be
|
||||
// either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
|
||||
func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
|
||||
} else {
|
||||
func_parser =
|
||||
func_parser + p.tool_close(p.space()); // force this to process tool closing callbacks in mapper
|
||||
}
|
||||
if (!matched_atomic) {
|
||||
func_parser = p.atomic(func_parser);
|
||||
if (!call_id.suffix.empty()) {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
|
||||
} else {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
|
||||
}
|
||||
}
|
||||
|
||||
// Only peek for an arg tag when there are required args that must follow.
|
||||
// When all args are optional, the model may emit no arg tags at all (#20650).
|
||||
auto atomic_peek = (!arguments.name_prefix.empty() && !required_parsers.empty()) ?
|
||||
std::optional(p.peek(p.literal(arguments.name_prefix))) : std::nullopt;
|
||||
auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_seq, atomic_peek);
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
});
|
||||
|
||||
@@ -536,121 +484,4 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
common_peg_gemma4_builder g4(p);
|
||||
static const std::string QUOTE = "<|\"|>";
|
||||
|
||||
common_peg_parser tool_choice = p.choice();
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & func = tool.at("function");
|
||||
std::string name = func.at("name");
|
||||
const auto & params = func.at("parameters");
|
||||
|
||||
if (!params.contains("properties") || !params.at("properties").is_object()) {
|
||||
auto func_parser = p.atomic(
|
||||
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
|
||||
p.tool_args(p.eps()) +
|
||||
p.tool_close(p.literal("}")));
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
return;
|
||||
}
|
||||
|
||||
const auto & properties = params.at("properties");
|
||||
std::set<std::string> required;
|
||||
if (params.contains("required") && params.at("required").is_array()) {
|
||||
params.at("required").get_to(required);
|
||||
}
|
||||
|
||||
// Build per-argument parsers, sorted alphabetically (matching template's dictsort)
|
||||
struct arg_entry {
|
||||
std::string param_name;
|
||||
common_peg_parser parser;
|
||||
};
|
||||
std::vector<arg_entry> arg_entries;
|
||||
|
||||
for (const auto & [param_name, param_schema] : properties.items()) {
|
||||
std::string type = "object";
|
||||
auto type_v = param_schema.contains("type") ? param_schema.at("type") : json::object();
|
||||
if (type_v.is_string()) type_v.get_to(type);
|
||||
|
||||
common_peg_parser value_parser = p.eps();
|
||||
if (type == "string") {
|
||||
// String values are delimited by <|"|>...<|"|>
|
||||
value_parser =
|
||||
p.literal(QUOTE) +
|
||||
p.tool_arg_string_value(p.schema(p.until(QUOTE),
|
||||
"tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) +
|
||||
p.literal(QUOTE);
|
||||
} else if (type == "number" || type == "integer") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_number());
|
||||
} else if (type == "boolean") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_bool());
|
||||
} else if (type == "null") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_null());
|
||||
} else if (type == "object") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_dict());
|
||||
} else if (type == "array") {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_array());
|
||||
} else {
|
||||
value_parser = p.tool_arg_value(g4.gemma4_value());
|
||||
}
|
||||
|
||||
auto arg = p.tool_arg(
|
||||
p.tool_arg_open(p.tool_arg_name(p.literal(param_name)) + p.literal(":")) +
|
||||
value_parser +
|
||||
p.tool_arg_close(p.eps()));
|
||||
|
||||
arg_entries.push_back({param_name, p.rule("tool-" + name + "-arg-" + param_name, arg)});
|
||||
}
|
||||
|
||||
// Sort alphabetically to match Jinja's dictsort
|
||||
std::sort(arg_entries.begin(), arg_entries.end(), [](const auto & a, const auto & b) {
|
||||
return a.param_name < b.param_name;
|
||||
});
|
||||
|
||||
// Build arg sequence: any arg, then zero-or-more comma-separated additional args
|
||||
common_peg_parser args_seq = p.eps();
|
||||
if (!arg_entries.empty()) {
|
||||
common_peg_parser any_arg = p.choice();
|
||||
for (auto & entry : arg_entries) {
|
||||
any_arg |= entry.parser;
|
||||
}
|
||||
args_seq = p.optional(
|
||||
any_arg + p.repeat(p.literal(",") + any_arg, 0, (int) arg_entries.size() - 1));
|
||||
}
|
||||
|
||||
// Full parser: call:name{args}
|
||||
auto func_parser = p.atomic(
|
||||
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
|
||||
p.tool_args(args_seq) +
|
||||
p.tool_close(p.literal("}")));
|
||||
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
});
|
||||
|
||||
// Wrap each call in <|tool_call>...</tool_call|>
|
||||
auto wrapped_call = p.literal(format.per_call_start) + tool_choice + p.literal(format.per_call_end);
|
||||
|
||||
common_peg_parser tool_calls = p.eps();
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
|
||||
} else {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call);
|
||||
}
|
||||
|
||||
if (!force_tools) {
|
||||
tool_calls = p.optional(tool_calls);
|
||||
}
|
||||
|
||||
auto content_before_tools = p.until_one_of({ format.per_call_start, ctx.reasoning->start });
|
||||
return ctx.reasoning_parser +
|
||||
(force_tools ? p.eps() : p.optional(p.content(content_before_tools) + p.optional(ctx.reasoning_parser))) +
|
||||
tool_calls + p.end();
|
||||
}
|
||||
|
||||
} // namespace autoparser
|
||||
|
||||
@@ -145,7 +145,6 @@ enum class tool_format {
|
||||
JSON_NATIVE, // Pure JSON: {"name": "X", "arguments": {...}}
|
||||
TAG_WITH_JSON, // Tag-based with JSON args: <function=X>{...}</function>
|
||||
TAG_WITH_TAGGED, // Tag-based with tagged args: <param=key>value</param>
|
||||
TAG_WITH_GEMMA4_DICT, // Gemma4 custom dict: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
|
||||
};
|
||||
|
||||
inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
|
||||
@@ -158,8 +157,6 @@ inline std::ostream & operator<<(std::ostream & os, const tool_format & format)
|
||||
return os << "TAG_WITH_JSON";
|
||||
case tool_format::TAG_WITH_TAGGED:
|
||||
return os << "TAG_WITH_TAGGED";
|
||||
case tool_format::TAG_WITH_GEMMA4_DICT:
|
||||
return os << "TAG_WITH_GEMMA4_DICT";
|
||||
default:
|
||||
return os << "UNKNOWN";
|
||||
}
|
||||
@@ -356,7 +353,13 @@ struct analyze_tools : analyze_base {
|
||||
common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
|
||||
common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
|
||||
common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
|
||||
common_peg_parser build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const;
|
||||
|
||||
// Shared helper: builds func_parser from open+call_id+args, handling atomic wrapping and close.
|
||||
// atomic_peek: if present, used as the peek expression in the third atomicity branch.
|
||||
common_peg_parser build_func_parser(common_chat_peg_builder & p, const std::string & name,
|
||||
const common_peg_parser & call_id_section, bool have_call_id,
|
||||
const common_peg_parser & args,
|
||||
std::optional<common_peg_parser> atomic_peek) const;
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
|
||||
@@ -25,6 +25,9 @@ static const std::string ARG_SECOND = "BB_ARG_SND_BB";
|
||||
static const std::string USER_MSG = "U_USER_MSG Hello END_U";
|
||||
static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
|
||||
static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
|
||||
static const std::string CALL_ID_001 = "call00001";
|
||||
static const std::string CALL_ID_002 = "call00002";
|
||||
static const std::string CALL_ID_999 = "call99999";
|
||||
|
||||
static std::vector<std::function<void(const common_chat_template & tmpl, autoparser &)>> workarounds(
|
||||
{ // Old reasoning Qwen templates - they don't really display reasoning content, but we still want to
|
||||
@@ -92,34 +95,6 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Functionary 3.1]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Gemma4 - custom dict format: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("'<|tool_call>call:'") != std::string::npos) {
|
||||
analysis.tools.format.mode = tool_format::TAG_WITH_GEMMA4_DICT;
|
||||
analysis.tools.format.per_call_start = "<|tool_call>";
|
||||
analysis.tools.format.per_call_end = "<tool_call|>";
|
||||
analysis.tools.format.section_start = "";
|
||||
analysis.tools.format.section_end = "";
|
||||
analysis.tools.function.name_prefix = "call:";
|
||||
analysis.tools.function.name_suffix = "";
|
||||
analysis.tools.arguments.start = "{";
|
||||
analysis.tools.arguments.end = "}";
|
||||
analysis.tools.arguments.name_prefix = "";
|
||||
analysis.tools.arguments.name_suffix = ":";
|
||||
analysis.tools.arguments.separator = ",";
|
||||
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
|
||||
analysis.reasoning.start = "<|channel>thought";
|
||||
analysis.reasoning.end = "<channel|>";
|
||||
analysis.preserved_tokens.clear();
|
||||
analysis.preserved_tokens.push_back("<|tool_call>");
|
||||
analysis.preserved_tokens.push_back("<tool_call|>");
|
||||
analysis.preserved_tokens.push_back("<|tool_response>");
|
||||
analysis.preserved_tokens.push_back("<tool_response|>");
|
||||
analysis.preserved_tokens.push_back("<|\"|>");
|
||||
analysis.preserved_tokens.push_back("<|turn>");
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Gemma4]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// DeepSeek-R1-Distill-Qwen
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find(
|
||||
@@ -131,6 +106,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
analysis.tools.function.name_prefix = "<|tool▁sep|>";
|
||||
analysis.tools.format.per_call_end = "<|tool▁call▁end|>";
|
||||
analysis.tools.function.close = "```";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -158,7 +134,7 @@ static json user_msg = json{
|
||||
{ "content", USER_MSG }
|
||||
};
|
||||
|
||||
static json build_tool_call(const std::string & name, const json & args, const std::string & id = "call00001") {
|
||||
static json build_tool_call(const std::string & name, const json & args, const std::string & id = CALL_ID_001) {
|
||||
return json{
|
||||
{ "id", id },
|
||||
{ "type", "function" },
|
||||
@@ -166,17 +142,17 @@ static json build_tool_call(const std::string & name, const json & args, const s
|
||||
};
|
||||
}
|
||||
|
||||
static json first_tool_call_zero_args = build_tool_call(FUN_FIRST, json::object(), "call00001");
|
||||
static json first_tool_call_one_arg = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "XXXX" }}, "call00001");
|
||||
static json first_tool_call_one_arg_other_val = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "YYYY" }}, "call00001");
|
||||
static json first_tool_call_other_arg = build_tool_call(FUN_FIRST, {{ ARG_SECOND, "YYYY" }}, "call00001");
|
||||
static json first_tool_call_zero_args = build_tool_call(FUN_FIRST, json::object(), CALL_ID_001);
|
||||
static json first_tool_call_one_arg = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "XXXX" }}, CALL_ID_001);
|
||||
static json first_tool_call_one_arg_other_val = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "YYYY" }}, CALL_ID_001);
|
||||
static json first_tool_call_other_arg = build_tool_call(FUN_FIRST, {{ ARG_SECOND, "YYYY" }}, CALL_ID_001);
|
||||
|
||||
static json first_tool_call =
|
||||
build_tool_call(FUN_FIRST, json{{ ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, "call00001");
|
||||
build_tool_call(FUN_FIRST, json{{ ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_001);
|
||||
static json second_tool_call =
|
||||
build_tool_call(FUN_SECOND, json{ { ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, "call00002");
|
||||
build_tool_call(FUN_SECOND, json{ { ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_002);
|
||||
static json first_tool_call_alt_id =
|
||||
build_tool_call(FUN_FIRST, json{{ ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, "call99999");
|
||||
build_tool_call(FUN_FIRST, json{{ ARG_FIRST, "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_999);
|
||||
|
||||
template <typename T>
|
||||
static std::string mode_to_str(T mode) {
|
||||
@@ -215,6 +191,11 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
|
||||
LOG_DBG("func_name_prefix: '%s'\n", tools.function.name_prefix.c_str());
|
||||
LOG_DBG("func_name_suffix: '%s'\n", tools.function.name_suffix.c_str());
|
||||
LOG_DBG("func_close: '%s'\n", tools.function.close.c_str());
|
||||
LOG_DBG("call_id_prefix: '%s'\n", tools.call_id.prefix.c_str());
|
||||
LOG_DBG("call_id_suffix: '%s'\n", tools.call_id.suffix.c_str());
|
||||
LOG_DBG("call_id_pos: '%s'\n", mode_to_str(tools.call_id.pos).c_str());
|
||||
LOG_DBG("args_start: '%s'\n", tools.arguments.start.c_str());
|
||||
LOG_DBG("args_end: '%s'\n", tools.arguments.end.c_str());
|
||||
LOG_DBG("arg_name_prefix: '%s'\n", tools.arguments.name_prefix.c_str());
|
||||
LOG_DBG("arg_name_suffix: '%s'\n", tools.arguments.name_suffix.c_str());
|
||||
LOG_DBG("arg_value_prefix: '%s'\n", tools.arguments.value_prefix.c_str());
|
||||
@@ -583,12 +564,15 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
|
||||
if (caps.supports_parallel_tool_calls) {
|
||||
check_per_call_markers();
|
||||
}
|
||||
LOG_DBG(ANSI_ORANGE "Phase 3a: Function call analysis\n" ANSI_RESET);
|
||||
extract_function_markers();
|
||||
LOG_DBG(ANSI_ORANGE "Phase 3b: Argument analysis\n" ANSI_RESET);
|
||||
if (format.mode == tool_format::TAG_WITH_TAGGED) {
|
||||
analyze_arguments();
|
||||
}
|
||||
extract_argument_separator();
|
||||
extract_args_markers();
|
||||
LOG_DBG(ANSI_ORANGE "Phase 3c: Call id analysis\n" ANSI_RESET);
|
||||
extract_call_id_markers();
|
||||
}
|
||||
}
|
||||
@@ -979,8 +963,6 @@ void analyze_tools::extract_function_markers() {
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_arguments() {
|
||||
LOG_DBG(ANSI_ORANGE "Phase 4: Argument analysis\n" ANSI_RESET);
|
||||
|
||||
extract_argument_name_markers();
|
||||
extract_argument_value_markers();
|
||||
}
|
||||
@@ -1189,7 +1171,7 @@ void analyze_tools::extract_args_markers() {
|
||||
|
||||
const auto & diff = comparison->diff;
|
||||
|
||||
if (format.mode != tool_format::JSON_NATIVE) {
|
||||
if (format.mode == tool_format::JSON_NATIVE) {
|
||||
std::string prefix_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
|
||||
std::string suffix_marker = !format.section_end.empty() ? format.section_end : format.per_call_end;
|
||||
// these might happen earlier in the tools section as an example or somewhere else, so we need to find the closest ones
|
||||
@@ -1211,6 +1193,10 @@ void analyze_tools::extract_args_markers() {
|
||||
if (find_fun != std::string::npos) {
|
||||
args_start = args_start.substr(find_fun + FUN_FIRST.size(), args_start.size() - find_fun - FUN_FIRST.size());
|
||||
}
|
||||
size_t find_call_id = args_start.find(CALL_ID_001);
|
||||
if (find_call_id != std::string::npos) {
|
||||
args_start = args_start.substr(find_call_id + CALL_ID_001.size(), args_start.size() - find_call_id - CALL_ID_001.size());
|
||||
}
|
||||
arguments.start = args_start;
|
||||
arguments.end = args_end;
|
||||
}
|
||||
@@ -1250,8 +1236,8 @@ void analyze_tools::extract_call_id_markers() {
|
||||
return;
|
||||
}
|
||||
|
||||
std::string id_value_1 = "call00001";
|
||||
std::string id_value_2 = "call99999";
|
||||
std::string id_value_1 = CALL_ID_001;
|
||||
std::string id_value_2 = CALL_ID_999;
|
||||
|
||||
size_t common_id_prefix_len = 0;
|
||||
for (size_t i = 0; i < std::min(id_value_1.length(), id_value_2.length()); i++) {
|
||||
@@ -1350,6 +1336,14 @@ void analyze_tools::extract_call_id_markers() {
|
||||
call_id.suffix = find_first_marker(before_func);
|
||||
}
|
||||
|
||||
if (call_id.prefix == arguments.end) {
|
||||
call_id.prefix = "";
|
||||
}
|
||||
|
||||
if (call_id.suffix == arguments.start) {
|
||||
call_id.suffix = "";
|
||||
}
|
||||
|
||||
// When call_id is detected, per_call_end may have been incorrectly set to include
|
||||
// the call_id_suffix and sample args. Clear it if it starts with call_id_suffix.
|
||||
if (call_id.pos != call_id_position::NONE && !call_id.suffix.empty() &&
|
||||
|
||||
+140
-82
@@ -75,84 +75,6 @@ static std::string escape_json_string_inner(const std::string & s) {
|
||||
return escaped;
|
||||
}
|
||||
|
||||
static const std::string GEMMA4_QUOTE = "<|\"|>";
|
||||
|
||||
static std::string normalize_gemma4_to_json(const std::string & input) {
|
||||
std::string result;
|
||||
result.reserve(input.size() * 2);
|
||||
|
||||
enum Ctx { DICT, ARRAY };
|
||||
std::vector<Ctx> ctx;
|
||||
|
||||
auto is_ws = [](char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; };
|
||||
auto skip_ws = [&](size_t & pos) {
|
||||
while (pos < input.size() && is_ws(input[pos])) {
|
||||
result += input[pos++];
|
||||
}
|
||||
};
|
||||
|
||||
auto quote_unquoted_key = [&](size_t & pos) {
|
||||
if (pos < input.size() && input[pos] != '"' && input[pos] != '}') {
|
||||
result += '"';
|
||||
while (pos < input.size() && input[pos] != ':' && !is_ws(input[pos])) {
|
||||
result += input[pos++];
|
||||
}
|
||||
result += '"';
|
||||
skip_ws(pos);
|
||||
}
|
||||
};
|
||||
|
||||
size_t i = 0;
|
||||
while (i < input.size()) {
|
||||
if (i + GEMMA4_QUOTE.size() <= input.size() &&
|
||||
input.compare(i, GEMMA4_QUOTE.size(), GEMMA4_QUOTE) == 0) {
|
||||
result += '"';
|
||||
i += GEMMA4_QUOTE.size();
|
||||
continue;
|
||||
}
|
||||
|
||||
char c = input[i];
|
||||
|
||||
if (c == '{') {
|
||||
result += c;
|
||||
ctx.push_back(DICT);
|
||||
++i;
|
||||
skip_ws(i);
|
||||
quote_unquoted_key(i);
|
||||
continue;
|
||||
}
|
||||
if (c == '}') {
|
||||
result += c;
|
||||
if (!ctx.empty()) ctx.pop_back();
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
if (c == '[') {
|
||||
result += c;
|
||||
ctx.push_back(ARRAY);
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
if (c == ']') {
|
||||
result += c;
|
||||
if (!ctx.empty()) ctx.pop_back();
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
if (c == ',' && !ctx.empty() && ctx.back() == DICT) {
|
||||
result += c;
|
||||
++i;
|
||||
skip_ws(i);
|
||||
quote_unquoted_key(i);
|
||||
continue;
|
||||
}
|
||||
|
||||
result += c;
|
||||
++i;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Convert Python-style single-quoted strings to JSON double-quoted strings
|
||||
// Only converts outer string delimiters, properly handling escape sequences:
|
||||
// - {'key': 'value'} -> {"key": "value"}
|
||||
@@ -296,10 +218,6 @@ std::string common_chat_peg_mapper::normalize_container_value(const std::string
|
||||
return normalize_quotes_to_json(input);
|
||||
}
|
||||
|
||||
std::string common_chat_peg_gemma4_mapper::normalize_container_value(const std::string & input) {
|
||||
return normalize_quotes_to_json(normalize_gemma4_to_json(input));
|
||||
}
|
||||
|
||||
void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena,
|
||||
const common_peg_parse_result & parse_result_arg) {
|
||||
arena.visit(parse_result_arg, [this](const common_peg_ast_node & node) { map(node); });
|
||||
@@ -947,3 +865,143 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
|
||||
|
||||
return force_tool_calls ? section : optional(section);
|
||||
}
|
||||
|
||||
void common_chat_peg_gemma4_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
|
||||
for (const auto & node : result.nodes) {
|
||||
visit(arena, node);
|
||||
}
|
||||
}
|
||||
|
||||
static std::string gemma4_to_json(const common_peg_ast_arena & arena, common_peg_ast_id id) {
|
||||
const auto & node = arena.get(id);
|
||||
|
||||
if (node.text.empty()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
if (node.rule == "gemma4-number" || node.rule == "gemma4-bool" || node.rule == "gemma4-null") {
|
||||
return std::string(node.text);
|
||||
}
|
||||
|
||||
if (node.rule == "gemma4-string-content") {
|
||||
return escape_json_string_inner(std::string(node.text));
|
||||
}
|
||||
|
||||
if (node.rule == "gemma4-string") {
|
||||
std::string result = "\"";
|
||||
if (!node.children.empty()) {
|
||||
result += gemma4_to_json(arena, node.children[0]);
|
||||
if (!node.is_partial) {
|
||||
result += "\"";
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
if (node.rule == "gemma4-array") {
|
||||
std::string result = "[";
|
||||
|
||||
bool add_comma = false;
|
||||
for (auto child_id : node.children) {
|
||||
if (add_comma) {
|
||||
result += ',';
|
||||
}
|
||||
add_comma = true;
|
||||
result += gemma4_to_json(arena, child_id);
|
||||
}
|
||||
|
||||
if (!node.is_partial) {
|
||||
result += ']';
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
if (node.rule == "gemma4-dict-key-name") {
|
||||
return std::string(node.text);
|
||||
}
|
||||
|
||||
if (node.rule == "gemma4-dict-key") {
|
||||
std::string result = "\"";
|
||||
if (!node.children.empty()) {
|
||||
result += escape_json_string_inner(gemma4_to_json(arena, node.children[0]));
|
||||
}
|
||||
if (!node.is_partial) {
|
||||
result += "\":";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
if (node.rule == "gemma4-dict-kv") {
|
||||
std::string result;
|
||||
for (auto child_id : node.children) {
|
||||
result += gemma4_to_json(arena, child_id);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
if (node.rule == "gemma4-dict") {
|
||||
std::string result = "{";
|
||||
|
||||
bool add_comma = false;
|
||||
for (auto child_id : node.children) {
|
||||
if (add_comma) {
|
||||
result += ',';
|
||||
}
|
||||
add_comma = true;
|
||||
result += gemma4_to_json(arena, child_id);
|
||||
}
|
||||
|
||||
if (!node.is_partial) {
|
||||
result += '}';
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
if (node.rule == "gemma4-value") {
|
||||
if (!node.children.empty()) {
|
||||
return gemma4_to_json(arena, node.children[0]);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
void common_chat_peg_gemma4_mapper::visit(const common_peg_ast_arena & arena, common_peg_ast_id id) {
|
||||
const auto & node = arena.get(id);
|
||||
|
||||
if (node.tag == "reasoning") {
|
||||
result.reasoning_content += std::string(node.text);
|
||||
return;
|
||||
}
|
||||
|
||||
if (node.tag == "content") {
|
||||
result.content += std::string(node.text);
|
||||
return;
|
||||
}
|
||||
|
||||
if (node.tag == "tool") {
|
||||
auto name_id = arena.find_by_tag(node, "tool-name");
|
||||
auto args_id = arena.find_by_tag(node, "tool-args");
|
||||
|
||||
if (name_id != COMMON_PEG_INVALID_AST_ID && args_id != COMMON_PEG_INVALID_AST_ID) {
|
||||
const auto & name_node = arena.get(name_id);
|
||||
const auto & args_node = arena.get(args_id);
|
||||
|
||||
if (!name_node.is_partial) {
|
||||
common_chat_tool_call call;
|
||||
call.name = std::string(name_node.text);
|
||||
if (!args_node.children.empty()) {
|
||||
call.arguments = gemma4_to_json(arena, args_node.children[0]);
|
||||
}
|
||||
result.tool_calls.push_back(call);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto child_id : node.children) {
|
||||
visit(arena, child_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,8 +35,9 @@ class common_chat_peg_mapper {
|
||||
class common_chat_peg_gemma4_mapper : public common_chat_peg_mapper {
|
||||
public:
|
||||
common_chat_peg_gemma4_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
|
||||
protected:
|
||||
std::string normalize_container_value(const std::string & input) override;
|
||||
virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
|
||||
private:
|
||||
void visit(const common_peg_ast_arena & arena, common_peg_ast_id id);
|
||||
};
|
||||
|
||||
struct content_structure;
|
||||
|
||||
+267
-40
@@ -1077,6 +1077,131 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
return data;
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_gemma4(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
|
||||
data.supports_thinking = true;
|
||||
|
||||
data.preserved_tokens = {
|
||||
"<|channel>",
|
||||
"<channel|>",
|
||||
"<|tool_call>",
|
||||
"<tool_call|>",
|
||||
"<|turn>",
|
||||
};
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
|
||||
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto start = p.rule("start", p.prefix(inputs.generation_prompt, "<|channel>"));
|
||||
|
||||
if (extract_reasoning) {
|
||||
p.rule("thought", p.literal("<|channel>thought\n") + p.reasoning(p.until("<channel|>")) + p.literal("<channel|>"));
|
||||
} else {
|
||||
p.rule("thought", p.content(p.literal("<|channel>thought\n") + p.until("<channel|>") + p.literal("<channel|>")));
|
||||
}
|
||||
|
||||
auto thought = (p.peek(p.literal("<|channel>")) + p.ref("thought")) | p.negate(p.literal("<|channel>"));
|
||||
|
||||
if (has_response_format) {
|
||||
auto response_format = p.literal("```json") <<
|
||||
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)) <<
|
||||
p.literal("```");
|
||||
return start + p.optional(thought) + response_format;
|
||||
}
|
||||
|
||||
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
// Gemma4 tool calling syntax
|
||||
// Rules should match traversal logic in gemma4_to_json()
|
||||
p.rule("gemma4-string-content", p.until("<|\"|>"));
|
||||
p.rule("gemma4-string", p.literal("<|\"|>") + p.ref("gemma4-string-content") + p.literal("<|\"|>"));
|
||||
p.rule("gemma4-bool", p.json_bool());
|
||||
p.rule("gemma4-null", p.json_null());
|
||||
p.rule("gemma4-number", p.json_number());
|
||||
p.rule("gemma4-dict-key", p.rule("gemma4-dict-key-name", p.until(":")) + p.literal(":"));
|
||||
p.rule("gemma4-dict-kv", p.ref("gemma4-dict-key") + p.space() + p.ref("gemma4-value"));
|
||||
p.rule("gemma4-dict", [&]() {
|
||||
auto ws = p.space();
|
||||
auto member = p.ref("gemma4-dict-kv");
|
||||
auto members = p.sequence({member, p.zero_or_more(p.sequence({p.literal(","), ws, member}))});
|
||||
return p.sequence({
|
||||
p.literal("{"), ws,
|
||||
p.choice({p.literal("}"), p.sequence({members, ws, p.literal("}")})})
|
||||
});
|
||||
});
|
||||
p.rule("gemma4-array", [&]() {
|
||||
auto ws = p.space();
|
||||
auto value = p.ref("gemma4-value");
|
||||
auto elements = p.sequence({value, p.zero_or_more(p.sequence({p.literal(","), ws, value}))});
|
||||
return p.sequence({
|
||||
p.literal("["), ws,
|
||||
p.choice({p.literal("]"), p.sequence({elements, ws, p.literal("]")})})
|
||||
});
|
||||
});
|
||||
p.rule("gemma4-value", [&]() {
|
||||
return p.choice({
|
||||
p.ref("gemma4-string"), p.ref("gemma4-dict"), p.ref("gemma4-array"),
|
||||
p.ref("gemma4-number"), p.ref("gemma4-bool"), p.ref("gemma4-null")
|
||||
});
|
||||
});
|
||||
|
||||
auto tool_choice = p.choice();
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
// TODO @aldehir : need to extend json-schema-to-grammar to produce more than JSON rules
|
||||
// const auto & params = function.at("parameters");
|
||||
|
||||
tool_choice |= p.rule("tool-" + name, p.tool(p.sequence({
|
||||
p.tool_open(p.tool_name(p.literal(name)) + p.peek(p.literal("{"))),
|
||||
p.tool_args(p.ref("gemma4-dict")),
|
||||
})));
|
||||
});
|
||||
|
||||
auto tool_call = p.trigger_rule("tool-call", p.repeat(
|
||||
"<|tool_call>call:" + tool_choice + "<tool_call|>",
|
||||
/* min = */ inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0,
|
||||
/* max = */ inputs.parallel_tool_calls ? -1 : 1
|
||||
));
|
||||
|
||||
auto content = p.rule("content", p.content(p.until_one_of({"<|channel>", "<|tool_call>"})));
|
||||
auto message = p.rule("message", thought + content);
|
||||
return start + p.zero_or_more(message) + tool_call;
|
||||
}
|
||||
|
||||
auto content = p.rule("content", p.content(p.until("<|channel>")));
|
||||
auto message = p.rule("message", thought + content);
|
||||
return start + p.one_or_more(message);
|
||||
});
|
||||
|
||||
data.parser = parser.save();
|
||||
|
||||
if (include_grammar) {
|
||||
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
auto schema = function.at("parameters");
|
||||
builder.resolve_refs(schema);
|
||||
});
|
||||
parser.build_grammar(builder, data.grammar_lazy);
|
||||
});
|
||||
|
||||
data.grammar_triggers = {
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_call>" },
|
||||
};
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
// Functionary v3.2 - uses recipient-based format: >>>recipient\n{content}
|
||||
static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
@@ -1556,46 +1681,146 @@ static void requires_non_null_content(json & messages) {
|
||||
}
|
||||
|
||||
// Gemma4 uses a custom tool_responses field instead of role:tool messages.
|
||||
// Convert consecutive role:tool messages into a single user message with tool_responses.
|
||||
//
|
||||
// This will transform a sequence of messages:
|
||||
// assistant(tool_call+) -> tool+ -> assistant(content)
|
||||
//
|
||||
// Into a single assistant message containing a tool_responses field:
|
||||
// assistant(content + tool_call + tool_responses)
|
||||
//
|
||||
// This is necessary for the Gemma4 chat template to properly format the prompt.
|
||||
// See https://ai.google.dev/gemma/docs/core/prompt-formatting-gemma4
|
||||
struct gemma4_model_turn_builder {
|
||||
json & messages;
|
||||
size_t pos;
|
||||
json tool_calls = json::array();
|
||||
json tool_responses = json::array();
|
||||
json content;
|
||||
json reasoning_content;
|
||||
|
||||
gemma4_model_turn_builder(json & msgs, size_t pos) : messages(msgs), pos(pos) {}
|
||||
|
||||
void collect() {
|
||||
// Collect the first assistant message
|
||||
auto & msg = messages[pos];
|
||||
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
|
||||
// According to the prompt formatting guide, we need to preserve reasoning_content
|
||||
// between function calls. The current chat templates do not support this, but we will do it anyway.
|
||||
reasoning_content = msg.at("reasoning_content");
|
||||
}
|
||||
for (auto & tc : msg.at("tool_calls")) {
|
||||
tool_calls.push_back(tc);
|
||||
}
|
||||
pos++;
|
||||
|
||||
// Collect tool call results
|
||||
while (pos < messages.size() && messages[pos].value("role", "") == "tool") {
|
||||
collect_result(messages[pos]);
|
||||
pos++;
|
||||
}
|
||||
|
||||
// Check if the next assistant message is the final message
|
||||
if (pos < messages.size() && messages[pos].value("role", "") == "assistant") {
|
||||
auto & next = messages[pos];
|
||||
if (!has_tool_calls(next) && has_content(next)) {
|
||||
content = next.at("content");
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void collect_result(const json & curr) {
|
||||
json response;
|
||||
if (curr.contains("content")) {
|
||||
const auto & content = curr.at("content");
|
||||
if (content.is_string()) {
|
||||
// Try to parse the content as JSON; fall back to raw string
|
||||
try {
|
||||
response = json::parse(content.get<std::string>());
|
||||
} catch (...) {
|
||||
response = content;
|
||||
}
|
||||
} else {
|
||||
response = content;
|
||||
}
|
||||
}
|
||||
|
||||
std::string name;
|
||||
|
||||
// Match name with corresponding tool call
|
||||
size_t idx = tool_responses.size();
|
||||
if (idx < tool_calls.size()) {
|
||||
auto & tc = tool_calls[idx];
|
||||
if (tc.contains("function")) {
|
||||
name = tc.at("function").value("name", "");
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to the tool call id
|
||||
if (name.empty()) {
|
||||
name = curr.value("tool_call_id", "");
|
||||
}
|
||||
|
||||
tool_responses.push_back({{"name", name}, {"response", response}});
|
||||
}
|
||||
|
||||
json build() {
|
||||
collect();
|
||||
|
||||
json msg = {
|
||||
{"role", "assistant"},
|
||||
{"tool_calls", tool_calls},
|
||||
};
|
||||
if (!tool_responses.empty()) {
|
||||
msg["tool_responses"] = tool_responses;
|
||||
}
|
||||
if (!content.is_null()) {
|
||||
msg["content"] = content;
|
||||
}
|
||||
if (!reasoning_content.is_null()) {
|
||||
msg["reasoning_content"] = reasoning_content;
|
||||
}
|
||||
return msg;
|
||||
}
|
||||
|
||||
static bool has_content(const json & msg) {
|
||||
if (!msg.contains("content") || msg.at("content").is_null()) {
|
||||
return false;
|
||||
}
|
||||
const auto & content = msg.at("content");
|
||||
if (content.is_string() && !content.get<std::string>().empty()) {
|
||||
return true;
|
||||
}
|
||||
if (content.is_array() && !content.empty()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool has_tool_calls(const json & msg) {
|
||||
return msg.contains("tool_calls") && msg.at("tool_calls").is_array() && !msg.at("tool_calls").empty();
|
||||
}
|
||||
};
|
||||
|
||||
static void convert_tool_responses_gemma4(json & messages) {
|
||||
json result = json::array();
|
||||
size_t i = 0;
|
||||
|
||||
while (i < messages.size()) {
|
||||
if (messages[i].contains("role") && messages[i].at("role") == "tool") {
|
||||
json tool_responses = json::array();
|
||||
while (i < messages.size() &&
|
||||
messages[i].contains("role") &&
|
||||
messages[i].at("role") == "tool") {
|
||||
const auto & tool_msg = messages[i];
|
||||
std::string name;
|
||||
if (tool_msg.contains("tool_call_id") && tool_msg.at("tool_call_id").is_string()) {
|
||||
name = tool_msg.at("tool_call_id");
|
||||
} else if (tool_msg.contains("name") && tool_msg.at("name").is_string()) {
|
||||
name = tool_msg.at("name");
|
||||
}
|
||||
json response;
|
||||
if (tool_msg.contains("content")) {
|
||||
const auto & content = tool_msg.at("content");
|
||||
if (content.is_string()) {
|
||||
// Try to parse the content as JSON; fall back to raw string
|
||||
try {
|
||||
response = json::parse(content.get<std::string>());
|
||||
} catch (...) {
|
||||
response = content;
|
||||
}
|
||||
} else {
|
||||
response = content;
|
||||
}
|
||||
}
|
||||
tool_responses.push_back({{"name", name}, {"response", response}});
|
||||
i++;
|
||||
}
|
||||
result.push_back({{"role", "user"}, {"tool_responses", tool_responses}});
|
||||
} else {
|
||||
result.push_back(messages[i]);
|
||||
auto & msg = messages[i];
|
||||
|
||||
if (msg.value("role", "") != "assistant" || !msg.contains("tool_calls") ||
|
||||
!msg.at("tool_calls").is_array() || msg.at("tool_calls").empty()) {
|
||||
result.push_back(msg);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
gemma4_model_turn_builder builder(messages, i);
|
||||
result.push_back(builder.build());
|
||||
i = builder.pos;
|
||||
}
|
||||
|
||||
messages = result;
|
||||
}
|
||||
|
||||
@@ -1631,10 +1856,10 @@ static json common_chat_extra_context() {
|
||||
return ctx;
|
||||
}
|
||||
|
||||
static std::optional<common_chat_params> try_specialized_template(
|
||||
std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const common_chat_template & tmpl,
|
||||
const std::string & src,
|
||||
const autoparser::generation_params & params) {
|
||||
autoparser::generation_params & params) {
|
||||
// Ministral/Mistral Large 3 - uses special reasoning structure fixes, can't use autoparser
|
||||
// Note: Mistral Small 3.2 uses [CALL_ID] which Ministral doesn't have, so we can distinguish them
|
||||
if (src.find("[SYSTEM_PROMPT]") != std::string::npos && src.find("[TOOL_CALLS]") != std::string::npos &&
|
||||
@@ -1687,6 +1912,12 @@ static std::optional<common_chat_params> try_specialized_template(
|
||||
return common_chat_params_init_gigachat_v3(tmpl, params);
|
||||
}
|
||||
|
||||
// Gemma4 format detection
|
||||
if (src.find("'<|tool_call>call:'") != std::string::npos) {
|
||||
workaround::convert_tool_responses_gemma4(params.messages);
|
||||
return common_chat_params_init_gemma4(tmpl, params);
|
||||
}
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
@@ -1727,10 +1958,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
workaround::func_args_not_string(params.messages);
|
||||
}
|
||||
|
||||
if (src.find("'<|tool_call>call:'") != std::string::npos) {
|
||||
workaround::convert_tool_responses_gemma4(params.messages);
|
||||
}
|
||||
|
||||
params.add_generation_prompt = false;
|
||||
std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
|
||||
params.add_generation_prompt = true;
|
||||
@@ -1778,7 +2005,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
return data;
|
||||
}
|
||||
|
||||
if (auto result = try_specialized_template(tmpl, src, params)) {
|
||||
if (auto result = common_chat_try_specialized_template(tmpl, src, params)) {
|
||||
result->generation_prompt = params.generation_prompt;
|
||||
return *result;
|
||||
}
|
||||
|
||||
@@ -270,3 +270,8 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
|
||||
std::string common_chat_template_direct_apply(
|
||||
const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs);
|
||||
|
||||
std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const common_chat_template & tmpl,
|
||||
const std::string & src,
|
||||
autoparser::generation_params & params);
|
||||
|
||||
+3
-2
@@ -579,8 +579,9 @@ struct common_params {
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
bool cache_prompt = true; // whether to enable prompt caching
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
bool clear_idle = true; // save and clear idle slots upon starting a new task
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
|
||||
+6
-3
@@ -596,9 +596,12 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & f : files) {
|
||||
if (gguf_filename_is_model(f.path)) {
|
||||
return f;
|
||||
// fallback to first available model only if tag is empty
|
||||
if (tag.empty()) {
|
||||
for (const auto & f : files) {
|
||||
if (gguf_filename_is_model(f.path)) {
|
||||
return f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
+49
-1
@@ -256,6 +256,38 @@ static std::pair<std::vector<common_peg_chars_parser::char_range>, bool> parse_c
|
||||
return {ranges, negated};
|
||||
}
|
||||
|
||||
common_peg_ast_id common_peg_ast_arena::find_by_tag(const common_peg_ast_node & parent, const std::string & tag, int max_depth) const {
|
||||
for (auto child_id : parent.children) {
|
||||
const auto & child = get(child_id);
|
||||
if (child.tag == tag) {
|
||||
return child_id;
|
||||
}
|
||||
if (max_depth > 1) {
|
||||
auto result = find_by_tag(child, tag, max_depth - 1);
|
||||
if (result != COMMON_PEG_INVALID_AST_ID) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
return COMMON_PEG_INVALID_AST_ID;
|
||||
}
|
||||
|
||||
common_peg_ast_id common_peg_ast_arena::find_by_rule(const common_peg_ast_node & parent, const std::string & rule, int max_depth) const {
|
||||
for (auto child_id : parent.children) {
|
||||
const auto & child = get(child_id);
|
||||
if (child.rule == rule) {
|
||||
return child_id;
|
||||
}
|
||||
if (max_depth > 1) {
|
||||
auto result = find_by_rule(child, rule, max_depth - 1);
|
||||
if (result != COMMON_PEG_INVALID_AST_ID) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
return COMMON_PEG_INVALID_AST_ID;
|
||||
}
|
||||
|
||||
void common_peg_ast_arena::visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const {
|
||||
if (id == COMMON_PEG_INVALID_AST_ID) {
|
||||
return;
|
||||
@@ -1561,7 +1593,23 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
|
||||
if (!s.schema) {
|
||||
return true;
|
||||
}
|
||||
if (s.raw && s.schema->contains("type") && s.schema->at("type").is_string() && s.schema->at("type") == "string") {
|
||||
if (s.raw && s.schema->contains("type")) {
|
||||
const auto & type_val = s.schema->at("type");
|
||||
if (type_val.is_string() && type_val == "string") {
|
||||
return true;
|
||||
}
|
||||
// Handle nullable types like ["string", "null"] - delegate when the
|
||||
// non-null type is string, since the tagged format uses raw text
|
||||
if (type_val.is_array()) {
|
||||
for (const auto & t : type_val) {
|
||||
if (t.is_string() && t.get<std::string>() != "null") {
|
||||
return t.get<std::string>() == "string";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Delegate for enum schemas in raw mode - enum values are literal strings
|
||||
if (s.raw && !s.schema->contains("type") && s.schema->contains("enum")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
@@ -106,6 +106,9 @@ class common_peg_ast_arena {
|
||||
|
||||
const common_peg_ast_node & get(common_peg_ast_id id) const { return nodes_.at(id); }
|
||||
|
||||
common_peg_ast_id find_by_tag(const common_peg_ast_node & parent, const std::string & tag, int max_depth = 3) const;
|
||||
common_peg_ast_id find_by_rule(const common_peg_ast_node & parent, const std::string & tag, int max_depth = 3) const;
|
||||
|
||||
size_t size() const { return nodes_.size(); }
|
||||
|
||||
void clear() { nodes_.clear(); }
|
||||
|
||||
+91
-9
@@ -11521,13 +11521,50 @@ class LLaDAMoEModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM")
|
||||
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
|
||||
class HunYuanModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
|
||||
def _get_eod_token_id(self) -> int | None:
|
||||
"""Get the actual end-of-generation token from config (eod_token_id)."""
|
||||
return self.hparams.get("eod_token_id")
|
||||
|
||||
def _get_eot_token_id(self) -> int | None:
|
||||
"""Get the end-of-turn token from generation_config.json.
|
||||
This is the first entry in eos_token_id when it's a list."""
|
||||
gen_cfg_path = self.dir_model / "generation_config.json"
|
||||
if gen_cfg_path.is_file():
|
||||
with open(gen_cfg_path, encoding="utf-8") as f:
|
||||
gen_cfg = json.load(f)
|
||||
eos = gen_cfg.get("eos_token_id")
|
||||
if isinstance(eos, list) and len(eos) >= 2:
|
||||
return eos[0]
|
||||
return None
|
||||
|
||||
def _fix_special_tokens(self):
|
||||
"""Fix EOS/EOT tokens that are incorrect in upstream configs."""
|
||||
eod_id = self._get_eod_token_id()
|
||||
if eod_id is not None:
|
||||
self.gguf_writer.add_eos_token_id(eod_id)
|
||||
eot_id = self._get_eot_token_id()
|
||||
if eot_id is not None:
|
||||
self.gguf_writer.add_eot_token_id(eot_id)
|
||||
|
||||
def set_vocab(self):
|
||||
if (self.dir_model / "tokenizer.json").is_file():
|
||||
self._set_vocab_gpt2()
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
# HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
|
||||
token_types = None
|
||||
if (self.hparams.get("pad_token_id") or 0) < 0:
|
||||
token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
self._fix_special_tokens()
|
||||
else:
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
@@ -11579,13 +11616,18 @@ class HunYuanModel(TextModel):
|
||||
# FIX for BOS token: Overwrite incorrect id read from config.json
|
||||
if self.hparams['hidden_size'] == 4096:
|
||||
self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
|
||||
self._fix_special_tokens()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
# HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
|
||||
saved_num_experts = self.hparams.pop("num_experts", None)
|
||||
super().set_gguf_parameters()
|
||||
if saved_num_experts is not None and saved_num_experts > 1:
|
||||
self.hparams["num_experts"] = saved_num_experts
|
||||
hparams = self.hparams
|
||||
|
||||
# Rope
|
||||
if self.rope_parameters.get("rope_type") == "dynamic":
|
||||
if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"):
|
||||
# HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
||||
# 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
|
||||
alpha = self.rope_parameters.get("alpha", 50)
|
||||
@@ -11595,13 +11637,14 @@ class HunYuanModel(TextModel):
|
||||
self.gguf_writer.add_rope_freq_base(scaled_base)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_rope_scaling_factor(1)
|
||||
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
|
||||
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
|
||||
if self.rope_parameters.get("rope_type") == "dynamic":
|
||||
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
|
||||
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
|
||||
|
||||
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
|
||||
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
|
||||
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
|
||||
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
|
||||
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
|
||||
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name == "lm_head.weight":
|
||||
@@ -11609,9 +11652,48 @@ class HunYuanModel(TextModel):
|
||||
logger.info("Skipping tied output layer 'lm_head.weight'")
|
||||
return
|
||||
|
||||
# skip vision tensors for HunyuanVL models
|
||||
if name.startswith("vit."):
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanOCRVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
# HunyuanOCR uses max_image_size instead of image_size
|
||||
if "image_size" not in self.hparams_vision:
|
||||
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_vision is not None
|
||||
hparams = self.hparams_vision
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if not name.startswith("vit."):
|
||||
return # skip text tensors
|
||||
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
|
||||
if "position_embedding" in name:
|
||||
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
|
||||
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
@ModelBase.register("SmolLM3ForCausalLM")
|
||||
class SmolLM3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.SMOLLM3
|
||||
|
||||
+1
-1
@@ -389,7 +389,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
||||
|
||||
|
||||
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
|
||||
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
|
||||
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. Note that [`HSA_OVERRIDE_GFX_VERSION`] is [not supported on Windows](https://github.com/ROCm/ROCm/issues/2654)
|
||||
|
||||
### Unified Memory
|
||||
|
||||
|
||||
@@ -437,12 +437,18 @@ inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_
|
||||
|
||||
// Head-dim specializations used by the tuned vec f16 path.
|
||||
switch (key.head_dim_qk) {
|
||||
case 64: return 2u;
|
||||
case 96: return 4u;
|
||||
case 128: return 1u;
|
||||
case 192: return 2u;
|
||||
case 576: return 2u;
|
||||
default: return 1u;
|
||||
case 64:
|
||||
return 2u;
|
||||
case 96:
|
||||
return 4u;
|
||||
case 128:
|
||||
return 1u;
|
||||
case 192:
|
||||
return 2u;
|
||||
case 576:
|
||||
return 2u;
|
||||
default:
|
||||
return 1u;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -513,9 +519,9 @@ struct ggml_webgpu_flash_attn_blk_shader_lib_context {
|
||||
};
|
||||
|
||||
inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_flash_attn_blk_shader(
|
||||
pre_wgsl::Preprocessor & preprocessor,
|
||||
const char * shader_src,
|
||||
const ggml_webgpu_flash_attn_blk_shader_lib_context & context) {
|
||||
pre_wgsl::Preprocessor & preprocessor,
|
||||
const char * shader_src,
|
||||
const ggml_webgpu_flash_attn_blk_shader_lib_context & context) {
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "flash_attn_vec_blk";
|
||||
|
||||
@@ -1857,9 +1863,8 @@ class ggml_webgpu_shader_lib {
|
||||
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
|
||||
|
||||
uint32_t q_tile = context.sg_mat_m;
|
||||
uint32_t kv_tile =
|
||||
std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
|
||||
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
|
||||
uint32_t kv_tile = std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
|
||||
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
|
||||
if (context.key.use_vec) {
|
||||
q_tile = 1;
|
||||
kv_tile = std::max(context.sg_mat_n, std::min(32u, ggml_webgpu_flash_attn_max_kv_tile(context)));
|
||||
@@ -1885,14 +1890,14 @@ class ggml_webgpu_shader_lib {
|
||||
}
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
|
||||
const char * shader_src = context.key.use_vec ? wgsl_flash_attn_vec_split : wgsl_flash_attn;
|
||||
const char * shader_src = context.key.use_vec ? wgsl_flash_attn_vec_split : wgsl_flash_attn;
|
||||
webgpu_pipeline pipeline =
|
||||
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(shader_src, defines), variant);
|
||||
auto decisions = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
|
||||
decisions->q_tile = q_tile;
|
||||
decisions->kv_tile = kv_tile;
|
||||
decisions->wg_size = wg_size;
|
||||
pipeline.context = decisions;
|
||||
auto decisions = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
|
||||
decisions->q_tile = q_tile;
|
||||
decisions->kv_tile = kv_tile;
|
||||
decisions->wg_size = wg_size;
|
||||
pipeline.context = decisions;
|
||||
flash_attn_pipelines[context.key] = pipeline;
|
||||
return flash_attn_pipelines[context.key];
|
||||
}
|
||||
@@ -1905,7 +1910,7 @@ class ggml_webgpu_shader_lib {
|
||||
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_flash_attn_blk_shader(preprocessor, wgsl_flash_attn_vec_blk, context);
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
|
||||
flash_attn_blk_pipelines[context.key] = pipeline;
|
||||
return flash_attn_blk_pipelines[context.key];
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -734,6 +734,7 @@ class MODEL_TENSOR(IntEnum):
|
||||
V_LAYER_OUT_SCALE = auto()
|
||||
V_PRE_NORM = auto()
|
||||
V_POST_NORM = auto()
|
||||
V_MM_PRE_NORM = auto() # hunyuanocr
|
||||
V_MM_POST_NORM = auto()
|
||||
V_MM_INP_NORM = auto()
|
||||
V_MM_INP_PROJ = auto() # gemma3
|
||||
@@ -769,6 +770,8 @@ class MODEL_TENSOR(IntEnum):
|
||||
V_MM_GATE = auto() # cogvlm
|
||||
V_TOK_BOI = auto() # cogvlm
|
||||
V_TOK_EOI = auto() # cogvlm
|
||||
V_TOK_IMG_BEGIN = auto() # hunyuanocr
|
||||
V_TOK_IMG_END = auto() # hunyuanocr
|
||||
V_STD_BIAS = auto() # gemma4
|
||||
V_STD_SCALE = auto() # gemma4
|
||||
V_SAM_POS_EMBD = auto() # Deepseek-OCR
|
||||
@@ -1246,6 +1249,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.V_MM_GATE: "mm.gate",
|
||||
MODEL_TENSOR.V_TOK_BOI: "v.boi",
|
||||
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
|
||||
MODEL_TENSOR.V_MM_PRE_NORM: "mm.pre_norm",
|
||||
MODEL_TENSOR.V_TOK_IMG_BEGIN: "mm.image_begin",
|
||||
MODEL_TENSOR.V_TOK_IMG_END: "mm.image_end",
|
||||
MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
|
||||
MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
|
||||
# DeepSeek-OCR SAM
|
||||
@@ -1393,6 +1399,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.V_MM_GATE,
|
||||
MODEL_TENSOR.V_TOK_BOI,
|
||||
MODEL_TENSOR.V_TOK_EOI,
|
||||
MODEL_TENSOR.V_MM_PRE_NORM,
|
||||
MODEL_TENSOR.V_TOK_IMG_BEGIN,
|
||||
MODEL_TENSOR.V_TOK_IMG_END,
|
||||
MODEL_TENSOR.V_STD_BIAS,
|
||||
MODEL_TENSOR.V_STD_SCALE,
|
||||
MODEL_TENSOR.V_SAM_POS_EMBD,
|
||||
@@ -4113,6 +4122,7 @@ class VisionProjectorType:
|
||||
GLM4V = "glm4v"
|
||||
YOUTUVL = "youtuvl"
|
||||
NEMOTRON_V2_VL = "nemotron_v2_vl"
|
||||
HUNYUANOCR = "hunyuanocr"
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -1359,6 +1359,7 @@ class TensorNameMap:
|
||||
"visual.merger.mlp.{bid}", # qwen2vl
|
||||
"mlp_AR.linear_{bid}", # PaddleOCR-VL
|
||||
"merger.mlp.{bid}",
|
||||
"vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_FC: (
|
||||
@@ -1366,6 +1367,7 @@ class TensorNameMap:
|
||||
"model.vision.linear_proj.linear_proj", # cogvlm
|
||||
"model.projector.layers", # Deepseek-OCR
|
||||
"visual.merger.proj", # glm4v
|
||||
"vit.perceive.mlp", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_MLP: (
|
||||
@@ -1393,6 +1395,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
|
||||
"vpm.embeddings.patch_embedding",
|
||||
"model.vision_model.embeddings.patch_embedding", # SmolVLM
|
||||
"vit.embeddings.patch_embedding", # HunyuanOCR
|
||||
"vision_tower.patch_conv", # pixtral-hf
|
||||
"vision_encoder.patch_conv", # pixtral
|
||||
"vision_model.patch_embedding.linear", # llama 4
|
||||
@@ -1414,6 +1417,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
|
||||
"vpm.embeddings.position_embedding",
|
||||
"model.vision_model.embeddings.position_embedding", # SmolVLM
|
||||
"vit.embeddings.position_embedding", # HunyuanOCR
|
||||
"vision_model.positional_embedding_vlm", # llama 4
|
||||
"vision_tower.patch_embed.pos_emb", # kimi-vl
|
||||
"visual.pos_embed", # qwen3vl
|
||||
@@ -1425,10 +1429,12 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
|
||||
"model.image_newline", # Deepseek-OCR
|
||||
"vit.perceive.image_newline", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_VSEP: (
|
||||
"model.view_seperator", # Deepseek-OCR
|
||||
"vit.perceive.image_sep", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_QKV: (
|
||||
@@ -1444,6 +1450,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.q_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
|
||||
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
|
||||
@@ -1466,6 +1473,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
|
||||
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
|
||||
@@ -1488,6 +1496,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
|
||||
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
|
||||
@@ -1504,6 +1513,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.layer_norm1",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
|
||||
"vit.layers.{bid}.input_layernorm", # HunyuanOCR
|
||||
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
|
||||
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
|
||||
@@ -1521,6 +1531,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.self_attn.out_proj",
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
|
||||
"vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
|
||||
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
|
||||
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
|
||||
@@ -1540,6 +1551,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.layer_norm2",
|
||||
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
|
||||
"vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
|
||||
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
|
||||
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
|
||||
@@ -1557,6 +1569,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.mlp.fc1",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
|
||||
"vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
|
||||
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
|
||||
@@ -1583,6 +1596,7 @@ class TensorNameMap:
|
||||
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
|
||||
"vpm.encoder.layers.{bid}.mlp.fc2",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
|
||||
"vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
|
||||
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
|
||||
"vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
|
||||
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
|
||||
@@ -1639,6 +1653,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_MM_POST_NORM: (
|
||||
"visual.merger.post_projection_norm", # glm4v
|
||||
"vit.perceive.after_rms", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_INP_PROJ: (
|
||||
@@ -1806,6 +1821,18 @@ class TensorNameMap:
|
||||
"model.vision.eoi", # cogvlm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_PRE_NORM: (
|
||||
"vit.perceive.before_rms", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_TOK_IMG_BEGIN: (
|
||||
"vit.perceive.image_begin", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_TOK_IMG_END: (
|
||||
"vit.perceive.image_end", # HunyuanOCR
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_STD_BIAS: (
|
||||
"model.vision_tower.std_bias", # gemma4
|
||||
),
|
||||
|
||||
@@ -0,0 +1,282 @@
|
||||
{%- macro format_parameters(properties, required) -%}
|
||||
{%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
|
||||
{%- set ns = namespace(found_first=false) -%}
|
||||
{%- for key, value in properties | dictsort -%}
|
||||
{%- set add_comma = false -%}
|
||||
{%- if key not in standard_keys -%}
|
||||
{%- if ns.found_first %},{% endif -%}
|
||||
{%- set ns.found_first = true -%}
|
||||
{{ key }}:{
|
||||
{%- if value['description'] -%}
|
||||
description:<|"|>{{ value['description'] }}<|"|>
|
||||
{%- set add_comma = true -%}
|
||||
{%- endif -%}
|
||||
{%- if value['nullable'] %}
|
||||
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
|
||||
nullable:true
|
||||
{%- endif -%}
|
||||
{%- if value['type'] | upper == 'STRING' -%}
|
||||
{%- if value['enum'] -%}
|
||||
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
|
||||
enum:{{ format_argument(value['enum']) }}
|
||||
{%- endif -%}
|
||||
{%- elif value['type'] | upper == 'OBJECT' -%}
|
||||
,properties:{
|
||||
{%- if value['properties'] is defined and value['properties'] is mapping -%}
|
||||
{{- format_parameters(value['properties'], value['required'] | default([])) -}}
|
||||
{%- elif value is mapping -%}
|
||||
{{- format_parameters(value, value['required'] | default([])) -}}
|
||||
{%- endif -%}
|
||||
}
|
||||
{%- if value['required'] -%}
|
||||
,required:[
|
||||
{%- for item in value['required'] | default([]) -%}
|
||||
<|"|>{{- item -}}<|"|>
|
||||
{%- if not loop.last %},{% endif -%}
|
||||
{%- endfor -%}
|
||||
]
|
||||
{%- endif -%}
|
||||
{%- elif value['type'] | upper == 'ARRAY' -%}
|
||||
{%- if value['items'] is mapping and value['items'] -%}
|
||||
,items:{
|
||||
{%- set ns_items = namespace(found_first=false) -%}
|
||||
{%- for item_key, item_value in value['items'] | dictsort -%}
|
||||
{%- if item_value is not none -%}
|
||||
{%- if ns_items.found_first %},{% endif -%}
|
||||
{%- set ns_items.found_first = true -%}
|
||||
{%- if item_key == 'properties' -%}
|
||||
properties:{
|
||||
{%- if item_value is mapping -%}
|
||||
{{- format_parameters(item_value, value['items']['required'] | default([])) -}}
|
||||
{%- endif -%}
|
||||
}
|
||||
{%- elif item_key == 'required' -%}
|
||||
required:[
|
||||
{%- for req_item in item_value -%}
|
||||
<|"|>{{- req_item -}}<|"|>
|
||||
{%- if not loop.last %},{% endif -%}
|
||||
{%- endfor -%}
|
||||
]
|
||||
{%- elif item_key == 'type' -%}
|
||||
{%- if item_value is string -%}
|
||||
type:{{ format_argument(item_value | upper) }}
|
||||
{%- else -%}
|
||||
type:{{ format_argument(item_value | map('upper') | list) }}
|
||||
{%- endif -%}
|
||||
{%- else -%}
|
||||
{{ item_key }}:{{ format_argument(item_value) }}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
|
||||
type:<|"|>{{ value['type'] | upper }}<|"|>}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endmacro -%}
|
||||
{%- macro format_function_declaration(tool_data) -%}
|
||||
declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
|
||||
{%- set params = tool_data['function']['parameters'] -%}
|
||||
{%- if params -%}
|
||||
,parameters:{
|
||||
{%- if params['properties'] -%}
|
||||
properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
|
||||
{%- endif -%}
|
||||
{%- if params['required'] -%}
|
||||
required:[
|
||||
{%- for item in params['required'] -%}
|
||||
<|"|>{{- item -}}<|"|>
|
||||
{{- ',' if not loop.last -}}
|
||||
{%- endfor -%}
|
||||
],
|
||||
{%- endif -%}
|
||||
{%- if params['type'] -%}
|
||||
type:<|"|>{{- params['type'] | upper -}}<|"|>}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{%- if 'response' in tool_data['function'] -%}
|
||||
{%- set response_declaration = tool_data['function']['response'] -%}
|
||||
,response:{
|
||||
{%- if response_declaration['description'] -%}
|
||||
description:<|"|>{{- response_declaration['description'] -}}<|"|>,
|
||||
{%- endif -%}
|
||||
{%- if response_declaration['type'] | upper == 'OBJECT' -%}
|
||||
type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
}
|
||||
{%- endmacro -%}
|
||||
{%- macro format_argument(argument, escape_keys=True) -%}
|
||||
{%- if argument is string -%}
|
||||
{{- '<|"|>' + argument + '<|"|>' -}}
|
||||
{%- elif argument is boolean -%}
|
||||
{{- 'true' if argument else 'false' -}}
|
||||
{%- elif argument is mapping -%}
|
||||
{{- '{' -}}
|
||||
{%- set ns = namespace(found_first=false) -%}
|
||||
{%- for key, value in argument | dictsort -%}
|
||||
{%- if ns.found_first %},{% endif -%}
|
||||
{%- set ns.found_first = true -%}
|
||||
{%- if escape_keys -%}
|
||||
{{- '<|"|>' + key + '<|"|>' -}}
|
||||
{%- else -%}
|
||||
{{- key -}}
|
||||
{%- endif -%}
|
||||
:{{- format_argument(value, escape_keys=escape_keys) -}}
|
||||
{%- endfor -%}
|
||||
{{- '}' -}}
|
||||
{%- elif argument is sequence -%}
|
||||
{{- '[' -}}
|
||||
{%- for item in argument -%}
|
||||
{{- format_argument(item, escape_keys=escape_keys) -}}
|
||||
{%- if not loop.last %},{% endif -%}
|
||||
{%- endfor -%}
|
||||
{{- ']' -}}
|
||||
{%- else -%}
|
||||
{{- argument -}}
|
||||
{%- endif -%}
|
||||
{%- endmacro -%}
|
||||
{%- macro strip_thinking(text) -%}
|
||||
{%- set ns = namespace(result='') -%}
|
||||
{%- for part in text.split('<channel|>') -%}
|
||||
{%- if '<|channel>' in part -%}
|
||||
{%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
|
||||
{%- else -%}
|
||||
{%- set ns.result = ns.result + part -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{{- ns.result | trim -}}
|
||||
{%- endmacro -%}
|
||||
|
||||
{%- set ns = namespace(prev_message_type=None, last_user_message=-1) -%}
|
||||
{%- set loop_messages = messages -%}
|
||||
{{ bos_token }}
|
||||
{#- Handle System/Tool Definitions Block -#}
|
||||
{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
|
||||
{{- '<|turn>system\n' -}}
|
||||
|
||||
{#- Inject Thinking token at the very top of the FIRST system turn -#}
|
||||
{%- if enable_thinking is defined and enable_thinking -%}
|
||||
{{- '<|think|>' -}}
|
||||
{%- set ns.prev_message_type = 'think' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if messages[0]['role'] in ['system', 'developer'] -%}
|
||||
{{- messages[0]['content'] | trim -}}
|
||||
{%- set loop_messages = messages[1:] -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if tools -%}
|
||||
{%- for tool in tools %}
|
||||
{{- '<|tool>' -}}
|
||||
{{- format_function_declaration(tool) | trim -}}
|
||||
{{- '<tool|>' -}}
|
||||
{%- endfor %}
|
||||
{%- set ns.prev_message_type = 'tool' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{{- '<turn|>\n' -}}
|
||||
{%- endif %}
|
||||
|
||||
{#- Find last user message -#}
|
||||
{%- for message in loop_messages -%}
|
||||
{%- if message['role'] == 'user' -%}
|
||||
{%- set ns.last_user_message = loop.index0 -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{#- Loop through messages -#}
|
||||
{%- for message in loop_messages -%}
|
||||
{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
|
||||
{%- if not (ns.prev_message_type == 'tool_response' and message['tool_calls']) -%}
|
||||
{{- '<|turn>' + role + '\n' }}
|
||||
{%- endif -%}
|
||||
|
||||
{%- set ns.prev_message_type = None -%}
|
||||
|
||||
{%- if message['tool_calls'] -%}
|
||||
{#- Preserve reasoning between tool calls for model turns that come after the last user turn -#}
|
||||
{%- if message['reasoning_content'] and loop.index0 > ns.last_user_message -%}
|
||||
{{- '<|channel>thought\n' -}}
|
||||
{{- message['reasoning_content'] -}}
|
||||
{{- '<channel|>' -}}
|
||||
{%- endif -%}
|
||||
{%- for tool_call in message['tool_calls'] -%}
|
||||
{%- set function = tool_call['function'] -%}
|
||||
{{- '<|tool_call>call:' + function['name'] + '{' -}}
|
||||
{%- if function['arguments'] is mapping -%}
|
||||
{%- set ns_args = namespace(found_first=false) -%}
|
||||
{%- for key, value in function['arguments'] | dictsort -%}
|
||||
{%- if ns_args.found_first %},{% endif -%}
|
||||
{%- set ns_args.found_first = true -%}
|
||||
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
|
||||
{%- endfor -%}
|
||||
{%- elif function['arguments'] is string -%}
|
||||
{{- function['arguments'] -}}
|
||||
{%- endif -%}
|
||||
{{- '}<tool_call|>' -}}
|
||||
{%- endfor -%}
|
||||
{%- set ns.prev_message_type = 'tool_call' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if message['tool_responses'] -%}
|
||||
{#- Tool Response handling -#}
|
||||
{%- for tool_response in message['tool_responses'] -%}
|
||||
{{- '<|tool_response>' -}}
|
||||
{%- if tool_response['response'] is mapping -%}
|
||||
{{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
|
||||
{%- for key, value in tool_response['response'] | dictsort -%}
|
||||
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
|
||||
{%- if not loop.last %},{% endif -%}
|
||||
{%- endfor -%}
|
||||
{{- '}' -}}
|
||||
{%- else -%}
|
||||
{{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
|
||||
{%- endif -%}
|
||||
{{- '<tool_response|>' -}}
|
||||
{%- endfor -%}
|
||||
{%- set ns.prev_message_type = 'tool_response' -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if message['content'] is string -%}
|
||||
{%- if role == 'model' -%}
|
||||
{{- strip_thinking(message['content']) -}}
|
||||
{%- else -%}
|
||||
{{- message['content'] | trim -}}
|
||||
{%- endif -%}
|
||||
{%- elif message['content'] is sequence -%}
|
||||
{%- for item in message['content'] -%}
|
||||
{%- if item['type'] == 'text' -%}
|
||||
{%- if role == 'model' -%}
|
||||
{{- strip_thinking(item['text']) -}}
|
||||
{%- else -%}
|
||||
{{- item['text'] | trim -}}
|
||||
{%- endif -%}
|
||||
{%- elif item['type'] == 'image' -%}
|
||||
{{- '\n\n<|image|>\n\n' -}}
|
||||
{%- set ns.prev_message_type = 'image' -%}
|
||||
{%- elif item['type'] == 'audio' -%}
|
||||
{{- '<|audio|>' -}}
|
||||
{%- set ns.prev_message_type = 'audio' -%}
|
||||
{%- elif item['type'] == 'video' -%}
|
||||
{{- '\n\n<|video|>\n\n' -}}
|
||||
{%- set ns.prev_message_type = 'video' -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endif -%}
|
||||
|
||||
{%- if not (message['tool_responses'] and not message['content']) -%}
|
||||
{{- '<turn|>\n' -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
{%- if add_generation_prompt -%}
|
||||
{%- if ns.prev_message_type != 'tool_response' -%}
|
||||
{{- '<|turn>model\n' -}}
|
||||
{%- endif -%}
|
||||
{%- if not enable_thinking | default(false) -%}
|
||||
{{- '<|channel>thought\n<channel|>' -}}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
|
||||
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
|
||||
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
|
||||
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
|
||||
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
||||
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
|
||||
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
|
||||
@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
|
||||
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
|
||||
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
|
||||
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) {
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
|
||||
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
|
||||
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
|
||||
@@ -822,6 +825,22 @@ int32_t llm_chat_apply_template(
|
||||
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
|
||||
// tencent/HunyuanOCR
|
||||
ss << "<|hy_begin▁of▁sentence|>";
|
||||
for (size_t i = 0; i < chat.size(); i++) {
|
||||
std::string role(chat[i]->role);
|
||||
if (i == 0 && role == "system") {
|
||||
ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
|
||||
continue;
|
||||
}
|
||||
|
||||
if (role == "user") {
|
||||
ss << chat[i]->content << "<|hy_User|>";
|
||||
} else if (role == "assistant") {
|
||||
ss << chat[i]->content << "<|hy_Assistant|>";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
|
||||
// moonshotai/Kimi-K2-Instruct
|
||||
for (auto message : chat) {
|
||||
|
||||
@@ -53,6 +53,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
|
||||
LLM_CHAT_TEMPLATE_OPENAI_MOE,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
|
||||
LLM_CHAT_TEMPLATE_KIMI_K2,
|
||||
LLM_CHAT_TEMPLATE_SEED_OSS,
|
||||
LLM_CHAT_TEMPLATE_GROK_2,
|
||||
|
||||
@@ -1279,6 +1279,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH_PER_LAYER, hparams.n_embd_per_layer);
|
||||
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 35: type = LLM_TYPE_E2B; break;
|
||||
|
||||
@@ -2551,6 +2551,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<|end_of_text|>"
|
||||
|| t.first == "<end_of_utterance>" // smoldocling
|
||||
|| t.first == "<turn|>" // gemma4
|
||||
|| t.first == "<|tool_response>" // gemma4
|
||||
|| t.first == "<|end▁of▁sentence|>" // deepseek-ocr
|
||||
) {
|
||||
special_eog_ids.insert(t.second);
|
||||
|
||||
@@ -753,6 +753,35 @@ static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string &
|
||||
return bpe_offsets;
|
||||
}
|
||||
|
||||
// regex: [^\n]+|[\n]+
|
||||
// splits text into runs of non-newline characters and runs of newline characters
|
||||
static std::vector<size_t> unicode_regex_split_custom_newlines(const std::string & text, const std::vector<size_t> & offsets) {
|
||||
std::vector<size_t> bpe_offsets;
|
||||
bpe_offsets.reserve(offsets.size());
|
||||
|
||||
const auto cpts = unicode_cpts_from_utf8(text);
|
||||
|
||||
size_t start = 0;
|
||||
for (auto offset : offsets) {
|
||||
const size_t offset_ini = start;
|
||||
const size_t offset_end = start + offset;
|
||||
assert(offset_end <= cpts.size());
|
||||
start = offset_end;
|
||||
|
||||
size_t pos = offset_ini;
|
||||
while (pos < offset_end) {
|
||||
const bool is_newline = (cpts[pos] == '\n');
|
||||
const size_t run_start = pos;
|
||||
while (pos < offset_end && (cpts[pos] == '\n') == is_newline) {
|
||||
pos++;
|
||||
}
|
||||
bpe_offsets.push_back(pos - run_start);
|
||||
}
|
||||
}
|
||||
|
||||
return bpe_offsets;
|
||||
}
|
||||
|
||||
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
||||
std::vector<size_t> bpe_offsets;
|
||||
|
||||
@@ -769,6 +798,8 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
||||
} else if (regex_expr == "\\p{AFMoE_digits}") {
|
||||
// AFMOE digit pattern - use custom implementation for proper splitting
|
||||
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
|
||||
} else if (regex_expr == "[^\\n]+|[\\n]+") {
|
||||
bpe_offsets = unicode_regex_split_custom_newlines(text, offsets);
|
||||
} else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") {
|
||||
// tiny_aya digit grouping pattern from tokenizer.json:
|
||||
// {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
|
||||
|
||||
+187
-43
@@ -657,6 +657,66 @@ static common_chat_tool imaginary_number_tool{
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool nullable_string_tool{
|
||||
/* .name = */ "set_nullable_str",
|
||||
/* .description = */ "Set a nullable string value",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": ["string", "null"],
|
||||
"description": "A nullable string"
|
||||
}
|
||||
},
|
||||
"required": ["name"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool nullable_string_null_first_tool{
|
||||
/* .name = */ "set_nullable_str_nf",
|
||||
/* .description = */ "Set a nullable string value with null first in type array",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": ["null", "string"],
|
||||
"description": "A nullable string with null first"
|
||||
}
|
||||
},
|
||||
"required": ["name"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool nullable_int_tool{
|
||||
/* .name = */ "set_nullable_int",
|
||||
/* .description = */ "Set a nullable integer value",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"count": {
|
||||
"type": ["integer", "null"],
|
||||
"description": "A nullable integer"
|
||||
}
|
||||
},
|
||||
"required": ["count"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool enum_no_type_tool{
|
||||
/* .name = */ "set_unit",
|
||||
/* .description = */ "Set a temperature unit",
|
||||
/* .parameters = */ R"({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"unit": {
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
"description": "Temperature unit"
|
||||
}
|
||||
},
|
||||
"required": ["unit"]
|
||||
})",
|
||||
};
|
||||
|
||||
static common_chat_tool string_param_tool{
|
||||
/* .name = */ "string_param",
|
||||
/* .description = */ "Tool with string parameter for testing",
|
||||
@@ -1916,10 +1976,24 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
{
|
||||
// Google Gemma 4 (tool calling with Gemma4 dict format)
|
||||
auto tst = peg_tester("models/templates/gemma4.jinja");
|
||||
auto tst = peg_tester("models/templates/google-gemma-4-31B-it.jinja");
|
||||
|
||||
tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).run();
|
||||
|
||||
// Reasoning and content
|
||||
tst.test(
|
||||
"<|channel>thought\nI'm\nthinking<channel|>Hello, world!\nWhat's up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.expect(message_assist_thoughts)
|
||||
.run();
|
||||
|
||||
// Reasoning and content with reasoning_format = none
|
||||
tst.test(
|
||||
"<|channel>thought\nI'm\nthinking<channel|>Hello, world!\nWhat's up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_NONE)
|
||||
.expect_content("<|channel>thought\nI'm\nthinking<channel|>Hello, world!\nWhat's up?")
|
||||
.run();
|
||||
|
||||
// Simple tool call with string argument
|
||||
tst.test(
|
||||
"<|tool_call>call:get_time{city:<|\"|>London<|\"|>}<tool_call|>")
|
||||
@@ -2200,6 +2274,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
}
|
||||
})
|
||||
.run();
|
||||
|
||||
}
|
||||
|
||||
{
|
||||
@@ -2383,6 +2458,58 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// nullable string type ["string", "null"]
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=set_nullable_str>\n"
|
||||
"<parameter=name>\nhello world\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.tools({ nullable_string_tool })
|
||||
.expect_tool_calls({
|
||||
{ "set_nullable_str", R"({"name": "hello world"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// nullable string with null first in type array ["null", "string"]
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=set_nullable_str_nf>\n"
|
||||
"<parameter=name>\nhello world\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.tools({ nullable_string_null_first_tool })
|
||||
.expect_tool_calls({
|
||||
{ "set_nullable_str_nf", R"({"name": "hello world"})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// nullable integer type ["integer", "null"] - should use JSON value path, not string
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=set_nullable_int>\n"
|
||||
"<parameter=count>\n42\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.tools({ nullable_int_tool })
|
||||
.expect_tool_calls({
|
||||
{ "set_nullable_int", R"({"count": 42})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// enum without explicit type key - should infer string from enum values
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"<function=set_unit>\n"
|
||||
"<parameter=unit>\ncelsius\n</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.tools({ enum_no_type_tool })
|
||||
.expect_tool_calls({
|
||||
{ "set_unit", R"({"unit": "celsius"})", {} },
|
||||
})
|
||||
.run();
|
||||
}
|
||||
{
|
||||
auto tst = peg_tester("models/templates/deepseek-ai-DeepSeek-V3.1.jinja", detailed_debug);
|
||||
@@ -2541,55 +2668,57 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// #20424 introduced effective_input = generation_prompt + input, but the throw
|
||||
// uses input.substr(result.end) where result.end is in effective_input space.
|
||||
{
|
||||
auto tmpls = common_chat_templates_ptr(
|
||||
common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja")));
|
||||
if (!g_template_filter.empty() && std::string("models/templates/GLM-4.7-Flash.jinja").find(g_template_filter) != std::string::npos) {
|
||||
auto tmpls = common_chat_templates_ptr(
|
||||
common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja")));
|
||||
|
||||
static common_chat_tool weather_tool{
|
||||
"get_weather", "Get weather",
|
||||
R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})",
|
||||
};
|
||||
static common_chat_tool weather_tool{
|
||||
"get_weather", "Get weather",
|
||||
R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})",
|
||||
};
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.tools = { weather_tool };
|
||||
inputs.enable_thinking = true;
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.use_jinja = true;
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = "get_weather";
|
||||
inputs.messages = { msg };
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.tools = { weather_tool };
|
||||
inputs.enable_thinking = true;
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.use_jinja = true;
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = "get_weather";
|
||||
inputs.messages = { msg };
|
||||
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
common_peg_arena arena;
|
||||
arena.load(params.parser);
|
||||
common_chat_parser_params pp(params);
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
common_peg_arena arena;
|
||||
arena.load(params.parser);
|
||||
common_chat_parser_params pp(params);
|
||||
|
||||
// generation_prompt is non-empty for thinking models, so result.end
|
||||
// will be offset by generation_prompt.size() into effective_input space.
|
||||
assert(!pp.generation_prompt.empty());
|
||||
// generation_prompt is non-empty for thinking models, so result.end
|
||||
// will be offset by generation_prompt.size() into effective_input space.
|
||||
assert(!pp.generation_prompt.empty());
|
||||
|
||||
std::string bad_input =
|
||||
"Thinking.\n"
|
||||
"</think>"
|
||||
"<tool_call>get_weather"
|
||||
"<arg_key>city</arg_key><arg_value>Tokyo</arg_value>"
|
||||
"</tool_call>\n";
|
||||
std::string bad_input =
|
||||
"Thinking.\n"
|
||||
"</think>"
|
||||
"<tool_call>get_weather"
|
||||
"<arg_key>city</arg_key><arg_value>Tokyo</arg_value>"
|
||||
"</tool_call>\n";
|
||||
|
||||
bool got_runtime_error = false;
|
||||
bool got_out_of_range = false;
|
||||
std::string error_msg;
|
||||
try {
|
||||
common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp);
|
||||
} catch (const std::out_of_range & e) {
|
||||
got_out_of_range = true;
|
||||
error_msg = e.what();
|
||||
} catch (const std::runtime_error & e) {
|
||||
got_runtime_error = true;
|
||||
error_msg = e.what();
|
||||
bool got_runtime_error = false;
|
||||
bool got_out_of_range = false;
|
||||
std::string error_msg;
|
||||
try {
|
||||
common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp);
|
||||
} catch (const std::out_of_range & e) {
|
||||
got_out_of_range = true;
|
||||
error_msg = e.what();
|
||||
} catch (const std::runtime_error & e) {
|
||||
got_runtime_error = true;
|
||||
error_msg = e.what();
|
||||
}
|
||||
GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)");
|
||||
GGML_ASSERT(got_runtime_error && "throw path should produce std::runtime_error with parse position");
|
||||
}
|
||||
GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)");
|
||||
GGML_ASSERT(got_runtime_error && "throw path should produce std::runtime_error with parse position");
|
||||
}
|
||||
|
||||
// Kimi-K2-Thinking tests - custom parser
|
||||
@@ -3169,6 +3298,21 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect(message_assist_call_id)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test("[TOOL_CALLS]special_function[CALL_ID]000000001[ARGS]{\"arg1\": 1}"
|
||||
"[TOOL_CALLS]special_function_with_opt[CALL_ID]000000002[ARGS]{\"arg1\": 1, \"arg2\": 2}")
|
||||
.parallel_tool_calls(true)
|
||||
.tools({
|
||||
special_function_tool, special_function_tool_with_optional_param
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", "000000001" },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", "000000002" },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
|
||||
}
|
||||
// Devstral
|
||||
{
|
||||
|
||||
+2
-2
@@ -176,8 +176,8 @@
|
||||
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
|
||||
| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
|
||||
| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
|
||||
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
|
||||
| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
|
||||
|
||||
@@ -255,8 +255,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
||||
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
|
||||
| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
|
||||
| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
|
||||
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ add_library(mtmd
|
||||
models/conformer.cpp
|
||||
models/gemma4v.cpp
|
||||
models/glm4v.cpp
|
||||
models/hunyuanocr.cpp
|
||||
models/internvl.cpp
|
||||
models/kimivl.cpp
|
||||
models/kimik25.cpp
|
||||
|
||||
@@ -148,6 +148,11 @@
|
||||
#define TN_TOK_BOI "v.boi"
|
||||
#define TN_TOK_EOI "v.eoi"
|
||||
|
||||
// hunyuanocr
|
||||
#define TN_MM_PRE_NORM "mm.pre_norm.%s"
|
||||
#define TN_TOK_IMG_BEGIN "mm.image_begin"
|
||||
#define TN_TOK_IMG_END "mm.image_end"
|
||||
|
||||
// deepseek-ocr
|
||||
#define TN_SAM_POS_EMBD "v.sam.pos_embd.%s"
|
||||
#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s"
|
||||
@@ -266,6 +271,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_YOUTUVL,
|
||||
PROJECTOR_TYPE_KIMIK25,
|
||||
PROJECTOR_TYPE_NEMOTRON_V2_VL,
|
||||
PROJECTOR_TYPE_HUNYUANOCR,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -306,6 +312,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
|
||||
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
|
||||
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
|
||||
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
||||
@@ -358,7 +358,8 @@ struct clip_model {
|
||||
// MINICPMV projection
|
||||
ggml_tensor * mm_model_pos_embed_k = nullptr;
|
||||
ggml_tensor * mm_model_query = nullptr;
|
||||
ggml_tensor * mm_model_proj = nullptr;
|
||||
ggml_tensor * mm_model_proj = nullptr;
|
||||
ggml_tensor * mm_model_proj_b = nullptr;
|
||||
ggml_tensor * mm_model_kv_proj = nullptr;
|
||||
ggml_tensor * mm_model_attn_q_w = nullptr;
|
||||
ggml_tensor * mm_model_attn_q_b = nullptr;
|
||||
@@ -419,6 +420,11 @@ struct clip_model {
|
||||
ggml_tensor * mm_boi = nullptr;
|
||||
ggml_tensor * mm_eoi = nullptr;
|
||||
|
||||
// hunyuanocr perceiver
|
||||
ggml_tensor * mm_pre_norm_w = nullptr;
|
||||
ggml_tensor * mm_img_begin = nullptr;
|
||||
ggml_tensor * mm_img_end = nullptr;
|
||||
|
||||
// deepseek ocr sam
|
||||
ggml_tensor * patch_embed_proj_w = nullptr;
|
||||
ggml_tensor * patch_embed_proj_b = nullptr;
|
||||
|
||||
@@ -902,6 +902,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
case PROJECTOR_TYPE_LDP:
|
||||
@@ -1408,6 +1412,14 @@ struct clip_model_loader {
|
||||
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
|
||||
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
hparams.n_merge = 2;
|
||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
|
||||
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
|
||||
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
|
||||
hparams.set_warmup_n_tokens(28*28);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
// audio preprocessing params
|
||||
@@ -2035,6 +2047,22 @@ struct clip_model_loader {
|
||||
model.mm_boi = get_tensor(TN_TOK_BOI);
|
||||
model.mm_eoi = get_tensor(TN_TOK_EOI);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
// proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
|
||||
model.mm_model_proj_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
|
||||
model.mm_pre_norm_w = get_tensor(string_format(TN_MM_PRE_NORM, "weight"));
|
||||
model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
|
||||
model.mm_img_begin = get_tensor(TN_TOK_IMG_BEGIN);
|
||||
model.mm_img_end = get_tensor(TN_TOK_IMG_END);
|
||||
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
|
||||
model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
{
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
@@ -2584,6 +2612,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->nx / params.patch_size) / 2;
|
||||
default:
|
||||
@@ -2768,6 +2797,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
|
||||
n_patches = h * (h + 1) + 1;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
int merge = ctx->model.hparams.n_merge;
|
||||
int ow = (img->nx / patch_size) / merge;
|
||||
int oh = (img->ny / patch_size) / merge;
|
||||
n_patches = (ow + 1) * oh + 2;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
|
||||
@@ -3175,6 +3211,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
case PROJECTOR_TYPE_PHI4:
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
// do nothing
|
||||
} break;
|
||||
@@ -3346,6 +3383,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
return ctx->model.mm_model_proj->ne[1];
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
return ctx->model.mm_4h_to_h_w->ne[1];
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
#include "models.h"
|
||||
|
||||
ggml_cgraph * clip_graph_hunyuanocr::build() {
|
||||
const int merge = hparams.n_merge;
|
||||
const int pw = n_patches_x;
|
||||
const int ph = n_patches_y;
|
||||
|
||||
ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
|
||||
|
||||
ggml_tensor * inp = build_inp();
|
||||
ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
|
||||
|
||||
// perceiver projector
|
||||
cur = build_norm(cur, model.mm_pre_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
|
||||
|
||||
// [C, W*H] -> [W, H, C] for conv2d
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_embd, pw, ph);
|
||||
cur = ggml_permute(ctx0, cur, 2, 0, 1, 3);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
|
||||
// Conv2d(1152->2304, k=2, s=2) + GELU + Conv2d(2304->4608, k=1, s=1)
|
||||
cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, merge, merge, 0, 0, 1, 1);
|
||||
if (model.mm_0_b) {
|
||||
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_0_b, 1, 1, model.mm_0_b->ne[0]));
|
||||
}
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 1, 1, 0, 0, 1, 1);
|
||||
if (model.mm_1_b) {
|
||||
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_1_b, 1, 1, model.mm_1_b->ne[0]));
|
||||
}
|
||||
|
||||
const int ow = pw / merge;
|
||||
const int oh = ph / merge;
|
||||
const int idim = (int)cur->ne[2]; // OC = 4608
|
||||
|
||||
// append newline along W (dim 0)
|
||||
ggml_tensor * nl = ggml_reshape_4d(ctx0, model.image_newline, 1, 1, idim, 1);
|
||||
nl = ggml_repeat_4d(ctx0, nl, 1, oh, idim, 1);
|
||||
cur = ggml_concat(ctx0, cur, nl, 0);
|
||||
|
||||
// [OW+1, OH, OC] -> [OC, (OW+1)*OH]
|
||||
cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
|
||||
cur = ggml_cont_2d(ctx0, cur, idim, (ow + 1) * oh);
|
||||
|
||||
// project to LLM hidden size
|
||||
cur = build_mm(model.mm_model_proj, cur);
|
||||
if (model.mm_model_proj_b) {
|
||||
cur = ggml_add(ctx0, cur, model.mm_model_proj_b);
|
||||
}
|
||||
|
||||
// wrap with begin/end tokens
|
||||
cur = ggml_concat(ctx0, ggml_reshape_2d(ctx0, model.mm_img_begin, model.mm_img_begin->ne[0], 1), cur, 1);
|
||||
cur = ggml_concat(ctx0, cur, ggml_reshape_2d(ctx0, model.mm_img_end, model.mm_img_end->ne[0], 1), 1);
|
||||
|
||||
cur = build_norm(cur, model.mm_post_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
return gf;
|
||||
}
|
||||
@@ -98,6 +98,11 @@ struct clip_graph_glm4v : clip_graph {
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_hunyuanocr : clip_graph {
|
||||
clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_mobilenetv5 : clip_graph {
|
||||
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
@@ -406,6 +406,13 @@ struct mtmd_context {
|
||||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
{
|
||||
// note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
|
||||
img_beg = "<|hy_place▁holder▁no▁100|>";
|
||||
img_end = "<|hy_place▁holder▁no▁101|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
|
||||
}
|
||||
|
||||
@@ -5,15 +5,15 @@
|
||||
#include "gguf.h"
|
||||
#include "jinja/runtime.h"
|
||||
#include "log.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <numeric>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "peg-parser.h"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
enum class output_mode {
|
||||
@@ -34,14 +34,14 @@ enum class input_message_type {
|
||||
};
|
||||
|
||||
struct debug_options {
|
||||
std::string template_path;
|
||||
bool with_tools = true;
|
||||
bool generation_prompt = true;
|
||||
bool enable_reasoning = true;
|
||||
bool debug_jinja = false;
|
||||
bool force_tool_call = false;
|
||||
output_mode mode = output_mode::BOTH;
|
||||
input_message_type input_message = input_message_type::NONE;
|
||||
std::string template_path;
|
||||
bool with_tools = true;
|
||||
bool generation_prompt = true;
|
||||
bool enable_reasoning = true;
|
||||
bool debug_jinja = false;
|
||||
bool force_tool_call = false;
|
||||
output_mode mode = output_mode::BOTH;
|
||||
input_message_type input_message = input_message_type::NONE;
|
||||
};
|
||||
|
||||
static std::string read_file(const std::string & path) {
|
||||
@@ -274,7 +274,7 @@ static void render_scenario(const common_chat_template & tmpl,
|
||||
json final_messages = messages;
|
||||
if (add_generation_prompt && !messages.empty() && messages.back().value("role", "") == "assistant") {
|
||||
final_messages.push_back(json{
|
||||
{ "role", "user" },
|
||||
{ "role", "user" },
|
||||
{ "content", "Now please continue with another response." }
|
||||
});
|
||||
}
|
||||
@@ -305,7 +305,7 @@ static void render_all_scenarios(const common_chat_template & tmpl,
|
||||
const json & tools,
|
||||
bool add_generation_prompt,
|
||||
bool enable_thinking,
|
||||
input_message_type message_type) {
|
||||
input_message_type message_type) {
|
||||
json user_msg = build_user_message();
|
||||
|
||||
auto render_if = [&](input_message_type type, const std::string & name, const json & assistant_msg) {
|
||||
@@ -335,6 +335,24 @@ static void render_all_scenarios(const common_chat_template & tmpl,
|
||||
}
|
||||
}
|
||||
|
||||
static autoparser::generation_params prepare_params(const debug_options & opts, const json & tools) {
|
||||
autoparser::generation_params params;
|
||||
params.messages = json::array({ build_user_message() });
|
||||
params.reasoning_format = opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;
|
||||
params.enable_thinking = opts.enable_reasoning;
|
||||
params.add_generation_prompt = opts.generation_prompt;
|
||||
|
||||
if (opts.with_tools) {
|
||||
params.tools = tools;
|
||||
params.tool_choice = opts.force_tool_call ? COMMON_CHAT_TOOL_CHOICE_REQUIRED : COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
} else {
|
||||
params.tools = json();
|
||||
params.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
}
|
||||
params.parallel_tool_calls = false;
|
||||
return params;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// Set log level to most verbose to capture all debug output
|
||||
common_log_set_verbosity_thold(99);
|
||||
@@ -369,49 +387,41 @@ int main(int argc, char ** argv) {
|
||||
try {
|
||||
common_chat_template chat_template(template_source, "", "");
|
||||
|
||||
// Build tools definition
|
||||
json tools = opts.with_tools ? build_tools_definition() : json();
|
||||
|
||||
// Render template scenarios if requested
|
||||
if (opts.input_message != input_message_type::NONE &&
|
||||
(opts.mode == output_mode::TEMPLATE || opts.mode == output_mode::BOTH)) {
|
||||
autoparser::generation_params params = prepare_params(opts, tools);
|
||||
common_chat_params parser_data;
|
||||
if (std::optional<common_chat_params> spec_tmpl =
|
||||
common_chat_try_specialized_template(chat_template, template_source, params)) {
|
||||
LOG_ERR("\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR(" TEMPLATE RENDERING OUTPUT\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR("This template uses a specialized parser, analysis results will not be available.");
|
||||
parser_data = *spec_tmpl;
|
||||
} else {
|
||||
// Render template scenarios if requested
|
||||
if (opts.input_message != input_message_type::NONE &&
|
||||
(opts.mode == output_mode::TEMPLATE || opts.mode == output_mode::BOTH)) {
|
||||
LOG_ERR("\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR(" TEMPLATE RENDERING OUTPUT\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
|
||||
render_all_scenarios(chat_template, tools, opts.generation_prompt, opts.enable_reasoning,
|
||||
opts.input_message);
|
||||
}
|
||||
|
||||
// Output analysis if requested
|
||||
if (opts.mode == output_mode::ANALYSIS || opts.mode == output_mode::BOTH) {
|
||||
LOG_ERR("\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR(" TEMPLATE ANALYSIS\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
|
||||
autoparser::autoparser analysis;
|
||||
analysis.analyze_template(chat_template);
|
||||
|
||||
// Generate Parser
|
||||
autoparser::generation_params params;
|
||||
params.messages = json::array({ build_user_message() });
|
||||
params.reasoning_format =
|
||||
opts.enable_reasoning ? COMMON_REASONING_FORMAT_DEEPSEEK : COMMON_REASONING_FORMAT_NONE;
|
||||
params.enable_thinking = opts.enable_reasoning;
|
||||
params.add_generation_prompt = opts.generation_prompt;
|
||||
|
||||
if (opts.with_tools) {
|
||||
params.tools = tools;
|
||||
params.tool_choice = opts.force_tool_call ? COMMON_CHAT_TOOL_CHOICE_REQUIRED : COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
} else {
|
||||
params.tools = json();
|
||||
params.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
render_all_scenarios(chat_template, tools, opts.generation_prompt, opts.enable_reasoning,
|
||||
opts.input_message);
|
||||
}
|
||||
params.parallel_tool_calls = false;
|
||||
|
||||
auto parser_data = autoparser::peg_generator::generate_parser(chat_template, params, analysis);
|
||||
// Output analysis if requested
|
||||
if (opts.mode == output_mode::ANALYSIS || opts.mode == output_mode::BOTH) {
|
||||
LOG_ERR("\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
LOG_ERR(" TEMPLATE ANALYSIS\n");
|
||||
LOG_ERR("================================================================================\n");
|
||||
|
||||
autoparser::autoparser analysis;
|
||||
analysis.analyze_template(chat_template);
|
||||
|
||||
// Generate Parser
|
||||
parser_data = autoparser::peg_generator::generate_parser(chat_template, params, analysis);
|
||||
}
|
||||
|
||||
LOG_ERR("\n=== Generated Parser ===\n");
|
||||
common_peg_arena arena;
|
||||
|
||||
@@ -167,6 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
|
||||
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
|
||||
| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
|
||||
| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
|
||||
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
||||
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
|
||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||
@@ -221,8 +222,8 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
|
||||
| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
|
||||
| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
|
||||
| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
|
||||
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
|
||||
|
||||
@@ -155,8 +155,8 @@ struct server_slot {
|
||||
int64_t t_start_process_prompt;
|
||||
int64_t t_start_generation;
|
||||
|
||||
double t_prompt_processing; // ms
|
||||
double t_token_generation; // ms
|
||||
double t_prompt_processing = 0.0; // ms
|
||||
double t_token_generation = 0.0; // ms
|
||||
|
||||
std::function<void(int /* id_slot */)> callback_on_release;
|
||||
|
||||
@@ -605,6 +605,17 @@ private:
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
void slot_save_and_clear(server_slot & slot) {
|
||||
if (slot.prompt.n_tokens() == 0) {
|
||||
return;
|
||||
}
|
||||
SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
|
||||
SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
|
||||
slot.prompt_save(*prompt_cache);
|
||||
slot.prompt_clear(false);
|
||||
prompt_cache->update();
|
||||
}
|
||||
|
||||
void handle_sleeping_state(bool new_state) {
|
||||
GGML_ASSERT(sleeping != new_state);
|
||||
if (new_state) {
|
||||
@@ -864,6 +875,19 @@ private:
|
||||
|
||||
metrics.init();
|
||||
|
||||
if (params_base.clear_idle) {
|
||||
if (!params_base.kv_unified) {
|
||||
SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
|
||||
params_base.clear_idle = false;
|
||||
} else if (params_base.cache_ram_mib == 0) {
|
||||
SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
|
||||
params_base.clear_idle = false;
|
||||
} else {
|
||||
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
|
||||
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
|
||||
}
|
||||
}
|
||||
|
||||
// populate webui settings
|
||||
{
|
||||
if (!params_base.webui_config_json.empty()) {
|
||||
@@ -1010,15 +1034,15 @@ private:
|
||||
// cache prompts only for completion tasks
|
||||
update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
|
||||
|
||||
// don't update the cache if the slot's context is empty
|
||||
update_cache = update_cache && tokens.size() > 0;
|
||||
|
||||
if (update_cache) {
|
||||
SRV_WRN("%s", "updating prompt cache\n");
|
||||
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
ret->prompt_save(*prompt_cache);
|
||||
// don't save the slot's state if its context is empty
|
||||
if (tokens.size() > 0) {
|
||||
ret->prompt_save(*prompt_cache);
|
||||
}
|
||||
|
||||
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
|
||||
ret->prompt_clear(false);
|
||||
@@ -1692,9 +1716,7 @@ private:
|
||||
const int id_slot = task.id_slot;
|
||||
const int id_task = task.id;
|
||||
|
||||
server_slot * slot = id_slot != -1
|
||||
? get_slot_by_id(id_slot)
|
||||
: get_available_slot(task);
|
||||
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
|
||||
|
||||
//
|
||||
// slot scheduling logic
|
||||
@@ -1731,6 +1753,14 @@ private:
|
||||
SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
|
||||
break; // drop the task
|
||||
}
|
||||
|
||||
if (params_base.clear_idle) {
|
||||
for (auto & s : slots) {
|
||||
if (!s.is_processing()) {
|
||||
slot_save_and_clear(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case SERVER_TASK_TYPE_CANCEL:
|
||||
{
|
||||
|
||||
@@ -2008,7 +2008,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
|
||||
bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) {
|
||||
const int lcp_best = prompt.tokens.get_common_prefix(tokens_new);
|
||||
|
||||
float f_keep_best = float(lcp_best) / prompt.tokens.size();
|
||||
float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins
|
||||
float sim_best = float(lcp_best) / tokens_new.size();
|
||||
|
||||
SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
|
||||
|
||||
@@ -261,14 +261,14 @@ struct result_timings {
|
||||
int32_t cache_n = -1;
|
||||
|
||||
int32_t prompt_n = -1;
|
||||
double prompt_ms;
|
||||
double prompt_per_token_ms;
|
||||
double prompt_per_second;
|
||||
double prompt_ms = 0.0;
|
||||
double prompt_per_token_ms = 0.0;
|
||||
double prompt_per_second = 0.0;
|
||||
|
||||
int32_t predicted_n = -1;
|
||||
double predicted_ms;
|
||||
double predicted_per_token_ms;
|
||||
double predicted_per_second;
|
||||
double predicted_ms = 0.0;
|
||||
double predicted_per_token_ms = 0.0;
|
||||
double predicted_per_second = 0.0;
|
||||
|
||||
// Optional speculative metrics - only included when > 0
|
||||
int32_t draft_n = 0;
|
||||
|
||||
@@ -108,10 +108,8 @@ int main(int argc, char ** argv) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("build_info: %s\n", build_info.c_str());
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
|
||||
server_http_context ctx_http;
|
||||
if (!ctx_http.init(params)) {
|
||||
|
||||
@@ -0,0 +1,115 @@
|
||||
import os
|
||||
import tempfile
|
||||
import pytest
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
class LogReader:
|
||||
def __init__(self, path):
|
||||
self.path = path
|
||||
self.pos = 0
|
||||
def drain(self):
|
||||
with open(self.path) as f:
|
||||
f.seek(self.pos)
|
||||
content = f.read()
|
||||
self.pos = f.tell()
|
||||
return content
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
server.n_slots = 2
|
||||
server.n_predict = 4
|
||||
server.temperature = 0.0
|
||||
server.server_slots = True
|
||||
server.cache_ram = 100
|
||||
server.kv_unified = True
|
||||
server.debug = True
|
||||
fd, server.log_path = tempfile.mkstemp(suffix='.log')
|
||||
os.close(fd)
|
||||
yield
|
||||
|
||||
|
||||
LONG_PROMPT = (
|
||||
"Once upon a time in a land far away, there lived a brave knight "
|
||||
"who traveled across mountains and rivers to find the legendary "
|
||||
"golden sword hidden deep within the enchanted forest of whispers. "
|
||||
"He met many creatures along the way including dragons and fairies "
|
||||
"and wizards who helped him on his noble quest to save the kingdom."
|
||||
)
|
||||
|
||||
|
||||
# idle slot cleared on launch should restore from cache-ram
|
||||
def test_clear_and_restore():
|
||||
global server
|
||||
server.start()
|
||||
log = LogReader(server.log_path)
|
||||
|
||||
# verify feature is enabled
|
||||
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain()
|
||||
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT,
|
||||
"id_slot": 0,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
original_prompt_n = res.body["timings"]["prompt_n"]
|
||||
|
||||
# Slot 0 is the only slot with KV — should NOT be cleared
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
|
||||
|
||||
# Launching slot 1 clears idle slot 0
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "The quick brown fox",
|
||||
"id_slot": 1,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain()
|
||||
|
||||
# Re-send same prompt — should restore from cache-ram
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "updating prompt cache" in log.drain()
|
||||
assert res.body["timings"]["cache_n"] > 0
|
||||
assert res.body["timings"]["prompt_n"] < original_prompt_n
|
||||
|
||||
# Follow-up — slot 0 kept its KV, no clearing needed
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT + " The knight finally reached the castle gates.",
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
|
||||
|
||||
|
||||
def test_disabled_with_flag():
|
||||
global server
|
||||
server.no_clear_idle = True
|
||||
server.start()
|
||||
log = LogReader(server.log_path)
|
||||
|
||||
# Feature should not be enabled
|
||||
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain()
|
||||
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT,
|
||||
"id_slot": 0,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
|
||||
# Request on different slot — should NOT trigger clearing
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "The quick brown fox",
|
||||
"id_slot": 1,
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
|
||||
@@ -102,6 +102,9 @@ class ServerProcess:
|
||||
mmproj_url: str | None = None
|
||||
media_path: str | None = None
|
||||
sleep_idle_seconds: int | None = None
|
||||
cache_ram: int | None = None
|
||||
no_clear_idle: bool = False
|
||||
log_path: str | None = None
|
||||
webui_mcp_proxy: bool = False
|
||||
|
||||
# session variables
|
||||
@@ -237,6 +240,10 @@ class ServerProcess:
|
||||
server_args.extend(["--media-path", self.media_path])
|
||||
if self.sleep_idle_seconds is not None:
|
||||
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
|
||||
if self.cache_ram is not None:
|
||||
server_args.extend(["--cache-ram", self.cache_ram])
|
||||
if self.no_clear_idle:
|
||||
server_args.append("--no-clear-idle")
|
||||
if self.webui_mcp_proxy:
|
||||
server_args.append("--webui-mcp-proxy")
|
||||
|
||||
@@ -249,11 +256,16 @@ class ServerProcess:
|
||||
flags |= subprocess.CREATE_NEW_PROCESS_GROUP
|
||||
flags |= subprocess.CREATE_NO_WINDOW
|
||||
|
||||
if self.log_path:
|
||||
self._log = open(self.log_path, "w")
|
||||
else:
|
||||
self._log = sys.stdout
|
||||
|
||||
self.process = subprocess.Popen(
|
||||
[str(arg) for arg in [server_path, *server_args]],
|
||||
creationflags=flags,
|
||||
stdout=sys.stdout,
|
||||
stderr=sys.stdout,
|
||||
stdout=self._log,
|
||||
stderr=self._log if self._log != sys.stdout else sys.stdout,
|
||||
env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
|
||||
)
|
||||
server_instances.add(self)
|
||||
@@ -298,6 +310,8 @@ class ServerProcess:
|
||||
except Exception as e:
|
||||
print(f"Error waiting for server: {e}")
|
||||
self.process = None
|
||||
if hasattr(self, '_log') and self._log != sys.stdout:
|
||||
self._log.close()
|
||||
|
||||
def make_request(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user