Compare commits

..

14 Commits

Author SHA1 Message Date
Georgi Gerganov 37c56c245e wip 2026-06-06 16:30:41 +03:00
Georgi Gerganov 1c4a91c0f3 wip 2026-06-06 10:48:36 +03:00
Georgi Gerganov 65eef9549c Merge branch 'master' into pr/23398 2026-06-05 17:47:19 +03:00
Georgi Gerganov f0438b1b15 cont : avoid computations on the CPU 2026-06-05 14:39:03 +03:00
Georgi Gerganov d78a3864f0 cont : adjust to hparams changes 2026-06-05 14:38:41 +03:00
Georgi Gerganov 5954f196ed Merge branch 'master' into pr/23398 2026-06-05 14:02:53 +03:00
Aman Gupta 4eaa3cee66 add unified assistant 2026-06-05 14:59:44 +08:00
Aman Gupta dd97604fc4 move assistant to separate file 2026-06-04 18:56:48 +08:00
Aman Gupta c0da00af04 add exception in test-llama-archs 2026-06-04 18:54:12 +08:00
Aman Gupta 777af6af54 add temp hack to not use fit with gemma4, rm later 2026-06-04 18:54:12 +08:00
Aman Gupta 27461cd888 add Q rot when cache is quantized 2026-06-04 18:54:12 +08:00
Aman Gupta 7b87cd3598 add assert that draft + shared kv should be on same device 2026-06-04 18:54:12 +08:00
Aman Gupta 9af0434d8c fix multi-seq 2026-06-04 18:54:12 +08:00
Aman Gupta f268966d49 llama: Gemma 4 MTP 2026-06-04 18:51:14 +08:00
732 changed files with 30286 additions and 77315 deletions
+2 -18
View File
@@ -13,20 +13,6 @@ ARG APP_REVISION=N/A
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM ${CANN_BASE_IMAGE} AS build
# -- Install build dependencies --
@@ -40,8 +26,6 @@ WORKDIR /app
# -- Copy project files --
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
@@ -145,7 +129,7 @@ ENTRYPOINT ["/app/tools.sh"]
# ==============================================================================
FROM base AS light
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
ENTRYPOINT [ "/app/llama-cli" ]
@@ -156,7 +140,7 @@ FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama /app/full/llama-server /app
COPY --from=build /app/full/llama-server /app
HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
+5 -21
View File
@@ -3,21 +3,7 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
@@ -30,8 +16,6 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
else \
@@ -53,7 +37,7 @@ RUN mkdir -p /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM docker.io/ubuntu:$UBUNTU_VERSION AS base
FROM ubuntu:$UBUNTU_VERSION AS base
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
@@ -69,7 +53,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.source=$IMAGE_SOURCE
RUN apt-get update \
&& apt-get install -y libgomp1 curl ffmpeg \
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -104,7 +88,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
@@ -115,7 +99,7 @@ FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama /app/full/llama-server /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
+7 -25
View File
@@ -1,47 +1,29 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.8.1
ARG GCC_VERSION=14
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG GCC_VERSION
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y gcc-${GCC_VERSION} g++-${GCC_VERSION} build-essential cmake python3 python3-pip git libssl-dev libgomp1
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
ENV CC=gcc-${GCC_VERSION} CXX=g++-${GCC_VERSION} CUDAHOSTCXX=g++-${GCC_VERSION}
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
@@ -77,7 +59,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.source=$IMAGE_SOURCE
RUN apt-get update \
&& apt-get install -y libgomp1 curl ffmpeg \
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -113,7 +95,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
@@ -124,7 +106,7 @@ FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama /app/full/llama-server /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
+12 -39
View File
@@ -5,23 +5,9 @@ ARG APP_REVISION=N/A
## Build Image
ARG NODE_VERSION=24
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=ON
ARG GGML_SYCL_F16=OFF
ARG LEVEL_ZERO_VERSION=1.28.2
ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
RUN apt-get update && \
@@ -36,12 +22,9 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
&& export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"; \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
@@ -59,7 +42,7 @@ RUN mkdir -p /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS base
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
@@ -74,21 +57,11 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.url=$IMAGE_URL \
org.opencontainers.image.source=$IMAGE_SOURCE
#Following versions are for multiple GPUs, since 26.x has known issue:
# https://github.com/ggml-org/llama.cpp/issues/21747,
# https://github.com/intel/compute-runtime/issues/921.
#ARG IGC_VERSION=v2.20.5
#ARG IGC_VERSION_FULL=2_2.20.5+19972
#ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
#ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
#ARG IGDGMM_VERSION=22.8.2
ARG IGC_VERSION=v2.34.4
ARG IGC_VERSION_FULL=2_2.34.4+21428
ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
ARG IGDGMM_VERSION=22.10.0
ARG IGC_VERSION=v2.20.5
ARG IGC_VERSION_FULL=2_2.20.5+19972
ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
ARG IGDGMM_VERSION=22.8.2
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
@@ -102,7 +75,7 @@ RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& dpkg --install *.deb
RUN apt-get update \
&& apt-get install -y libgomp1 curl ffmpeg \
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -141,7 +114,7 @@ ENTRYPOINT ["/app/tools.sh"]
FROM base AS light
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
@@ -153,7 +126,7 @@ FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama /app/full/llama-server /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
+2 -2
View File
@@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
FROM docker.io/ascendai/cann:$ASCEND_VERSION AS build
FROM ascendai/cann:$ASCEND_VERSION AS build
WORKDIR /app
@@ -30,7 +30,7 @@ RUN echo "Building with static libs" && \
cmake --build build --config Release --target llama-completion
# TODO: use image with NNRT
FROM docker.io/ascendai/cann:$ASCEND_VERSION AS runtime
FROM ascendai/cann:$ASCEND_VERSION AS runtime
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
+5 -21
View File
@@ -2,28 +2,14 @@ ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc4.3.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
ARG BASE_MUSA_RUN_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
@@ -43,8 +29,6 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
@@ -80,7 +64,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.source=$IMAGE_SOURCE
RUN apt-get update \
&& apt-get install -y libgomp1 curl ffmpeg \
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -115,7 +99,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
@@ -126,7 +110,7 @@ FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama /app/full/llama-server /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
+50 -83
View File
@@ -1,17 +1,17 @@
ARG OPENVINO_VERSION_MAJOR=2026.2.1
ARG OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3
ARG OPENVINO_VERSION_MAJOR=2026.0
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
ARG UBUNTU_VERSION=24.04
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
ARG IGC_VERSION=v2.36.3
ARG IGC_VERSION_FULL=2_2.36.3+21719
ARG COMPUTE_RUNTIME_VERSION=26.22.38646.4
ARG COMPUTE_RUNTIME_VERSION_FULL=26.22.38646.4-0
ARG IGDGMM_VERSION=22.10.0
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
ARG NPU_DRIVER_VERSION=v1.33.0
ARG NPU_DRIVER_FULL=v1.33.0.20260529-26625960453
ARG NPU_DRIVER_VERSION=v1.32.0
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
# Optional proxy build arguments
@@ -22,22 +22,8 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
## Build Image
FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build
FROM ubuntu:${UBUNTU_VERSION} AS build
# Pass proxy args to build stage
ARG http_proxy
@@ -60,18 +46,13 @@ RUN apt-get update && \
intel-opencl-icd && \
rm -rf /var/lib/apt/lists/*
# OpenVINO toolkit and GPU/NPU drivers are cached via BuildKit cache mounts to avoid re-downloading on rebuilds.
# Install OpenVINO for Ubuntu 24.04.
# Install OpenVINO for Ubuntu 24.04
ARG OPENVINO_VERSION_MAJOR
ARG OPENVINO_VERSION_FULL
RUN --mount=type=cache,target=/var/cache/openvino,sharing=locked \
mkdir -p /opt/intel && \
TGZ=/var/cache/openvino/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
if [ ! -f "$TGZ" ]; then \
wget -O "$TGZ" https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz; \
fi && \
tar -xf "$TGZ" -C /opt/intel/ && \
mv /opt/intel/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
RUN mkdir -p /opt/intel && \
wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
cd - && \
@@ -83,20 +64,18 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
# Build Stage
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_OPENVINO=ON && \
cmake --build build/ReleaseOV --parallel "
cmake --build build/ReleaseOV -j$(nproc)"
# Copy all necessary libraries (build outputs + OpenVINO runtime libs)
# Copy all necessary libraries
RUN mkdir -p /app/lib && \
find build/ReleaseOV -name '*.so*' -exec cp -P {} /app/lib \; && \
find "${OpenVINO_DIR}/runtime/lib/intel64" -name '*.so*' -exec cp -P {} /app/lib \;
find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
# Create runtime directories and copy binaries
RUN mkdir -p /app/full \
@@ -109,7 +88,7 @@ RUN mkdir -p /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base Runtime Image
FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base
FROM ubuntu:${UBUNTU_VERSION} AS base
# Pass proxy args to runtime stage
ARG http_proxy
@@ -128,7 +107,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.source=$IMAGE_SOURCE
RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl wget ffmpeg ocl-icd-libopencl1 \
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -141,41 +120,33 @@ ARG IGC_VERSION_FULL
ARG COMPUTE_RUNTIME_VERSION
ARG COMPUTE_RUNTIME_VERSION_FULL
ARG IGDGMM_VERSION
RUN --mount=type=cache,target=/var/cache/intel-gpu,sharing=locked \
set -eux; \
cd /var/cache/intel-gpu; \
for url in \
https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb ; do \
f=$(basename "$url"); \
[ -f "$f" ] || wget -q -O "$f" "$url"; \
done; \
apt-get update; \
apt-get install -y --no-install-recommends ./*.deb; \
rm -rf /var/lib/apt/lists/*
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb \
&& rm -rf /tmp/neo/
# Install NPU drivers
ARG NPU_DRIVER_VERSION
ARG NPU_DRIVER_FULL
ARG LIBZE1_VERSION
RUN --mount=type=cache,target=/var/cache/intel-npu,sharing=locked \
set -eux; \
TGZ=/var/cache/intel-npu/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
if [ ! -f "$TGZ" ]; then \
wget -q -O "$TGZ" https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
fi; \
DEB=/var/cache/intel-npu/libze1_${LIBZE1_VERSION}_amd64.deb; \
if [ ! -f "$DEB" ]; then \
wget -q -O "$DEB" https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb; \
fi; \
mkdir /tmp/npu/ && cd /tmp/npu/ && tar -xf "$TGZ" && cp "$DEB" .; \
apt-get update; \
apt-get install -y --no-install-recommends ./*.deb; \
rm -rf /tmp/npu/ /var/lib/apt/lists/*
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& dpkg --install *.deb \
&& rm -rf /tmp/npu/
RUN cd /tmp \
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
COPY --from=build /app/lib/ /app/
@@ -195,26 +166,22 @@ RUN apt-get update && \
python3 \
python3-venv \
python3-pip && \
python3 -m venv /openvino-venv && \
/openvino-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
/openvino-venv/bin/pip install --no-cache-dir -r requirements.txt && \
python3 -m venv /ov-venv && \
/ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
/ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
# Activate the venv
ENV VIRTUAL_ENV=/openvino-venv \
PATH=/openvino-venv/bin:$PATH
ENTRYPOINT ["/app/tools.sh"]
ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app/
COPY --from=build /app/full/llama-cli /app/
WORKDIR /app
@@ -225,7 +192,7 @@ FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama /app/full/llama-server /app/
COPY --from=build /app/full/llama-server /app/
WORKDIR /app
+4 -20
View File
@@ -5,26 +5,12 @@ ARG ROCM_VERSION=7.2.1
ARG AMDGPU_VERSION=7.2.1
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=docker.io/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
@@ -52,8 +38,6 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build \
-DGGML_HIP=ON \
@@ -92,7 +76,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.source=$IMAGE_SOURCE
RUN apt-get update \
&& apt-get install -y libgomp1 curl ffmpeg \
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -127,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
@@ -138,7 +122,7 @@ FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama /app/full/llama-server /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
+4 -4
View File
@@ -5,7 +5,7 @@ ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
### Build Llama.cpp stage
FROM docker.io/gcc:${GCC_VERSION} AS build
FROM gcc:${GCC_VERSION} AS build
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
@@ -55,7 +55,7 @@ COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion
### Base image
FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base
FROM ubuntu:${UBUNTU_VERSION} AS base
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
@@ -124,7 +124,7 @@ WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
@@ -138,7 +138,7 @@ WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-server /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
EXPOSE 8080
+5 -21
View File
@@ -3,21 +3,7 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget xz-utils
@@ -31,8 +17,6 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
cmake --build build --config Release -j$(nproc)
@@ -49,7 +33,7 @@ RUN mkdir -p /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM docker.io/ubuntu:$UBUNTU_VERSION AS base
FROM ubuntu:$UBUNTU_VERSION AS base
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
@@ -65,7 +49,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.source=$IMAGE_SOURCE
RUN apt-get update \
&& apt-get install -y libgomp1 curl ffmpeg libvulkan1 mesa-vulkan-drivers \
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
libglvnd0 libgl1 libglx0 libegl1 libgles2 \
&& apt autoremove -y \
&& apt clean -y \
@@ -107,7 +91,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
@@ -118,7 +102,7 @@ FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama /app/full/llama-server /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
+5 -21
View File
@@ -3,21 +3,7 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
@@ -28,8 +14,6 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
cmake --build build -j $(nproc)
@@ -46,7 +30,7 @@ RUN mkdir -p /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM docker.io/ubuntu:$UBUNTU_VERSION AS base
FROM ubuntu:$UBUNTU_VERSION AS base
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
@@ -62,7 +46,7 @@ LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.source=$IMAGE_SOURCE
RUN apt-get update \
&& apt-get install -y libgomp1 libnuma1 curl ffmpeg \
&& apt-get install -y libgomp1 libnuma1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
@@ -97,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
WORKDIR /app
@@ -108,7 +92,7 @@ FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama /app/full/llama-server /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
-2
View File
@@ -10,8 +10,6 @@
build*/
tools/ui/node_modules/
models/*
/llama-cli
@@ -1,24 +0,0 @@
name: "Windows - Setup OpenVINO Toolkit"
description: "Setup OpenVINO Toolkit for Windows"
inputs:
path:
description: "Installation path"
required: true
version_major:
description: "OpenVINO major version (e.g., 2026.2)"
required: true
version_full:
description: "OpenVINO full version"
required: true
runs:
using: "composite"
steps:
- name: Download and extract OpenVINO Runtime
shell: powershell
run: |
$url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/windows/openvino_toolkit_windows_${{ inputs.version_full }}_x86_64.zip"
$out = "openvino.zip"
Invoke-WebRequest -Uri $url -OutFile $out
Expand-Archive -Path $out -DestinationPath ${{ inputs.path }} -Force
Remove-Item $out
+20 -27
View File
@@ -12,7 +12,7 @@ SYCL:
- ggml/src/ggml-sycl/**
- docs/backend/SYCL.md
- examples/sycl/**
CUDA:
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-cuda.h
@@ -35,20 +35,8 @@ AMD ZenDNN:
documentation:
- changed-files:
- any-glob-to-any-file:
- "**/*.md"
- docs/**
- media/**
examples:
- all:
- changed-files:
- any-glob-to-any-file:
- app/**
- examples/**
- tools/**
- all-globs-to-all-files:
- '!tools/server/**'
- '!tools/mtmd/**'
- '!tools/ui/**'
testing:
- changed-files:
- any-glob-to-any-file:
@@ -59,12 +47,28 @@ build:
- cmake/**
- CMakeLists.txt
- CMakePresets.json
examples:
- changed-files:
- any-glob-to-any-file:
- examples/**
- tools/**
devops:
- changed-files:
- any-glob-to-any-file:
- .devops/**
- .github/**
- ci/**
python:
- changed-files:
- any-glob-to-any-file:
- "**/*.py"
- requirements/**
- gguf-py/**
- .flake8
script:
- changed-files:
- any-glob-to-any-file:
- scripts/**
android:
- changed-files:
- any-glob-to-any-file:
@@ -77,20 +81,9 @@ server:
- changed-files:
- any-glob-to-any-file:
- tools/server/**
mtmd:
- changed-files:
- any-glob-to-any-file:
- tools/mtmd/**
conversion:
- changed-files:
- any-glob-to-any-file:
- conversion/**
- convert_*.py
- gguf-py/**
vendor:
- changed-files:
- any-glob-to-any-file:
- vendor/**
ggml:
- changed-files:
- any-glob-to-any-file:
+2 -30
View File
@@ -68,8 +68,8 @@ jobs:
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
@@ -91,34 +91,6 @@ jobs:
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
windows-2022-openvino-cache:
runs-on: windows-2022
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Setup Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/windows-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
windows-2022-rocm-cache:
runs-on: windows-2022
+8 -81
View File
@@ -37,10 +37,14 @@ jobs:
ubuntu-24-openvino:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
concurrency:
group: openvino-gpu-${{ github.head_ref || github.ref }}
cancel-in-progress: false
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
@@ -74,7 +78,7 @@ jobs:
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release --parallel
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test (CPU)
id: cmake_test_cpu
@@ -89,81 +93,4 @@ jobs:
run: |
cd ${{ github.workspace }}
export GGML_OPENVINO_DEVICE=GPU
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 3000
openvino-windows-2022:
runs-on: windows-2022
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: openvino-windows-2022
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Setup Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/windows-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenCL using vcpkg
shell: powershell
run: |
git clone https://github.com/microsoft/vcpkg C:\vcpkg
C:\vcpkg\bootstrap-vcpkg.bat
C:\vcpkg\vcpkg install opencl
- name: Build
id: cmake_build
shell: cmd
run: |
REM Find extracted OpenVINO folder dynamically
for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
echo ERROR: OpenVINOConfig.cmake not found
exit /b 1
)
call "%OPENVINO_ROOT%\setupvars.bat"
cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
-A x64 ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_OPENVINO=ON ^
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
cmake --build build\ReleaseOV --config Release -- /m
- name: Test (CPU)
id: cmake_test_cpu
shell: cmd
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
REM Find extracted OpenVINO folder dynamically
for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
call "%OPENVINO_ROOT%\setupvars.bat"
cd build
ctest --test-dir ReleaseOV -L main -E "test-llama-archs" -C Release --verbose --timeout 3000
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
+6 -2
View File
@@ -264,10 +264,14 @@ jobs:
gpu-openvino-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
concurrency:
group: openvino-gpu-${{ github.head_ref || github.ref }}
cancel-in-progress: false
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
+124 -103
View File
@@ -34,108 +34,129 @@ env:
LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
# to re-enable it, we first have to provision dedicated runners for it
# ubuntu-24-sycl:
# strategy:
# matrix:
# build: [fp32]
# include:
# - build: fp32
# fp16: OFF
#
# runs-on: ubuntu-24.04
#
# env:
# ONEAPI_ROOT: /opt/intel/oneapi/
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
# LEVEL_ZERO_VERSION: "1.28.2"
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
#
# continue-on-error: true
#
# steps:
# - uses: actions/checkout@v6
#
# - name: Use oneAPI Installation Cache
# uses: actions/cache@v5
# id: cache-sycl
# with:
# path: ${{ env.ONEAPI_ROOT }}
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
#
# - name: Download & Install oneAPI
# shell: bash
# if: steps.cache-sycl.outputs.cache-hit != 'true'
# run: |
# cd /tmp
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
#
# - name: Install Level Zero SDK
# shell: bash
# run: |
# cd /tmp
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
#
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: sycl-ubuntu-24-${{ matrix.build }}
# evict-old-files: 1d
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
#
# - name: Build
# id: cmake_build
# run: |
# source /opt/intel/oneapi/setvars.sh
# cmake -B build \
# -G "Ninja" \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_SYCL=ON \
# -DCMAKE_C_COMPILER=icx \
# -DCMAKE_CXX_COMPILER=icpx \
# -DLLAMA_OPENSSL=OFF \
# -DGGML_NATIVE=OFF \
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
# time cmake --build build --config Release -j $(nproc)
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
LEVEL_ZERO_VERSION: "1.28.2"
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
continue-on-error: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Download & Install oneAPI
shell: bash
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: Install Level Zero SDK
shell: bash
run: |
cd /tmp
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: sycl-ubuntu-24-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Download & Install oneAPI
shell: bash
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Install Level Zero SDK
shell: pwsh
run: |
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: sycl-windows-latest
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
# TODO: add SSL support; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
# to re-enable it, we first have to provision dedicated runners for it
# windows-latest-sycl:
# runs-on: windows-2022
#
# defaults:
# run:
# shell: bash
#
# env:
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Use oneAPI Installation Cache
# uses: actions/cache@v5
# id: cache-sycl
# with:
# path: ${{ env.ONEAPI_ROOT }}
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
#
# - name: Download & Install oneAPI
# shell: bash
# if: steps.cache-sycl.outputs.cache-hit != 'true'
# run: |
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
#
# - name: Install Level Zero SDK
# shell: pwsh
# run: |
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
#
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: sycl-windows-latest
# variant: ccache
# evict-old-files: 1d
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
#
# # TODO: add SSL support; we will also need to modify win-build-sycl.bat to accept user-specified args
#
# - name: Build
# id: cmake_build
# run: examples/sycl/win-build-sycl.bat
-23
View File
@@ -35,29 +35,6 @@ env:
LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
format:
runs-on: ubuntu-24.04
steps:
- name: Clone
uses: actions/checkout@v6
- name: Install clang-format 22
run: |
wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key |
sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc > /dev/null
sudo add-apt-repository -y \
"deb http://apt.llvm.org/noble/ llvm-toolchain-noble-22 main"
sudo apt-get update
sudo apt-get install -y clang-format-22
- name: Check formatting
run: |
find ggml/src/ggml-webgpu \
-type f \( -name '*.cpp' -o -name '*.hpp' -o -name '*.h' \) \
-print0 |
xargs -0 clang-format-22 --dry-run --Werror
macos:
runs-on: macos-latest
+4 -18
View File
@@ -58,13 +58,6 @@ jobs:
git tag ${{ steps.srctag.outputs.name }} || exit 0
git push origin ${{ steps.srctag.outputs.name }} || exit 0
build_ui:
name: Build UI
needs: create_tag
uses: ./.github/workflows/ui-build.yml
with:
hf_ui_version: ${{ needs.create_tag.outputs.source_tag }}
prepare_matrices:
name: Prepare Docker matrices
runs-on: ubuntu-24.04
@@ -86,11 +79,11 @@ jobs:
[
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
@@ -142,7 +135,7 @@ jobs:
push_to_registry:
name: Push Docker image to Docker Registry
needs: [prepare_matrices, create_tag, build_ui]
needs: [prepare_matrices, create_tag]
runs-on: ${{ matrix.config.runs_on }}
strategy:
@@ -157,13 +150,6 @@ jobs:
fetch-depth: 0
ref: ${{ needs.create_tag.outputs.source_tag }}
- name: Download prebuilt UI
if: ${{ matrix.config.prebuilt_ui == true }}
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
with:
name: ui-build
path: tools/ui/dist
- name: Set up QEMU
if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
+230 -416
View File
@@ -46,13 +46,11 @@ jobs:
steps:
- id: check
env:
COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
run: |
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
echo "should_release=true" >> $GITHUB_OUTPUT
elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
if echo "$COMMIT_MESSAGE" | grep -q '\[no release\]'; then
if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
echo "should_release=false" >> $GITHUB_OUTPUT
else
echo "should_release=true" >> $GITHUB_OUTPUT
@@ -61,31 +59,8 @@ jobs:
echo "should_release=false" >> $GITHUB_OUTPUT
fi
get-version:
runs-on: ubuntu-slim
outputs:
ui_version: ${{ steps.version.outputs.ui_version }}
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- id: version
run: |
# Resolve UI version: "b<BUILD_NUMBER>" from cmake/build-info.cmake when it is a positive number, otherwise "<short git hash>-<epoch seconds>"
version=""
if grep -q "BUILD_NUMBER" cmake/build-info.cmake; then
build_number=$(grep "set(BUILD_NUMBER" cmake/build-info.cmake | grep -oP '\d+')
if [ -n "$build_number" ] && [ "$build_number" -gt 0 ]; then
version="b${build_number}"
fi
fi
if [ -z "$version" ]; then
version=$(git rev-parse --short HEAD)-$(date +%s)
fi
echo "ui_version=${version}" >> $GITHUB_OUTPUT
macos-cpu:
needs: [check-release, get-version]
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
strategy:
matrix:
@@ -141,7 +116,6 @@ jobs:
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_BUILD_BORINGSSL=ON \
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
@@ -167,7 +141,7 @@ jobs:
name: llama-bin-macos-${{ matrix.build }}.tar.gz
ubuntu-cpu:
needs: [check-release, get-version]
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
strategy:
matrix:
@@ -227,7 +201,6 @@ jobs:
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -254,7 +227,7 @@ jobs:
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
ubuntu-vulkan:
needs: [check-release, get-version]
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
strategy:
@@ -314,7 +287,6 @@ jobs:
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_VULKAN=ON \
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -340,7 +312,7 @@ jobs:
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
android-arm64:
needs: [check-release, get-version]
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: ubuntu-latest
@@ -407,7 +379,6 @@ jobs:
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -433,7 +404,7 @@ jobs:
name: llama-bin-android-arm64.tar.gz
ubuntu-24-openvino:
needs: [check-release, get-version]
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: ubuntu-24.04
@@ -445,9 +416,9 @@ jobs:
openvino_version: ${{ steps.openvino_version.outputs.value }}
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Set OpenVINO version output
@@ -505,12 +476,8 @@ jobs:
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
${{ env.CMAKE_ARGS }}
cmake --build build/ReleaseOV --config Release --parallel
-DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: ccache-clear
uses: ./.github/actions/ccache-clear
@@ -524,26 +491,8 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
dest=./build/ReleaseOV/bin
OPENVINO_ROOT=./openvino_toolkit
ov_lib="$OPENVINO_ROOT/runtime/lib/intel64"
# Bundle OpenVINO runtime libs + TBB. Binaries built with RPATH=$ORIGIN
# load these siblings without setupvars.sh / LD_LIBRARY_PATH.
cp -P "$ov_lib"/libopenvino.so* \
"$ov_lib"/libopenvino_c.so* \
"$ov_lib"/libopenvino_*_plugin.so \
"$ov_lib"/libopenvino_intel_npu_compiler*.so \
"$OPENVINO_ROOT"/runtime/3rdparty/tbb/lib/*.so* \
"$dest"
cp -P /usr/lib/x86_64-linux-gnu/libOpenCL.so.1* "$dest" 2>/dev/null || true
cp "$ov_lib"/cache.json "$dest" 2>/dev/null || true
# OpenVINO licensing
cp -r "$OPENVINO_ROOT"/docs/licensing "$dest"/openvino-licensing
cp LICENSE "$dest"
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C "$dest" .
cp LICENSE ./build/ReleaseOV/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
@@ -551,140 +500,11 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
windows-openvino:
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: windows-2022
outputs:
openvino_version: ${{ steps.openvino_version.outputs.value }}
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2.1"
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
steps:
- name: Set OpenVINO version output
id: openvino_version
shell: bash
run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: "24"
cache: "npm"
cache-dependency-path: "tools/ui/package-lock.json"
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: release-windows-2022-openvino
variant: ccache
evict-old-files: 1d
- name: Setup Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/windows-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenCL using vcpkg
shell: powershell
run: |
git clone https://github.com/microsoft/vcpkg C:\vcpkg
C:\vcpkg\bootstrap-vcpkg.bat
C:\vcpkg\vcpkg install opencl
- name: Build
id: cmake_build
shell: cmd
run: |
REM Find extracted OpenVINO folder dynamically
for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
echo ERROR: OpenVINOConfig.cmake not found
exit /b 1
)
call "%OPENVINO_ROOT%\setupvars.bat"
cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
-A x64 ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_OPENVINO=ON ^
-DLLAMA_BUILD_BORINGSSL=ON ^
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^
${{ env.CMAKE_ARGS }}
cmake --build build\ReleaseOV --config Release -- /m
- name: ccache-clear
uses: ./.github/actions/ccache-clear
with:
key: release-windows-2022-openvino
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
shell: powershell
run: |
# Locate the extracted OpenVINO toolkit root (same pattern as the Build step).
$OPENVINO_ROOT = (Get-ChildItem -Directory openvino_toolkit | Select-Object -First 1).FullName
if (-not $OPENVINO_ROOT) {
Write-Error "OpenVINO toolkit folder not found under .\openvino_toolkit"
exit 1
}
$dest = ".\build\ReleaseOV\bin\Release"
$ovBin = Join-Path $OPENVINO_ROOT 'runtime\bin\intel64\Release'
Copy-Item -Path (Join-Path $ovBin '*.dll') -Destination $dest -Force
Copy-Item -Path (Join-Path $ovBin 'cache.json') -Destination $dest -Force
$tbbBin = Join-Path $OPENVINO_ROOT 'runtime\3rdparty\tbb\bin'
Copy-Item -Path (Join-Path $tbbBin 'tbb*.dll') -Destination $dest -Force
# OpenVINO licensing
$licensingDest = Join-Path $dest 'openvino-licensing'
New-Item -ItemType Directory -Force -Path $licensingDest | Out-Null
Copy-Item -Path (Join-Path $OPENVINO_ROOT 'docs\licensing\*') -Destination $licensingDest -Recurse -Force
Copy-Item LICENSE $dest
7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip $dest\*
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
name: llama-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
windows-cpu:
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: windows-2025-vs2026
runs-on: windows-2025
permissions:
actions: write
@@ -715,12 +535,12 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: release-windows-2025-vs2026-${{ matrix.arch }}-cpu
key: release-windows-2025-${{ matrix.arch }}-cpu
- name: Build
shell: cmd
run: |
call "C:\Program Files\Microsoft Visual Studio\18\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-DLLAMA_BUILD_BORINGSSL=ON ^
@@ -734,12 +554,12 @@ jobs:
- name: ccache-clear
uses: ./.github/actions/ccache-clear
with:
key: release-windows-2025-vs2026-${{ matrix.arch }}-cpu
key: release-windows-2025-${{ matrix.arch }}-cpu
- name: Pack artifacts
id: pack_artifacts
run: |
Copy-Item "C:\Program Files\Microsoft Visual Studio\18\Enterprise\VC\Redist\MSVC\14.51.36231\debug_nonredist\${{ matrix.arch }}\Microsoft.VC145.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -934,209 +754,213 @@ jobs:
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
windows-sycl:
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
# in order to enable it again, we have to provision dedicated runners to run it
# windows-sycl:
#
# runs-on: windows-2022
#
# defaults:
# run:
# shell: bash
#
# env:
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Use oneAPI Installation Cache
# uses: actions/cache@v5
# id: cache-sycl
# with:
# path: ${{ env.ONEAPI_ROOT }}
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
#
# - name: Download & Install oneAPI
# shell: bash
# if: steps.cache-sycl.outputs.cache-hit != 'true'
# run: |
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
#
# - name: Install Level Zero SDK
# shell: pwsh
# run: |
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
#
# - name: Setup Node.js
# uses: actions/setup-node@v6
# with:
# node-version: "24"
# cache: "npm"
# cache-dependency-path: "tools/ui/package-lock.json"
#
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: release-windows-2022-x64-sycl
#
# - name: Build
# id: cmake_build
# shell: cmd
# run: |
# call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
# cmake -G "Ninja" -B build ^
# -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
# -DCMAKE_BUILD_TYPE=Release ^
# -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
# -DGGML_CPU=OFF -DGGML_SYCL=ON ^
# -DLLAMA_BUILD_BORINGSSL=ON
# cmake --build build --target ggml-sycl -j
#
# - name: Build the release package
# id: pack_artifacts
# run: |
# echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
#
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
#
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
# ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
# if [ -n "$ZE_LOADER_DLL" ]; then
# echo "Using Level Zero loader: $ZE_LOADER_DLL"
# cp "$ZE_LOADER_DLL" ./build/bin
# else
# echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
# fi
#
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
#
# cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
#
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
#
# echo "cp oneAPI running time dll files to ./build/bin done"
# 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
#
# - name: Upload the release package
# uses: actions/upload-artifact@v6
# with:
# path: llama-bin-win-sycl-x64.zip
# name: llama-bin-win-sycl-x64.zip
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Download & Install oneAPI
shell: bash
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Install Level Zero SDK
shell: pwsh
run: |
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: "24"
cache: "npm"
cache-dependency-path: "tools/ui/package-lock.json"
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: release-windows-2022-x64-sycl
- name: Build
id: cmake_build
shell: cmd
run: |
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
cmake -G "Ninja" -B build ^
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-sycl -j %NUMBER_OF_PROCESSORS%
- name: ccache-clear
uses: ./.github/actions/ccache-clear
with:
key: release-windows-2022-x64-sycl
- name: Build the release package
id: pack_artifacts
run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
if [ -n "$ZE_LOADER_DLL" ]; then
echo "Using Level Zero loader: $ZE_LOADER_DLL"
cp "$ZE_LOADER_DLL" ./build/bin
else
echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
fi
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
uses: actions/upload-artifact@v6
with:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
ubuntu-24-sycl:
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
LEVEL_ZERO_VERSION: "1.28.2"
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Download & Install oneAPI
shell: bash
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: Install Level Zero SDK
shell: bash
run: |
cd /tmp
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: "24"
cache: "npm"
cache-dependency-path: "tools/ui/package-lock.json"
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: release-ubuntu-24.04-sycl-${{ matrix.build }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
- name: ccache-clear
uses: ./.github/actions/ccache-clear
with:
key: release-ubuntu-24.04-sycl-${{ matrix.build }}
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
# in order to enable it again, we have to provision dedicated runners to run it
# ubuntu-24-sycl:
#
# strategy:
# matrix:
# build: [fp32]
# include:
# - build: fp32
# fp16: OFF
#
# runs-on: ubuntu-24.04
#
# env:
# ONEAPI_ROOT: /opt/intel/oneapi/
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
# LEVEL_ZERO_VERSION: "1.28.2"
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
# with:
# fetch-depth: 0
#
# - name: Use oneAPI Installation Cache
# uses: actions/cache@v5
# id: cache-sycl
# with:
# path: ${{ env.ONEAPI_ROOT }}
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
#
# - name: Download & Install oneAPI
# shell: bash
# if: steps.cache-sycl.outputs.cache-hit != 'true'
# run: |
# cd /tmp
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
#
# - name: Install Level Zero SDK
# shell: bash
# run: |
# cd /tmp
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
#
# - name: Setup Node.js
# uses: actions/setup-node@v6
# with:
# node-version: "24"
# cache: "npm"
# cache-dependency-path: "tools/ui/package-lock.json"
#
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: release-ubuntu-24.04-sycl
#
# - name: Build
# id: cmake_build
# run: |
# source /opt/intel/oneapi/setvars.sh
# cmake -B build \
# -G "Ninja" \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_SYCL=ON \
# -DCMAKE_C_COMPILER=icx \
# -DCMAKE_CXX_COMPILER=icpx \
# -DLLAMA_OPENSSL=OFF \
# -DGGML_NATIVE=OFF \
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
# time cmake --build build --config Release -j $(nproc)
#
# - name: Determine tag name
# id: tag
# uses: ./.github/actions/get-tag-name
#
# - name: Pack artifacts
# id: pack_artifacts
# run: |
# cp LICENSE ./build/bin/
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
#
# - name: Upload artifacts
# uses: actions/upload-artifact@v6
# with:
# path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
ubuntu-22-rocm:
needs: [check-release, get-version]
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: ubuntu-22.04
@@ -1228,7 +1052,6 @@ jobs:
-DGGML_HIP=ON \
-DHIP_PLATFORM=amd \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -1257,7 +1080,7 @@ jobs:
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
windows-hip:
needs: [check-release, get-version]
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: windows-2022
@@ -1353,7 +1176,6 @@ jobs:
-DGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} `
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
@@ -1381,7 +1203,7 @@ jobs:
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
ios-xcode:
needs: [check-release, get-version]
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
runs-on: macos-26
@@ -1410,8 +1232,7 @@ jobs:
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml \
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
- name: xcodebuild for swift package
@@ -1531,12 +1352,10 @@ jobs:
# path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
# name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
ui-build:
needs: [check-release, get-version]
ui:
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
uses: ./.github/workflows/ui-build.yml
with:
hf_ui_version: ${{ needs.get-version.outputs.ui_version }}
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -1549,13 +1368,11 @@ jobs:
runs-on: ubuntu-slim
needs:
- get-version
- windows
- windows-cpu
- windows-cuda
#- windows-sycl
- windows-hip
- windows-openvino
- ubuntu-22-rocm
- ubuntu-cpu
- ubuntu-vulkan
@@ -1565,7 +1382,7 @@ jobs:
- macos-cpu
- ios-xcode
#- openEuler-cann
- ui-build
- ui
outputs:
tag_name: ${{ steps.tag.outputs.name }}
@@ -1665,8 +1482,7 @@ jobs:
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
- Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
**Android:**
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
@@ -1674,12 +1490,10 @@ jobs:
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
- [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip)
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
- [Windows x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ needs.windows-openvino.outputs.openvino_version }}-x64.zip)
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
- Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
**openEuler:**
@@ -28,6 +28,13 @@ jobs:
run: npm run build
working-directory: tools/ui
- name: Generate checksums
run: |
cd tools/ui/dist
for f in *; do
sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
done
- name: Upload built UI
uses: actions/upload-artifact@v6
with:
+6 -11
View File
@@ -2,11 +2,6 @@ name: UI Build
on:
workflow_call:
inputs:
hf_ui_version:
description: 'Version string for version.json (e.g. 12345)'
required: false
type: string
jobs:
build:
@@ -30,15 +25,15 @@ jobs:
working-directory: tools/ui
- name: Build application
env:
HF_UI_VERSION: ${{ inputs.hf_ui_version || '' }}
LLAMA_BUILD_NUMBER: ${{ inputs.hf_ui_version || 'b0000' }}
run: npm run build
working-directory: tools/ui
- name: Run PWA unit tests (versioned build output)
run: npx vitest --project=unit --run tests/unit/pwa.spec.ts
working-directory: tools/ui
- name: Generate checksums
run: |
cd tools/ui/dist
for f in *; do
sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
done
- name: Upload built UI
uses: actions/upload-artifact@v6
-6
View File
@@ -40,12 +40,6 @@ jobs:
name: ui-build
path: tools/ui/dist/
- name: Create distribution archive
run: |
tar -czf dist.tar.gz -C tools/ui/dist .
sha256sum dist.tar.gz > dist.tar.gz.sha256
mv dist.tar.gz dist.tar.gz.sha256 tools/ui/dist/
- name: Install Hugging Face Hub CLI
run: pip install -U huggingface_hub
+11 -18
View File
@@ -1,8 +1,8 @@
name: UI (self-hosted)
# these are the same as ui.yml, but with self-hosted runners
# the jobs are lighter because they don't need to install Node.js or Playwright browsers
# the runner has pre-installed Playwright browsers for @playwright/test (1.56.1) at /ms-playwright/
# the runners come with pre-installed Playwright browsers version: 1.56.1
# the jobs are much lighter because they don't need to install node and playwright browsers
on:
workflow_dispatch:
@@ -61,12 +61,6 @@ jobs:
run: npm ci
working-directory: tools/ui
- name: Download built UI artifacts
uses: actions/download-artifact@v6
with:
name: ui-build
path: tools/ui/dist/
- name: Run type checking
if: ${{ always() && steps.setup.conclusion == 'success' }}
run: npm run check
@@ -78,12 +72,12 @@ jobs:
working-directory: tools/ui
- name: Run Client tests
if: ${{ always() && steps.setup.conclusion == 'success' }}
if: ${{ always() }}
run: npm run test:client
working-directory: tools/ui
- name: Run Unit tests
if: ${{ always() && steps.setup.conclusion == 'success' }}
if: ${{ always() }}
run: npm run test:unit
working-directory: tools/ui
@@ -103,23 +97,22 @@ jobs:
run: npm ci
working-directory: tools/ui
- name: Download built UI artifacts
uses: actions/download-artifact@v6
with:
name: ui-build
path: tools/ui/dist/
- name: Build application
if: ${{ always() && steps.setup.conclusion == 'success' }}
run: npm run build
working-directory: tools/ui
- name: Build Storybook
if: ${{ always() && steps.setup.conclusion == 'success' }}
if: ${{ always() }}
run: npm run build-storybook
working-directory: tools/ui
- name: Run UI tests
if: ${{ always() && steps.setup.conclusion == 'success' }}
if: ${{ always() }}
run: npm run test:ui -- --testTimeout=60000
working-directory: tools/ui
- name: Run E2E tests
if: ${{ always() && steps.setup.conclusion == 'success' }}
if: ${{ always() }}
run: npm run test:e2e
working-directory: tools/ui
+8 -15
View File
@@ -43,7 +43,7 @@ jobs:
ui-checks:
name: Checks
needs: ui-build
runs-on: ubuntu-24.04
runs-on: ubuntu-latest
continue-on-error: true
steps:
- name: Checkout code
@@ -60,12 +60,6 @@ jobs:
cache: "npm"
cache-dependency-path: "tools/ui/package-lock.json"
- name: Download built UI artifacts
uses: actions/download-artifact@v6
with:
name: ui-build
path: tools/ui/dist/
- name: Install dependencies
id: setup
if: ${{ steps.node.conclusion == 'success' }}
@@ -93,7 +87,7 @@ jobs:
run: npm run test:client
working-directory: tools/ui
- name: Run Unit tests (uses pre-built dist/ from ui-build)
- name: Run Unit tests
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:unit
working-directory: tools/ui
@@ -101,7 +95,7 @@ jobs:
e2e-tests:
name: E2E Tests
needs: ui-build
runs-on: ubuntu-24.04
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v6
@@ -123,11 +117,10 @@ jobs:
run: npm ci
working-directory: tools/ui
- name: Download built UI artifacts (reuses ui-build)
uses: actions/download-artifact@v6
with:
name: ui-build
path: tools/ui/dist/
- name: Build application
if: ${{ always() && steps.setup.conclusion == 'success' }}
run: npm run build
working-directory: tools/ui
- name: Install Playwright browsers
id: playwright
@@ -145,7 +138,7 @@ jobs:
run: npm run test:ui -- --testTimeout=60000
working-directory: tools/ui
- name: Run E2E tests (uses pre-built dist/ from ui-build)
- name: Run E2E tests
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:e2e
working-directory: tools/ui
+1 -1
View File
@@ -17,7 +17,7 @@ jobs:
- name: Install komac
run: |
cargo binstall komac@2.16.0 -y
cargo binstall komac@2.15.0 -y
- name: Find latest release
id: find_latest_release
+7
View File
@@ -92,6 +92,13 @@
!/examples/sycl/*.bat
!/examples/sycl/*.sh
# Server Web UI temporary files (+ legacy directory)
/tools/server/webui/node_modules
/tools/server/webui/dist
/tools/ui/node_modules
/tools/ui/dist
# Python
/.venv
+10
View File
@@ -25,3 +25,13 @@ Commits:
- Do not explicitly set the git author in commits - rely on the default git config
- Always use `--no-gpg-sign` when committing
- Never `git push` without explicit confirmation from the user
Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)
-10
View File
@@ -222,16 +222,6 @@ if (LLAMA_BUILD_APP)
add_subdirectory(app)
endif()
# Standalone libmtmd build without pulling in the rest of the tools/ tree.
# Useful when packaging just the mtmd library for language bindings (e.g. an
# Apple XCFramework, or a WASM build). When the full tools build is enabled,
# mtmd is already built by the tools/ subdirectory above; this hook only fires
# when LLAMA_BUILD_TOOLS is OFF to avoid double-adding the target.
option(LLAMA_BUILD_MTMD "llama: build tools/mtmd library standalone" OFF)
if (LLAMA_BUILD_MTMD AND NOT (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS))
add_subdirectory(tools/mtmd)
endif()
#
# install
#
+1 -1
View File
@@ -10,7 +10,7 @@
# ggml-org/ggml-rpc : rgerganov
# ggml-org/ggml-sycl : arthw
# ggml-org/ggml-vulkan : 0cc4m, jeffbolznv
# ggml-org/ggml-webgpu : reeselevine, yomaytk
# ggml-org/ggml-webgpu : reeselevine
# ggml-org/ggml-zdnn : taronaeo
# ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
# ggml-org/llama-mtmd : ngxson
+3 -5
View File
@@ -1,6 +1,6 @@
# llama.cpp
![llama](https://raw.githubusercontent.com/ggml-org/llama.brand/refs/heads/master/cover/llama-cpp/cover-llama-cpp-dark.svg)
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
@@ -37,7 +37,7 @@ LLM inference in C/C++
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
- Install `llama.cpp` using [brew, nix, winget, or conda-forge](docs/install.md)
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
- Run with Docker - see our [Docker documentation](docs/docker.md)
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
@@ -142,9 +142,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [Liquid LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2)
- [x] [Liquid LFM2.5 models](https://huggingface.co/collections/LiquidAI/lfm25)
- [x] [Liquid Nanos](https://huggingface.co/collections/LiquidAI/liquid-nanos)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
- [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
- [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
+1 -1
View File
@@ -80,7 +80,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
### Untrusted environments or networks
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Do not use the RPC backend, [ggml-rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
* Encrypt your data if sending it over the network.
+1 -1
View File
@@ -1,6 +1,6 @@
set(TARGET llama-app)
add_executable(${TARGET} llama.cpp download.cpp)
add_executable(${TARGET} llama.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
target_link_libraries(${TARGET} PRIVATE
-71
View File
@@ -1,71 +0,0 @@
#include "arg.h"
#include "common.h"
#include "download.h"
#include "log.h"

#include <cstdio>
#include <filesystem>
#include <system_error>
// Print a short list of example invocations to stdout.
// The program name (argv[0]) is substituted into every example line;
// the argument count is not needed and is left unnamed.
static void print_usage(int /*argc*/, char ** argv) {
    const char * prog = argv[0];

    // the format string stays a literal so the compiler can check it against the arguments
    printf("\nexamples:\n"
           " %s -hf ggml-org/gemma-3-4b-it-qat-GGUF\n"
           " %s -hf ggml-org/gemma-3-4b-it-qat-GGUF:Q4_K_M\n"
           " %s -hf ggml-org/models -hff model.gguf\n"
           " %s -mu https://example.com/model.gguf -m model.gguf\n"
           "\n",
           prog, prog, prog, prog);
}
int llama_download(int argc, char ** argv);
// Entry point of the download command.
//
// Parses the command line into common_params, requires at least one model
// source (--hf-repo, --model-url, --model or --docker-repo), runs the common
// models handler on it and prints the resulting local path(s) to stdout, one
// per line: the model path, then the mmproj path and the speculative draft
// model path when they are set. If the handler resolved a models preset, only
// the preset path is printed.
//
// Returns 0 on success and 1 on any failure (argument parsing, missing source,
// exception from the models handler, empty or non-existent model path, or a
// filesystem error while checking the model file). Errors go to stderr.
int llama_download(int argc, char ** argv) {
    common_init();

    common_params params;
    // only errors are logged - presumably to keep the output limited to the printed paths
    params.verbosity = LOG_LEVEL_ERROR;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DOWNLOAD, print_usage)) {
        return 1;
    }

    const bool has_source = !params.model.hf_repo.empty() || !params.model.url.empty() ||
                            !params.model.path.empty() || !params.model.docker_repo.empty();
    if (!has_source) {
        fprintf(stderr, "error: no model source specified (use --hf-repo, --model-url, --model or --docker-repo)\n");
        return 1;
    }

    // NOTE(review): the handler presumably performs the actual download and fills in
    // the local paths in `params` - confirm against common_models_handler_apply
    try {
        common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_DOWNLOAD);
        common_models_handler_apply(handler, params);
    } catch (const std::exception & e) {
        fprintf(stderr, "error: %s\n", e.what());
        return 1;
    }

    if (!params.models_preset.empty()) {
        // -hf pointed at a preset repo: print the preset path and stop
        printf("%s\n", params.models_preset.c_str());
        return 0;
    }

    if (params.model.path.empty()) {
        fprintf(stderr, "error: model download failed\n");
        return 1;
    }

    // use the non-throwing overload: the throwing one would escape this function
    // (it is outside the try block above) and terminate the process on OS errors
    std::error_code ec;
    const bool model_exists = std::filesystem::exists(params.model.path, ec);
    if (ec) {
        fprintf(stderr, "error: failed to access model file: %s: %s\n", params.model.path.c_str(), ec.message().c_str());
        return 1;
    }
    if (!model_exists) {
        fprintf(stderr, "error: model file does not exist: %s\n", params.model.path.c_str());
        return 1;
    }

    printf("%s\n", params.model.path.c_str());

    // optional companion files are reported only when they are set
    if (!params.mmproj.path.empty()) {
        printf("%s\n", params.mmproj.path.c_str());
    }
    if (!params.speculative.draft.mparams.path.empty()) {
        printf("%s\n", params.speculative.draft.mparams.path.c_str());
    }

    return 0;
}
+14 -33
View File
@@ -19,23 +19,17 @@ int llama_batched_bench(int argc, char ** argv);
int llama_fit_params(int argc, char ** argv);
int llama_quantize(int argc, char ** argv);
int llama_perplexity(int argc, char ** argv);
int llama_download(int argc, char ** argv);
// Self-update command.
// Self-update is only supported for binaries built with llama-install.sh (LLAMA_INSTALL_BUILD defined):
// the update is handed over to the install script, which downloads and swaps the binary.
// In that case the value returned by system() is returned to the caller.
// Without LLAMA_INSTALL_BUILD a notice is printed and 1 is returned.
static int llama_update(int argc, char ** argv) {
// the arguments are intentionally unused
(void) argc;
(void) argv;
#ifdef LLAMA_INSTALL_BUILD
#if defined(_WIN32)
// Windows: fetch and run the PowerShell installer
return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
#else
// other platforms: fetch the shell installer with curl and pipe it to sh
return system("curl -fsSL https://llama.app/install.sh | sh");
#endif
#else
printf("Updates are available only when installed from https://llama.app\n");
return 1;
#endif
}
static const char * progname;
@@ -50,33 +44,23 @@ struct command {
std::vector<std::string> aliases;
bool hidden;
int (*func)(int, char **);
bool flags = false; // allow --name
};
#ifdef LLAMA_INSTALL_BUILD
#define UPDATE_HIDDEN false
#else
#define UPDATE_HIDDEN true
#endif
static const command cmds[] = {
{"serve", "HTTP API server", {"server"}, false, llama_server },
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
{"update", "Update llama to the latest release", {}, UPDATE_HIDDEN, llama_update },
{"download", "Download a model", {"get"}, false, llama_download },
{"completion", "Text completion", {"complete"}, true, llama_completion },
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
{"quantize", "Quantize a model", {}, true, llama_quantize },
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
{"version", "Show version", {}, false, version, true },
{"licenses", "Show third-party licenses", {"credits"}, false, licenses, true },
{"help", "Show available commands", {}, false, help, true },
{"serve", "HTTP API server", {"server"}, false, llama_server },
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
{"update", "Update llama to the latest release", {}, false, llama_update },
{"completion", "Text completion", {"complete"}, true, llama_completion },
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
{"quantize", "Quantize a model", {}, true, llama_quantize },
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
{"version", "Show version", {}, false, version },
{"licenses", "Show third-party licenses", {"credits"}, false, licenses },
{"help", "Show available commands", {}, false, help },
};
#undef UPDATE_HIDDEN
static int version(int argc, char ** argv) {
printf("%s\n", llama_build_info());
return 0;
@@ -109,10 +93,7 @@ static int help(int argc, char ** argv) {
return 0;
}
static bool matches(std::string arg, const command & cmd) {
if (cmd.flags && arg.size() > 2 && arg[0] == '-' && arg[1] == '-') {
arg.erase(0, 2);
}
static bool matches(const std::string & arg, const command & cmd) {
if (arg == cmd.name) {
return true;
}
-11
View File
@@ -13,7 +13,6 @@ LLAMA_BUILD_EXAMPLES=OFF
LLAMA_BUILD_TOOLS=OFF
LLAMA_BUILD_TESTS=OFF
LLAMA_BUILD_SERVER=OFF
LLAMA_BUILD_MTMD=ON
GGML_METAL=ON
GGML_METAL_EMBED_LIBRARY=ON
GGML_BLAS_DEFAULT=ON
@@ -40,7 +39,6 @@ COMMON_CMAKE_ARGS=(
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
-DLLAMA_BUILD_MTMD=${LLAMA_BUILD_MTMD}
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
-DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
-DGGML_METAL=${GGML_METAL}
@@ -128,8 +126,6 @@ setup_framework_structure() {
cp ggml/include/ggml-cpu.h ${header_path}
cp ggml/include/ggml-blas.h ${header_path}
cp ggml/include/gguf.h ${header_path}
cp tools/mtmd/mtmd.h ${header_path}
cp tools/mtmd/mtmd-helper.h ${header_path}
# Create module map (common for all platforms)
cat > ${module_path}module.modulemap << EOF
@@ -251,7 +247,6 @@ combine_static_libraries() {
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
"${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
)
# Create temporary directory for processing
@@ -415,7 +410,6 @@ cmake -B build-ios-sim -G Xcode \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DMTMD_VIDEO=OFF \
-S .
cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
@@ -430,7 +424,6 @@ cmake -B build-ios-device -G Xcode \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DMTMD_VIDEO=OFF \
-S .
cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
@@ -457,7 +450,6 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DMTMD_VIDEO=OFF \
-S .
cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
@@ -473,7 +465,6 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DMTMD_VIDEO=OFF \
-S .
cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
@@ -490,7 +481,6 @@ cmake -B build-tvos-sim -G Xcode \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DMTMD_VIDEO=OFF \
-S .
cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
@@ -506,7 +496,6 @@ cmake -B build-tvos-device -G Xcode \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DMTMD_VIDEO=OFF \
-S .
cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
+4
View File
@@ -80,6 +80,8 @@ add_library(${TARGET}
http.h
imatrix-loader.cpp
imatrix-loader.h
json-partial.cpp
json-partial.h
json-schema-to-grammar.cpp
llguidance.cpp
log.cpp
@@ -94,8 +96,10 @@ add_library(${TARGET}
peg-parser.h
preset.cpp
preset.h
regex-partial.cpp
reasoning-budget.cpp
reasoning-budget.h
regex-partial.h
sampling.cpp
sampling.h
speculative.cpp
+253 -375
View File
@@ -17,7 +17,6 @@
# define NOMINMAX
#endif
#include <windows.h>
#include <shellapi.h>
#endif
#define JSON_ASSERT GGML_ASSERT
@@ -286,17 +285,108 @@ static std::string clean_file_name(const std::string & fname) {
return clean_fname;
}
// Try to fetch `preset.ini` from the Hugging Face repo named by params.model.hf_repo
// and, when it exists, apply the section selected by the repo tag to `params`.
//
// The remote preset is optional: when the download does not succeed the function
// logs a trace message and returns false without touching `params`.
// Returns true when a preset file was obtained and applied.
// Throws std::runtime_error when the file exists but has no section for the tag.
static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
    GGML_ASSERT(!params.model.hf_repo.empty());

    // the returned hf_repo is without tag
    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);

    // "latest" tag (default if not specified) is translated to "default" preset
    if (hf_tag == "latest") {
        hf_tag = "default";
    }

    const std::string model_endpoint = common_get_model_endpoint();
    const auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";

    // local cache location of the downloaded preset
    const auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
    const auto preset_path  = fs_get_cache_file(preset_fname);

    common_download_opts opts;
    opts.bearer_token = params.hf_token;
    opts.offline      = params.offline;

    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
    const int  status     = common_download_file_single(preset_url, preset_path, opts);
    const bool has_preset = status >= 200 && status < 400;

    // remote preset is optional, so a missing file is not an error
    if (!has_preset) {
        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
        return false;
    }

    LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
    common_preset_context ctx(ex, /* only_remote_allowed */ true);
    common_preset global;
    auto remote_presets = ctx.load_from_ini(preset_path, global);
    remote_presets = ctx.cascade(global, remote_presets);

    const auto found = remote_presets.find(hf_tag);
    if (found == remote_presets.end()) {
        throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
    }

    common_preset preset = found->second;
    LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
    preset.apply_to_params(params);

    return true;
}
// Extra artifacts reported by common_params_handle_model() next to the main model.
struct handle_model_result {
// true when the Hugging Face download reported an mmproj file; `mmproj.path` then holds its path
bool found_mmproj = false;
common_params_model mmproj;
// true when the Hugging Face download reported an MTP file; `mtp.path` then holds its path
bool found_mtp = false;
common_params_model mtp;
// NOTE(review): the two preset fields are not assigned by the code visible here - confirm they are still used
bool found_preset = false;
std::string preset_path;
};
// Resolve a single model description to a local file, downloading it if needed.
//
// Exactly one source is used, in this priority order: docker repo, Hugging Face
// repo, plain URL. `model.path` (and `model.name` for docker/HF) is updated in place.
// For a Hugging Face source the returned result also reports the mmproj / MTP
// files found by the download, if any.
// Throws std::runtime_error when an HF or URL download yields no model path.
static handle_model_result common_params_handle_model(struct common_params_model & model,
                                                      const common_download_opts & opts) {
    handle_model_result result;

    // 1) docker repo
    if (!model.docker_repo.empty()) {
        model.path = common_docker_resolve_model(model.docker_repo);
        model.name = model.docker_repo;
        return result;
    }

    // 2) Hugging Face repo
    if (!model.hf_repo.empty()) {
        // If -m was used with -hf, treat the model "path" as the hf_file to download
        const bool path_is_hf_file = model.hf_file.empty() && !model.path.empty();
        if (path_is_hf_file) {
            model.hf_file = model.path;
            model.path    = "";
        }

        common_download_opts hf_opts = opts;
        auto downloaded = common_download_model(model, hf_opts);
        if (downloaded.model_path.empty()) {
            throw std::runtime_error("failed to download model from Hugging Face");
        }

        model.name = model.hf_repo;
        model.path = downloaded.model_path;

        result.found_mmproj = !downloaded.mmproj_path.empty();
        if (result.found_mmproj) {
            result.mmproj.path = downloaded.mmproj_path;
        }

        result.found_mtp = !downloaded.mtp_path.empty();
        if (result.found_mtp) {
            result.mtp.path = downloaded.mtp_path;
        }
        return result;
    }

    // 3) plain URL
    if (!model.url.empty()) {
        if (model.path.empty()) {
            // derive the cache file name from the URL: drop "#fragment" and "?query", keep the last path segment
            const std::string no_fragment = string_split<std::string>(model.url, '#').front();
            const std::string no_query    = string_split<std::string>(no_fragment, '?').front();
            model.path = fs_get_cache_file(string_split<std::string>(no_query, '/').back());
        }

        auto downloaded = common_download_model(model, opts);
        if (downloaded.model_path.empty()) {
            throw std::runtime_error("failed to download model from " + model.url);
        }
    }

    return result;
}
const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
@@ -340,242 +430,62 @@ static bool parse_bool_value(const std::string & value) {
throw std::invalid_argument("the argument has been removed. " + msg);
}
//
// common_models_handler
//
static std::string get_default_local_path(const std::string & url) {
auto f = string_split<std::string>(url, '#').front();
f = string_split<std::string>(f, '?').front();
return fs_get_cache_file(string_split<std::string>(f, '/').back());
}
// Build the models handler for `params`: prepares the download options and a
// Hugging Face download plan for each of the main model, the speculative draft
// model and the vocoder model that has an hf_repo set (the plan stays
// default-constructed otherwise).
common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) {
    const auto & spec_types = params.speculative.types;
    const bool spec_type_draft_mtp =
        std::find(spec_types.begin(), spec_types.end(), COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != spec_types.end();

    // only download mmproj if the current example is using it
    const bool use_mmproj =
        std::find(std::begin(mmproj_examples), std::end(mmproj_examples), curr_ex) != std::end(mmproj_examples);

    // mmproj is auto-downloaded only when the user neither disabled it nor supplied one explicitly
    const bool mmproj_given = !params.mmproj.path.empty() || !params.mmproj.url.empty();

    common_download_opts opts;
    opts.bearer_token    = params.hf_token;
    opts.offline         = params.offline;
    opts.download_mtp    = spec_type_draft_mtp;
    opts.download_mmproj = use_mmproj && !params.no_mmproj && !mmproj_given;

    // plan for one model; left default-constructed when no HF repo is set
    auto plan_for = [&opts](const common_params_model & model) -> common_download_hf_plan {
        common_download_hf_plan result;
        if (!model.hf_repo.empty()) {
            result = common_download_get_hf_plan(model, opts);
        }
        return result;
    };

    common_download_hf_plan plan      = plan_for(params.model);
    common_download_hf_plan plan_spec = plan_for(params.speculative.draft.mparams);
    common_download_hf_plan plan_voc  = plan_for(params.vocoder.model);

    return common_models_handler{plan, plan_spec, plan_voc, opts};
}
// Report whether the main plan of `handler` carries a preset URL.
bool common_models_handler_is_preset_repo(const common_models_handler & handler) {
    const bool has_preset_url = !handler.plan.preset.url.empty();
    return has_preset_url;
}
// Build the download tasks for a model given by plain URL (non-HF).
//
// model: supplies the URL (expanded to all of its parts) and the optional user path (-m)
// opts:  download options copied into every task
//
// Single-part model: one task that downloads to model.path when set, otherwise
// to the cache default for the URL.
// Multi-part model: one task per part; each part keeps its own file name and is
// placed in the directory of model.path when set, otherwise in the cache default.
static std::vector<common_download_task> build_url_tasks(const common_params_model & model, common_download_opts opts) {
    auto parts = common_download_get_all_parts(model.url);

    std::vector<common_download_task> tasks;

    // single-part: download straight to model.path if the user gave one (-m), else the cache default
    if (parts.size() == 1) {
        common_download_task task;
        task.url        = parts[0];
        task.local_path = model.path.empty() ? get_default_local_path(parts[0]) : model.path;
        task.opts       = opts;
        tasks.push_back(std::move(task));
        return tasks;
    }

    // multi-part: place each part under the user's -m directory (if given), else the cache default.
    // Whether a directory was given is tracked separately from its text: for a path directly
    // under the root (e.g. "/model.gguf") the directory text is empty, and base_dir + "/" + name
    // then correctly yields "/name" instead of silently falling back to the cache.
    const bool has_user_dir = !model.path.empty();
    std::string base_dir;
    if (has_user_dir) {
        const auto pos = model.path.rfind('/');
        base_dir = pos == std::string::npos ? std::string(".") : model.path.substr(0, pos);
    }

    for (const auto & part : parts) {
        common_download_task task;
        task.url  = part;
        task.opts = opts;

        std::string local = get_default_local_path(part);
        if (has_user_dir) {
            // keep only the file name of the cache default and move it into the user's directory
            const auto pos = local.rfind('/');
            const std::string name = pos == std::string::npos ? local : local.substr(pos + 1);
            local = base_dir + "/" + name;
        }
        task.local_path = local;

        tasks.push_back(std::move(task));
    }
    return tasks;
}
// Resolve every model referenced by `params` (main model, mmproj, vocoder, speculative draft)
// to a local path.
//
// The function collects one download task per file (plain-URL files first, then the files of
// the Hugging Face plans held by `handler`), runs the tasks unless params.offline is set, and
// finally invokes the on_done callback of every task, which writes the resulting paths back
// into `params`.
//
// handler:  plans and base download options prepared by common_models_handler_init()
// params:   read for the model sources and updated in place with the local paths
// callback: stored in the copy of the download options that is attached to the tasks
void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) {
std::vector<common_download_task> tasks;
auto & plan = handler.plan;
auto & plan_spec = handler.plan_spec;
auto & plan_voc = handler.plan_voc;
auto opts = handler.opts; // copy
opts.callback = callback;
// handle plain "url" if needed
// (a model given by URL without an explicit path gets the cache default as its path)
auto handle_url = [&](common_params_model & model) {
if (!model.url.empty()) {
if (model.path.empty()) {
model.path = get_default_local_path(model.url);
}
}
};
handle_url(params.model);
handle_url(params.mmproj);
handle_url(params.vocoder.model);
handle_url(params.speculative.draft.mparams);
// optionally, if docker repo is set, resolve it
// (the resolved value is stored as the model URL, so it is downloaded by the URL branch below)
if (!params.model.docker_repo.empty()) {
params.model.url = common_docker_resolve_model(params.model.docker_repo);
params.model.path = get_default_local_path(params.model.url);
}
// handle plain "url" tasks (non-hf)
if (!params.model.url.empty()) {
auto url_tasks = build_url_tasks(params.model, opts);
// the first part is what gets loaded, so point params.model.path at it
if (!url_tasks.empty()) {
// first_path is captured by value because url_tasks goes out of scope before on_done runs
std::string first_path = url_tasks.front().local_path;
url_tasks.front().on_done = [&, first_path]() { params.model.path = first_path; };
}
for (auto & task : url_tasks) {
tasks.push_back(std::move(task));
}
}
if (!params.mmproj.url.empty()) {
common_download_task task;
task.url = params.mmproj.url;
task.local_path = params.mmproj.path;
task.opts = opts;
tasks.push_back(task);
}
if (!params.vocoder.model.url.empty()) {
common_download_task task;
task.url = params.vocoder.model.url;
task.local_path = params.vocoder.model.path;
task.opts = opts;
tasks.push_back(task);
}
if (!params.speculative.draft.mparams.url.empty()) {
common_download_task task;
task.url = params.speculative.draft.mparams.url;
task.local_path = params.speculative.draft.mparams.path;
task.opts = opts;
tasks.push_back(task);
}
// handle hf_plan tasks
// add_tasks queues one task per file of a plan; the third emplace_back argument is presumably
// stored as the task's on_done callback - confirm against common_download_task
auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files, common_params_model & model) {
for (size_t i = 0; i < model_files.size(); ++i) {
auto & model_file = model_files[i];
bool is_first = (i == 0);
tasks.emplace_back(model_file, opts, [&, is_first]() {
if (is_first) {
// only use first part as model path
model.path = hf_cache::finalize_file(model_file);
} else {
hf_cache::finalize_file(model_file);
}
});
}
};
if (!plan.model_files.empty()) {
add_tasks(plan.model_files, params.model);
}
if (!plan.mmproj.local_path.empty()) {
tasks.emplace_back(plan.mmproj, opts, [&]() {
params.mmproj.path = hf_cache::finalize_file(plan.mmproj);
});
}
if (!plan.mtp.local_path.empty()) {
tasks.emplace_back(plan.mtp, opts, [&]() {
// only fall back to the discovered MTP head when no draft was explicitly provided
if (params.speculative.draft.mparams.empty()) {
params.speculative.draft.mparams.path = hf_cache::finalize_file(plan.mtp);
} else {
hf_cache::finalize_file(plan.mtp);
}
});
}
if (!plan.preset.local_path.empty()) {
tasks.emplace_back(plan.preset, opts, [&]() {
// if HF repo is a preset repo, we simply run server in router mode with the preset.ini file
params.models_preset_hf = params.model.hf_repo; // only for showing a warning
params.models_preset = hf_cache::finalize_file(plan.preset);
params.model = common_params_model{}; // make sure to clear model, so server starts in router mode
});
}
// handle plan_spec (e.g. --spec-draft-hf)
if (!plan_spec.model_files.empty()) {
add_tasks(plan_spec.model_files, params.speculative.draft.mparams);
}
// handle vocoder plan (e.g. --hf-repo-v)
if (!plan_voc.model_files.empty()) {
add_tasks(plan_voc.model_files, params.vocoder.model);
}
// run all tasks in parallel
if (!params.offline) {
// if duplicated files are found, only download once (but still call on_done for each task)
// tasks are considered duplicates when they share the same local_path; the first one seen is kept
std::unordered_map<std::string, common_download_task *> unique_tasks;
for (auto & task : tasks) {
auto it = unique_tasks.find(task.local_path);
if (it == unique_tasks.end()) {
unique_tasks[task.local_path] = &task;
}
}
// copies of the unique tasks are what gets executed; the originals in `tasks` keep their callbacks
std::vector<common_download_task> unique_tasks_vec;
for (auto & pair : unique_tasks) {
unique_tasks_vec.push_back(*pair.second);
}
common_download_run_tasks(unique_tasks_vec);
}
// download successful, update params with the downloaded paths
// (this loop also runs when params.offline is set and no download was attempted)
for (const auto & task : tasks) {
if (task.on_done) {
task.on_done();
}
}
}
//
// CLI argument parsing functions
//
// Resolve (and download when needed) the main model and its companion models
// (mmproj, speculative draft, vocoder) described by `params`.
//
// Returns true when all models were handled and false when the download was
// skipped (common_skip_download_exception). Any other exception is propagated
// to the caller.
bool common_params_handle_models(common_params & params, llama_example curr_ex) {
    const auto & spec_types = params.speculative.types;
    const bool spec_type_draft_mtp =
        std::find(spec_types.begin(), spec_types.end(), COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != spec_types.end();

    common_download_opts opts;
    opts.bearer_token    = params.hf_token;
    opts.offline         = params.offline;
    opts.skip_download   = params.skip_download;
    opts.download_mtp    = spec_type_draft_mtp;
    opts.download_mmproj = !params.no_mmproj;

    // sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
    // so we should not auto-discover mtp/mmproj siblings for them
    common_download_opts sub_opts = opts;
    sub_opts.download_mtp    = false;
    sub_opts.download_mmproj = false;

    try {
        const auto res = common_params_handle_model(params.model, opts);

        if (params.no_mmproj) {
            params.mmproj = {};
        } else {
            // optionally, handle mmproj model when -hf is specified
            const bool mmproj_unset = params.mmproj.path.empty() && params.mmproj.url.empty();
            if (res.found_mmproj && mmproj_unset) {
                params.mmproj = res.mmproj;
            }
        }

        // only download mmproj if the current example is using it
        const bool ex_uses_mmproj =
            std::find(std::begin(mmproj_examples), std::end(mmproj_examples), curr_ex) != std::end(mmproj_examples);
        if (ex_uses_mmproj) {
            common_params_handle_model(params.mmproj, sub_opts);
        }

        // when --spec-type mtp is set and no draft model was provided explicitly,
        // fall back to the MTP head discovered alongside the -hf model
        auto & draft = params.speculative.draft.mparams;
        const bool draft_unset = draft.path.empty() && draft.hf_repo.empty() && draft.url.empty();
        if (spec_type_draft_mtp && res.found_mtp && draft_unset) {
            draft.path = res.mtp.path;
        }

        common_params_handle_model(draft, sub_opts);
        common_params_handle_model(params.vocoder.model, sub_opts);

        return true;
    } catch (const common_skip_download_exception &) {
        return false;
    } catch (const std::exception &) {
        // every other failure is passed on unchanged
        throw;
    }
}
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params;
@@ -691,6 +601,30 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// parse the first time to get -hf option (used for remote preset)
parse_cli_args();
// export_graph_ops loads only metadata
const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
// maybe handle remote preset
if (!params.model.hf_repo.empty() && !skip_model_download) {
std::string cli_hf_repo = params.model.hf_repo;
bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
// special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
// this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
std::string preset_hf_repo = params.model.hf_repo;
bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
if (has_preset) {
// re-parse CLI args to override preset values
parse_cli_args();
}
// preserve hf_repo from preset if needed
if (preset_has_hf_repo) {
params.model.hf_repo = preset_hf_repo;
}
}
postprocess_cpu_params(params.cpuparams, nullptr);
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
@@ -701,26 +635,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
const bool skip_model_download =
// server will call common_params_handle_models() later, so we skip it here
ctx_arg.ex == LLAMA_EXAMPLE_SERVER ||
// download calls common_params_handle_models() itself and prints the paths
ctx_arg.ex == LLAMA_EXAMPLE_DOWNLOAD ||
// export_graph_ops loads only metadata
ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
// handle model and download
if (!skip_model_download) {
// handle model and download
common_models_handler handler = common_models_handler_init(params, ctx_arg.ex);
common_models_handler_apply(handler, params);
common_params_handle_models(params, ctx_arg.ex);
}
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty()
&& !params.usage
&& !params.completion) {
throw std::invalid_argument("error: --model is required\n");
}
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
throw std::invalid_argument("error: --model is required\n");
}
if (params.escape) {
@@ -784,19 +707,15 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
common_options.push_back(&opt);
}
}
bool first = true;
auto print_section = [&](const char * header, std::vector<common_arg *> & options) {
if (options.empty()) {
return;
}
printf("%s----- %s -----\n\n", first ? "" : "\n\n", header);
first = false;
print_options(options);
};
print_section("common params", common_options);
print_section("sampling params", sampling_options);
print_section("speculative params", spec_options);
print_section("example-specific params", specific_options);
printf("----- common params -----\n\n");
print_options(common_options);
printf("\n\n----- sampling params -----\n\n");
print_options(sampling_options);
printf("\n\n----- speculative params -----\n\n");
print_options(spec_options);
// TODO: maybe convert enum llama_example to string
printf("\n\n----- example-specific params -----\n\n");
print_options(specific_options);
}
static void common_params_print_completion(common_params_context & ctx_arg) {
@@ -1018,44 +937,7 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
return true;
}
#ifdef _WIN32
// UTF-8 copy of the process command line in argv form (filled by make_utf8_argv()).
struct utf8_argv {
// argument strings; they own the character storage
std::vector<std::string> buf;
// one pointer per entry of `buf`, followed by a terminating nullptr
std::vector<char*> ptrs;
};
// Rebuild the argument vector from the wide-character process command line,
// converting every argument to UTF-8.
//
// Returns an empty utf8_argv when the command line cannot be split. An argument
// that fails the size query (e.g. invalid UTF-16) is stored as an empty string,
// so the number of entries always matches the number of wide arguments.
static utf8_argv make_utf8_argv() {
    utf8_argv result;

    int n_wide = 0;
    LPWSTR * wide_args = CommandLineToArgvW(GetCommandLineW(), &n_wide);
    if (wide_args == nullptr) {
        return result;
    }

    result.buf.reserve(n_wide);
    for (int idx = 0; idx < n_wide; ++idx) {
        // first call: query the required size in bytes, including the terminating NUL
        const int n_bytes = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wide_args[idx], -1, nullptr, 0, nullptr, nullptr);

        std::string & arg = result.buf.emplace_back();
        if (n_bytes <= 0) {
            continue;
        }

        // second call: convert into the string's buffer; the NUL lands on the string's own terminator
        arg.resize(static_cast<size_t>(n_bytes - 1));
        (void) WideCharToMultiByte(CP_UTF8, 0, wide_args[idx], -1, arg.data(), n_bytes, nullptr, nullptr);
    }
    LocalFree(wide_args);

    result.ptrs.reserve(result.buf.size() + 1);
    for (std::string & arg : result.buf) {
        result.ptrs.push_back(arg.data());
    }
    result.ptrs.push_back(nullptr);

    return result;
}
#endif
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
#ifdef _WIN32
auto utf8 = make_utf8_argv();
// repair argv only when it matches the process command line
if (static_cast<int>(utf8.buf.size()) == argc) {
argv = utf8.ptrs.data();
}
#endif
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -1196,9 +1078,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
* - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
*/
auto add_opt = [&](common_arg arg) {
// download only exposes the handful of args explicitly tagged for it
const bool inherit_common = ex != LLAMA_EXAMPLE_DOWNLOAD;
if ((arg.in_example(ex) || (inherit_common && arg.in_example(LLAMA_EXAMPLE_COMMON))) && !arg.is_exclude(ex)) {
if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
ctx_arg.options.push_back(std::move(arg));
}
};
@@ -1209,7 +1089,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.usage = true;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}));
));
add_opt(common_arg(
{"--version"},
"show version and build info",
@@ -1480,7 +1360,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"--cache-idle-slots"},
{"--no-cache-idle-slots"},
"save idle slots to the prompt cache on new task, and clear them when using unified KV (default: enabled, requires cache-ram)",
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
[](common_params & params, bool value) {
params.cache_idle_slots = value;
}
@@ -1735,7 +1615,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
[](common_params & params, const std::string & value) {
const auto sampler_names = string_split<std::string>(value, ';');
params.sampling.samplers = common_sampler_types_from_names(sampler_names);
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
}
).set_sampling());
@@ -2331,7 +2211,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, bool value) {
params.no_mmproj = !value;
}
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MMPROJ_AUTO"));
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
add_opt(common_arg(
{"--mmproj-offload"},
{"--no-mmproj-offload"},
@@ -2341,8 +2221,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image", "--audio", "--video"}, "FILE",
"path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files\n",
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
params.image.emplace_back(item);
@@ -2363,13 +2243,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image_max_tokens = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
add_opt(common_arg(
{"--mtmd-batch-max-tokens"}, "N",
string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens),
[](common_params & params, int value) {
params.mtmd_batch_max_tokens = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS"));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
@@ -2730,14 +2603,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL"));
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
add_opt(common_arg(
{"-mu", "--model-url"}, "MODEL_URL",
"model download url (default: unused)",
[](common_params & params, const std::string & value) {
params.model.url = value;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL_URL"));
).set_env("LLAMA_ARG_MODEL_URL"));
add_opt(common_arg(
{ "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
"Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
@@ -2746,7 +2619,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.model.docker_repo = value;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_DOCKER_REPO"));
).set_env("LLAMA_ARG_DOCKER_REPO"));
add_opt(common_arg(
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@@ -2756,14 +2629,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.model.hf_repo = value;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_REPO"));
).set_env("LLAMA_ARG_HF_REPO"));
add_opt(common_arg(
{"-hff", "--hf-file"}, "FILE",
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
[](common_params & params, const std::string & value) {
params.model.hf_file = value;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_FILE"));
).set_env("LLAMA_ARG_HF_FILE"));
add_opt(common_arg(
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
"Hugging Face model repository for the vocoder model (default: unused)",
@@ -2784,14 +2657,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.hf_token = value;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("HF_TOKEN"));
add_opt(common_arg(
{"--mtp"},
"also download the multi-token prediction (MTP) head, if available (default: unused)",
[](common_params & params) {
params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_DRAFT_MTP);
}
).set_examples({LLAMA_EXAMPLE_DOWNLOAD}));
).set_env("HF_TOKEN"));
add_opt(common_arg(
{"--context-file"}, "FNAME",
"file to load context from (use comma-separated values to specify multiple files)",
@@ -3001,26 +2867,62 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.api_prefix = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
// Deprecated: use --ui-config instead (kept for backward compat)
add_opt(common_arg(
{"--ui-config", "--webui-config"}, "JSON",
{"--webui-config"}, "JSON",
"[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)",
[](common_params & params, const std::string & value) {
params.ui_config_json = value;
params.webui_config_json = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
add_opt(common_arg(
{"--ui-config"}, "JSON",
"JSON that provides default UI settings (overrides UI defaults)",
[](common_params & params, const std::string & value) {
params.ui_config_json = value;
params.webui_config_json = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
// Deprecated: use --ui-config-file instead (kept for backward compat)
add_opt(common_arg(
{"--ui-config-file", "--webui-config-file"}, "PATH",
{"--webui-config-file"}, "PATH",
"[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)",
[](common_params & params, const std::string & value) {
params.ui_config_json = read_file(value);
params.webui_config_json = params.ui_config_json;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
add_opt(common_arg(
{"--ui-config-file"}, "PATH",
"JSON file that provides default UI settings (overrides UI defaults)",
[](common_params & params, const std::string & value) {
params.ui_config_json = read_file(value);
params.webui_config_json = params.ui_config_json;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
// Deprecated: use --ui-mcp-proxy instead (kept for backward compat)
add_opt(common_arg(
{"--ui-mcp-proxy", "--webui-mcp-proxy"},
{"--no-ui-mcp-proxy", "--no-webui-mcp-proxy"},
{"--webui-mcp-proxy"},
{"--no-webui-mcp-proxy"},
"[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy",
[](common_params & params, bool value) {
params.ui_mcp_proxy = value;
params.webui_mcp_proxy = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
add_opt(common_arg(
{"--ui-mcp-proxy"},
{"--no-ui-mcp-proxy"},
"experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
[](common_params & params, bool value) {
params.ui_mcp_proxy = value;
params.webui_mcp_proxy = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
add_opt(common_arg(
@@ -3032,26 +2934,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.server_tools = parse_csv_row(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
// Deprecated: use --ui/--no-ui instead (kept for backward compat)
add_opt(common_arg(
{"-ag", "--agent"},
{"-no-ag", "--no-agent"},
"whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
{"--webui"},
{"--no-webui"},
"[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI",
[](common_params & params, bool value) {
if (value) {
params.server_tools = {"all"};
params.ui_mcp_proxy = true;
} else {
params.server_tools.clear();
params.ui_mcp_proxy = false;
}
params.ui = value;
params.webui = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_AGENT"));
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
add_opt(common_arg(
{"--ui", "--webui"},
{"--no-ui", "--no-webui"},
{"--ui"},
{"--no-ui"},
string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.ui = value;
params.webui = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
add_opt(common_arg(
@@ -3082,7 +2982,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
add_opt(common_arg(
{"--api-key-file"}, "FNAME",
"path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)",
"path to file containing API keys (default: none)",
[](common_params & params, const std::string & value) {
std::ifstream key_file(value);
if (!key_file) {
@@ -3090,7 +2990,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
std::string key;
while (std::getline(key_file, key)) {
if (!key.empty() && key[0] != '#') {
if (!key.empty()) {
params.api_keys.push_back(key);
}
}
@@ -3296,20 +3196,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.reasoning_budget_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
add_opt(common_arg(
{"--reasoning-preserve"},
{"--no-reasoning-preserve"},
"preserve reasoning trace in the full history, not just the last assistant message (default: template default)\n"
"compatible with certain templates having 'supports_preserve_reasoning' capability\n"
"example: https://docs.z.ai/guides/capabilities/thinking-mode#preserved-thinking",
[](common_params & params, bool value) {
if (value) {
params.default_template_kwargs["preserve_reasoning"] = "true";
} else {
params.default_template_kwargs["preserve_reasoning"] = "false";
}
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING_PRESERVE"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
@@ -3447,13 +3333,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
common_log_set_file(common_log_main(), value.c_str());
}
).set_env("LLAMA_ARG_LOG_FILE"));
add_opt(common_arg(
{"--log-prompts-dir"}, "PATH",
"Log prompts to directory (only used for debugging, default: disabled)",
[](common_params & params, const std::string & value) {
params.path_prompts_log_dir = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--log-colors"}, "[on|off|auto]",
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -3485,7 +3364,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.offline = true;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_OFFLINE"));
).set_env("LLAMA_ARG_OFFLINE"));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
@@ -3762,7 +3641,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.draft.mparams.path = value;
params.speculative.draft.mparams.hf_file = value; // will be used if --spec-draft-hf is set
}
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
add_opt(common_arg(
+5 -17
View File
@@ -1,14 +1,12 @@
#pragma once
#include "common.h"
#include "download.h"
#include <set>
#include <map>
#include <string>
#include <vector>
#include <cstring>
#include <memory>
// pseudo-env variable to identify preset-only arguments
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
@@ -131,21 +129,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
// see: https://github.com/ggml-org/llama.cpp/issues/18163
void common_params_add_preset_options(std::vector<common_arg> & args);
// Aggregates three Hugging Face download plans together with the download
// options; it is the state object passed to the common_models_handler_*
// functions declared right after it.
struct common_models_handler {
// NOTE(review): presumably the plan for the main model — confirm against the implementation
common_download_hf_plan plan;
// NOTE(review): presumably the plan for the speculative/draft model — confirm against the implementation
common_download_hf_plan plan_spec;
// NOTE(review): presumably the plan for the vocoder model — confirm against the implementation
common_download_hf_plan plan_voc;
// download options shared by the plans above
common_download_opts opts;
};
// initialize downloading opts and hf_plan if needed, but does not download anything yet
common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex);
// check if the model is a preset repo (i.e. has a preset file)
bool common_models_handler_is_preset_repo(const common_models_handler & handler);
// download and update params with the downloaded model path
void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback = nullptr);
// Populates model paths (main model, mmproj, etc.) from -hf if necessary.
// Returns true if the model is ready to use.
// Throws an exception if an error prevents the model from being used (e.g. network error, model not found).
// If params.skip_download is true, no downloads will be attempted. Returns false if the model is invalid or missing (e.g. ETag check failed).
bool common_params_handle_models(common_params & params, llama_example curr_ex);
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+7 -11
View File
@@ -103,10 +103,6 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
};
if (autoparser.tools.format.openai_wrapper_trigger) {
// model emits the OpenAI function wrapper, trigger on it
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "{\"type\": \"function\"," });
}
}
}
@@ -138,7 +134,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs, cons
auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
parser = ctx.reasoning_parser + p.space() + p.choice({
p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
p.space() + response_format + p.space()
response_format
}) + p.end();
pure_content = false;
} else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
@@ -228,13 +224,13 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
auto single_tool_parser = p.standard_json_tools(
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
} else {
tools_parser = p.standard_json_tools(
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger);
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
}
// Handle content wrappers if present
@@ -395,11 +391,11 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
arguments.name_suffix) +
arguments.value_prefix +
(schema_info.resolves_to_string(param_schema) ?
p.ac(p.tool_arg_string_value(until_suffix) +
p.tool_arg_close(p.literal(arguments.value_suffix)), arguments.value_suffix) :
(p.tool_arg_json_value(p.schema(
p.tool_arg_string_value(until_suffix) :
p.tool_arg_json_value(p.schema(
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
p.tool_arg_close(p.literal(arguments.value_suffix)))));
p.space()) +
p.tool_arg_close(p.literal(arguments.value_suffix)));
auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
if (is_required) {
-1
View File
@@ -181,7 +181,6 @@ struct tool_format_analysis {
bool fun_name_is_key = false; // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
bool tools_array_wrapped = false; // Tool calls wrapped in JSON array [...]
bool openai_wrapper_trigger = false; // model emits the OpenAI function wrapper, trigger on it
std::string function_field = "function";
std::string name_field = "name";
+4 -11
View File
@@ -165,14 +165,6 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
}
},
// template uses the JSON {name, parameters} tool instruction, emits the OpenAI function wrapper
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
if (tmpl.src.find("Respond in the format {\"name\": function name") != std::string::npos &&
tmpl.src.find("Do not use variables.") != std::string::npos) {
analysis.tools.format.openai_wrapper_trigger = true;
LOG_DBG(ANSI_ORANGE "[Patch: JSON name/parameters tool instruction]\n" ANSI_RESET);
}
},
});
@@ -1237,8 +1229,8 @@ void analyze_tools::extract_argument_name_markers() {
left_result.tags["pre"] == right_result.tags["pre"] &&
left_result.tags["suffix"] == right_result.tags["suffix"]) {
// Name is inside a structure (e.g., JSON key): prefix is the shared wrapper
arguments.name_prefix = left_result.tags["pre"];
arguments.name_suffix = left_result.tags["suffix"];
arguments.name_prefix = trim_whitespace(left_result.tags["pre"]);
arguments.name_suffix = trim_leading_whitespace(left_result.tags["suffix"]);
} else if (diff.left.substr(0, ARG_FIRST.length()) == ARG_FIRST && diff.right.substr(0, ARG_SECOND.length()) == ARG_SECOND) {
// Name is directly in the diff: prefix comes from last marker in diff.prefix
auto pre_parser = build_tagged_peg_parser([&](common_peg_parser_builder & p) {
@@ -1323,7 +1315,8 @@ void analyze_tools::extract_argument_value_markers() {
value_suffix = value_suffix.substr(0, end_marker_pos);
}
}
if (!trim_whitespace(value_suffix).empty()) {
value_suffix = trim_leading_whitespace(value_suffix);
if (!value_suffix.empty()) {
arguments.value_suffix = value_suffix;
}
}
+17 -70
View File
@@ -87,8 +87,6 @@ static std::string normalize_quotes_to_json(const std::string & input) {
bool in_single_quoted = false;
bool in_double_quoted = false;
auto is_word_char = [](char ch) { return std::isalnum(static_cast<unsigned char>(ch)) || ch == '_'; };
for (size_t i = 0; i < input.size(); ++i) {
char c = input[i];
@@ -153,29 +151,6 @@ static std::string normalize_quotes_to_json(const std::string & input) {
in_single_quoted = true;
result += '"';
}
} else if (!in_single_quoted && !in_double_quoted && (c == 'T' || c == 'F' || c == 'N') &&
(i == 0 || !is_word_char(input[i - 1]))) {
// Python literals -> JSON; prefix match keeps streamed partials monotonic.
static constexpr std::pair<std::string_view, std::string_view> literals[] = {
{ "True", "true" }, { "False", "false" }, { "None", "null" },
};
size_t n = 0;
while (i + n < input.size() && is_word_char(input[i + n])) {
++n;
}
std::string_view token(input.data() + i, n);
bool matched = false;
for (const auto & [py, js] : literals) {
if (py.substr(0, n) == token) {
result += js.substr(0, n);
i += n - 1;
matched = true;
break;
}
}
if (!matched) {
result += c;
}
} else {
result += c;
}
@@ -363,7 +338,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
}
if ((is_arg_value || is_arg_string_value) && current_tool) {
std::string value_content = std::string(node.text);
std::string value_content = std::string(trim_trailing_space(trim_leading_space(node.text, 1), 1));
std::string value_to_add;
if (value_content.empty() && is_arg_string_value) {
@@ -378,8 +353,12 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
}
value_to_add += escape_json_string_inner(value_content);
} else if (!value_content.empty()) {
// Pythonic scalars/containers -> JSON.
value_to_add += normalize_container_value(value_content);
// For potential containers, normalize Python-style single quotes to JSON double quotes
bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
if (is_potential_container) {
value_content = normalize_container_value(value_content);
}
value_to_add += value_content;
}
args_target() += value_to_add;
@@ -487,34 +466,11 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
return force_tool_calls ? section : optional(section);
}
// Like python_value(), but the leaf also accepts JSON-cased true/false/null, used by LFM2/LFM2.5
//
// Builds the named rule "python-or-json-value". The rule body is supplied as a
// lambda and refers to python_or_json_value() again, so nested dicts/arrays are
// expressed recursively through the rule itself.
common_peg_parser common_chat_peg_builder::python_or_json_value() {
return rule("python-or-json-value", [this]() {
auto ws = space();
// recursive reference: any value nested inside a dict or an array
auto value = python_or_json_value();
// dict entry: python_string key, ":", value (whitespace allowed around the colon)
auto member = sequence({ python_string(), ws, literal(":"), ws, value });
// one or more entries separated by "," with whitespace allowed on both sides of the comma
auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) });
// dict: "{" followed by either an immediate "}" or the entries and a closing "}"; trailing whitespace is consumed
auto dict = rule("python-or-json-dict", [&]() {
return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }), ws });
});
// one or more values separated by ","
// NOTE(review): unlike `members`, no whitespace is accepted before the comma here — confirm this is intended
auto elements = sequence({ value, zero_or_more(sequence({ literal(","), ws, value })) });
// array: "[" followed by either an immediate "]" or the elements and a closing "]"; trailing whitespace is consumed
auto array = rule("python-or-json-array", [&]() {
return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }), ws });
});
// alternatives are listed containers first, then scalars; the json_bool()/json_null()
// entries are what this rule adds on top of the Python-style leaves
return choice({ dict, array, python_string(), python_number(),
python_bool(), python_null(), json_bool(), json_null() });
});
}
// Python-style tool calls: name(arg1="value1", arg2=123)
// Used only by LFM2 for now, so we don't merge it into autoparser
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
const ordered_json & tools,
bool parallel_tool_calls,
bool allow_json_literals) {
bool parallel_tool_calls) {
if (!tools.is_array() || tools.empty()) {
return eps();
}
@@ -540,16 +496,15 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
auto arg_name_parser = literal(prop_name);
common_peg_parser arg_value_parser = eps();
// Quoted literal as a value: normalize_quotes_to_json preserves escapes.
auto string_value_parser = tool_arg_value(choice({
literal("\"") + string_content('"') + literal("\""),
literal("'") + string_content('\'') + literal("'")
}));
auto string_value_parser = choice({
literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
});
if (is_string_type) {
arg_value_parser = string_value_parser;
} else {
arg_value_parser = tool_arg_value(allow_json_literals ? python_or_json_value() : python_value());
arg_value_parser = tool_arg_value(python_value());
}
// Full argument: name="value" or name=value
@@ -746,8 +701,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key,
const std::vector<std::string> & parameters_order,
bool accept_openai_wrapper) {
const std::vector<std::string> & parameters_order) {
auto tool_choices = choice();
auto name_key_parser = literal("\"" + effective_name_key + "\"");
@@ -809,13 +763,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
return idx_a < idx_b;
});
// accept an optional leading "type": "function" field when the model emits the OpenAI wrapper
common_peg_parser type_field = eps();
if (accept_openai_wrapper) {
type_field = optional(literal("\"type\"") + space() + literal(":") + space() +
literal("\"function\"") + space() + literal(",") + space());
}
auto ordered_body = tool_open(literal("{")) + space() + type_field;
auto ordered_body = tool_open(literal("{")) + space();
for (size_t i = 0; i < parser_pairs.size(); i++) {
ordered_body = ordered_body + parser_pairs[i].first;
if (i < parser_pairs.size() - 1) {
@@ -878,8 +826,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
bool function_is_key,
const std::string & call_id_key,
const std::string & gen_call_id_key,
const std::vector<std::string> & parameters_order,
bool accept_openai_wrapper) {
const std::vector<std::string> & parameters_order) {
if (!tools.is_array() || tools.empty()) {
return eps();
}
@@ -897,7 +844,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
if (!name_spec.first.empty() || !args_spec.first.empty()) {
tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key);
} else {
tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order, accept_openai_wrapper);
tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order);
}
}
+4 -9
View File
@@ -120,8 +120,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {
bool function_is_key = false,
const std::string & call_id_key = "",
const std::string & gen_call_id_key = "",
const std::vector<std::string> & parameters_order = {},
bool accept_openai_wrapper = false);
const std::vector<std::string> & parameters_order = {});
// Legacy-compatible helper for building XML/tagged style tool calls
// Used by tests and manual parsers
@@ -133,13 +132,9 @@ class common_chat_peg_builder : public common_peg_parser_builder {
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
// Used by LFM2 and similar templates
common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
bool parallel_tool_calls,
bool allow_json_literals);
bool parallel_tool_calls);
private:
// Python values plus JSON true/false/null.
common_peg_parser python_or_json_value();
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
const std::string & args_key,
@@ -158,8 +153,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key,
const std::vector<std::string> & parameters_order,
bool accept_openai_wrapper);
const std::vector<std::string> & parameters_order);
};
inline common_peg_arena build_chat_peg_parser(
@@ -201,3 +195,4 @@ struct tagged_peg_parser {
tagged_peg_parser build_tagged_peg_parser(
const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
+165 -455
View File
@@ -90,93 +90,41 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
return text;
}
// Map a role name to its common_chat_role value.
// Recognized names are "system", "assistant", "user" and "tool" (exact match);
// any other string yields COMMON_CHAT_ROLE_UNKNOWN.
common_chat_role common_chat_role_from_string(const std::string & role) {
    // lookup table of the recognized role names
    static const struct {
        const char *     name;
        common_chat_role value;
    } known_roles[] = {
        { "system",    COMMON_CHAT_ROLE_SYSTEM    },
        { "assistant", COMMON_CHAT_ROLE_ASSISTANT },
        { "user",      COMMON_CHAT_ROLE_USER      },
        { "tool",      COMMON_CHAT_ROLE_TOOL      },
    };

    for (const auto & entry : known_roles) {
        if (role == entry.name) {
            return entry.value;
        }
    }

    return COMMON_CHAT_ROLE_UNKNOWN;
}
// Return the textual name of a chat role.
// COMMON_CHAT_ROLE_UNKNOWN, and any value that is not one of the four named
// roles, is rendered as an empty string.
const char * common_chat_role_to_string(common_chat_role role) {
    if (role == COMMON_CHAT_ROLE_SYSTEM) {
        return "system";
    }
    if (role == COMMON_CHAT_ROLE_ASSISTANT) {
        return "assistant";
    }
    if (role == COMMON_CHAT_ROLE_USER) {
        return "user";
    }
    if (role == COMMON_CHAT_ROLE_TOOL) {
        return "tool";
    }
    // unknown or out-of-range role
    return "";
}
// Serialize the delimiter list as a JSON array.
// Each element is an object with two keys: "role" (the role name produced by
// common_chat_role_to_string) and "delimiter" (the delimiter value itself).
json common_chat_msg_delimiters::to_json() const {
    json out = json::array();
    for (const auto & item : delimiters) {
        // build the per-delimiter object explicitly, key by key
        json entry = json::object();
        entry["role"]      = common_chat_role_to_string(item.role);
        entry["delimiter"] = item.delimiter;
        out.push_back(entry);
    }
    return out;
}
common_chat_msg_delimiters common_chat_msg_delimiters_parse(const json & delimiters) {
common_chat_msg_delimiters result;
if (!delimiters.is_array()) {
return result;
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
if (delims.empty() || prompt.empty()) {
return {};
}
result.delimiters.reserve(delimiters.size());
for (const auto & d : delimiters) {
if (!d.is_object()) {
continue;
auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
std::vector<std::string> all_delims;
std::vector<common_peg_parser> tagged_messages;
all_delims.reserve(delims.size());
tagged_messages.reserve(delims.size());
for (const auto & d : delims) {
all_delims.push_back(d.delimiter);
}
result.delimiters.push_back({
common_chat_role_from_string(d.value("role", std::string())),
d.value("delimiter", std::string()),
});
}
return result;
}
void common_chat_msg_delimiters::tokenize(const llama_vocab * vocab) {
for (auto & d : delimiters) {
d.tokens = common_tokenize(vocab, d.delimiter, false, true);
}
}
common_chat_msg_spans common_chat_msg_delimiters::split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips) const {
std::vector<std::pair<common_chat_role, size_t>> matches;
auto skip = skips.begin();
for (size_t i = 0; i < tokens.size();) {
if (skip != skips.end() && i == skip->first) {
i += skip->second;
++skip;
continue;
auto any_delim = p.until_one_of(all_delims);
for (const auto & d : delims) {
tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
}
for (const auto & d : delimiters) {
if (i + d.tokens.size() > tokens.size()) {
continue;
}
if (std::equal(d.tokens.begin(), d.tokens.end(), tokens.begin() + i)) {
matches.emplace_back(d.role, i);
break;
}
return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
});
common_peg_parse_context ctx(prompt);
const auto result = parser.parse(ctx);
if (!result.success()) {
return {};
}
std::vector<common_chat_msg_span> spans;
ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
if (!node.tag.empty()) {
spans.push_back({ node.tag, node.start, node.end - node.start });
}
i++;
}
matches.emplace_back(COMMON_CHAT_ROLE_UNKNOWN, tokens.size());
common_chat_msg_spans spans;
for (size_t i = 0; i + 1 < matches.size(); i++) {
const auto & curr = matches[i];
const auto & next = matches[i + 1];
spans.add(curr.first, curr.second, next.second - curr.second);
}
});
return spans;
}
@@ -912,10 +860,6 @@ static std::string common_chat_template_direct_apply_impl(
if (inputs.add_generation_prompt) {
inp["add_generation_prompt"] = true;
}
if (inp.contains("preserve_reasoning") && inp["preserve_reasoning"].is_boolean()) {
bool enabled = inp["preserve_reasoning"].get<bool>();
jinja::caps_apply_preserve_reasoning(ctx, enabled);
}
jinja::global_from_json(ctx, inp, inputs.mark_input);
@@ -1137,13 +1081,13 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
data.prompt = prompt;
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
data.message_delimiters = {
{ COMMON_CHAT_ROLE_ASSISTANT, "<|start|>assistant" },
{ COMMON_CHAT_ROLE_USER, "<|start|>user" },
{ COMMON_CHAT_ROLE_SYSTEM, "<|start|>developer" },
{ COMMON_CHAT_ROLE_SYSTEM, "<|start|>system" },
{ COMMON_CHAT_ROLE_TOOL, "<|start|>functions" },
};
data.message_spans = common_chat_split_by_role(prompt, {
{ "assistant", "<|start|>assistant" },
{ "user", "<|start|>user" },
{ "system", "<|start|>developer" },
{ "system", "<|start|>system" },
{ "tool", "<|start|>functions" },
});
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
@@ -1284,10 +1228,10 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
data.prompt += data.generation_prompt;
}
data.message_delimiters = {
{ COMMON_CHAT_ROLE_USER, "<|turn>user" },
{ COMMON_CHAT_ROLE_ASSISTANT, "<|turn>model" },
};
data.message_spans = common_chat_split_by_role(data.prompt, {
{ "user", "<|turn>user\n" },
{ "assistant", "<|turn>model\n" },
});
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
data.supports_thinking = true;
@@ -1664,52 +1608,42 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
return data;
}
// LFM2/LFM2.5 parser. Tool calls are almost Python-style and parallel-capable
// (except dotted names and JSON literals true/false/null).
// Always wrapped in <|tool_call_start|>[name(args)]<|tool_call_end|> with optional <think> reasoning.
// tool_list_tokens preserves LFM2 system tool-list markers.
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs,
bool tool_list_tokens) {
// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt
// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls.
// - Reasoning: <think>{reasoning}</think> (optional)
// - Content: text before a tool call (optional)
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
// Tool calls can appear multiple times (parallel tool calls supported)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<|tool_list_start|>",
"<|tool_list_end|>",
"<|tool_call_start|>",
"<|tool_call_end|>",
"<think>",
"</think>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
const std::string TOOL_LIST_START = "<|tool_list_start|>";
const std::string TOOL_LIST_END = "<|tool_list_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
const std::string GEN_PROMPT = "<|im_start|>assistant\n";
// Copy reasoning to the "thinking" field the template expects
auto adjusted_messages = json::array();
for (auto msg : inputs.messages) {
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
msg["thinking"] = msg.at("reasoning_content");
}
adjusted_messages.push_back(msg);
}
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs, adjusted_messages);
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, adjusted_messages);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END };
if (tool_list_tokens) {
data.preserved_tokens.push_back(TOOL_LIST_START);
data.preserved_tokens.push_back(TOOL_LIST_END);
}
data.thinking_start_tag = THINK_START;
data.thinking_end_tag = THINK_END;
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
// Gate by reasoning format and whether the template supports <think>
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
tmpl.source().find(THINK_START) != std::string::npos;
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
if (inputs.has_continuation()) {
const auto & msg = inputs.continue_msg;
@@ -1726,21 +1660,17 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning) {
if (extract_reasoning && inputs.enable_thinking) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
if (has_response_format) {
auto response_format = p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema));
return generation_prompt + reasoning + response_format + end;
}
return generation_prompt + reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call",
p.literal(TOOL_CALL_START) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls, /* allow_json_literals = */ true) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
p.literal(TOOL_CALL_END)
)
);
@@ -1753,17 +1683,13 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
if (has_response_format) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);
}
parser.build_grammar(builder, data.grammar_lazy);
});
@@ -1771,6 +1697,93 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
};
}
return data;
}
// LFM2.5 chat format.
//
// The template lists the tools in the system prompt as plain "List of tools: [...]" text, with no
// wrapper tokens. The assistant output is parsed as:
//   - Reasoning:  <think>{reasoning}</think> (optional)
//   - Content:    text before a tool call (optional)
//   - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]; the model may
//                 optionally emit <|tool_call_start|> in front of them, and several calls may
//                 appear (parallel tool calls supported)
static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template & tmpl,
                                                         const autoparser::generation_params & inputs) {
    // Marker strings used by this format
    const std::string think_open       = "<think>";
    const std::string think_close      = "</think>";
    const std::string tool_call_open   = "<|tool_call_start|>";
    const std::string tool_call_close  = "<|tool_call_end|>";
    const std::string assistant_prefix = "<|im_start|>assistant\n";

    const bool has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
    const bool extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
    // Tool-call parsing and the grammar are only needed when tools exist and are not disabled
    const bool tools_active      = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;

    common_chat_params data;

    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
    data.generation_prompt  = common_chat_template_generation_prompt_impl(tmpl, inputs);
    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking  = true;
    data.preserved_tokens   = { tool_call_open, tool_call_close, think_open, think_close };
    data.thinking_start_tag = think_open;
    data.thinking_end_tag   = think_close;

    if (inputs.has_continuation()) {
        // Resume a partially generated assistant message: re-open the thinking block with the
        // reasoning produced so far and, when continuing the content, close it and append the content.
        const auto & partial = inputs.continue_msg;

        std::string resumed = assistant_prefix + think_open + partial.reasoning_content;
        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
            resumed += think_close + partial.render_content();
        }

        data.generation_prompt = resumed;
        data.prompt += data.generation_prompt;
    }

    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        auto prefix   = p.literal(assistant_prefix);
        auto eof      = p.end();
        auto thoughts = p.eps();

        if (extract_reasoning && inputs.enable_thinking) {
            thoughts = p.optional(think_open + p.reasoning(p.until(think_close)) + think_close);
        }

        if (tools_active) {
            auto calls = p.rule("tool-calls",
                p.trigger_rule("tool-call",
                    p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls)
                )
            );

            // Content runs up to the first thing that can start a tool call
            auto leading_text = p.content(p.until_one_of({ tool_call_open, std::string("[") }));
            auto opt_marker   = p.optional(p.literal(tool_call_open));

            return prefix + thoughts + leading_text + opt_marker + calls + eof;
        }

        // No usable tools: everything after the reasoning is plain content
        return prefix + thoughts + p.content(p.rest()) + eof;
    });

    data.parser = parser.save();

    if (!tools_active) {
        return data;
    }

    data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
    data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
        foreach_function(inputs.tools, [&](const json & tool) {
            auto params_schema = tool.at("function").at("parameters");
            builder.resolve_refs(params_schema);
        });
        parser.build_grammar(builder, data.grammar_lazy);
    });

    // One word trigger per tool: the literal opening of a call to that tool, e.g. "[name("
    foreach_function(inputs.tools, [&](const json & tool) {
        const std::string fn_name = tool.at("function").at("name");
        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + fn_name + "(" });
    });

    return data;
}
@@ -2035,146 +2048,6 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
return data;
}
// Parser for Cohere2 MoE (a.k.a. "North Code").
//
// Every part of the assistant turn is wrapped in markers:
//   <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
//   <|START_THINKING|>{reasoning}<|END_THINKING|>
//   followed by EITHER content:  <|START_TEXT|>{content}<|END_TEXT|>
//            OR tool calls:      <|START_ACTION|>[
//                                  {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ...
//                                ]<|END_ACTION|>
//   <|END_OF_TURN_TOKEN|>
//
// With reasoning enabled (the template default) the generation prompt already ends with a forced
// <|START_THINKING|>, so the model output starts *inside* the thinking block. For that reason the
// parser literal only matches the stable <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> prefix, and the
// reasoning rule consumes the <|START_THINKING|> ... <|END_THINKING|> markers on its own, no matter
// whether they originate from the generation prompt or from the generated text.
static common_chat_params common_chat_params_init_cohere2moe(const common_chat_template & tmpl,
                                                             const autoparser::generation_params & inputs) {
    // Turn and role markers
    const std::string turn_open    = "<|START_OF_TURN_TOKEN|>";
    const std::string turn_close   = "<|END_OF_TURN_TOKEN|>";
    const std::string role_chatbot = "<|CHATBOT_TOKEN|>";
    const std::string role_user    = "<|USER_TOKEN|>";
    const std::string role_system  = "<|SYSTEM_TOKEN|>";

    // Section markers
    const std::string think_open   = "<|START_THINKING|>";
    const std::string think_close  = "<|END_THINKING|>";
    const std::string text_open    = "<|START_TEXT|>";
    const std::string text_close   = "<|END_TEXT|>";
    const std::string action_open  = "<|START_ACTION|>";
    const std::string action_close = "<|END_ACTION|>";
    const std::string result_open  = "<|START_TOOL_RESULT|>";
    const std::string result_close = "<|END_TOOL_RESULT|>";

    // The part of the generation prompt that comes before the (forced) <|START_THINKING|> marker
    const std::string gen_prefix = turn_open + role_chatbot;

    const bool has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
    const bool extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
    const bool tools_active      = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;

    common_chat_params data;

    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
    data.generation_prompt  = common_chat_template_generation_prompt_impl(tmpl, inputs);
    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking  = true;
    data.thinking_start_tag = think_open;
    data.thinking_end_tag   = think_close;
    data.preserved_tokens   = {
        turn_open, turn_close, role_chatbot, role_user, role_system,
        think_open, think_close,
        text_open, text_close,
        action_open, action_close,
        result_open, result_close,
    };

    // Per-role message delimiters. A tool result is rendered as the system token followed by
    // <|START_TOOL_RESULT|>, i.e. its delimiter is a strict superset of the plain "system" one.
    // The role split tries the delimiters in order, so "tool" has to precede "system".
    data.message_delimiters = {
        { COMMON_CHAT_ROLE_ASSISTANT, gen_prefix },
        { COMMON_CHAT_ROLE_USER,      turn_open + role_user },
        { COMMON_CHAT_ROLE_TOOL,      turn_open + role_system + result_open },
        { COMMON_CHAT_ROLE_SYSTEM,    turn_open + role_system },
    };

    if (inputs.has_continuation()) {
        // Resume a partial assistant message: re-open the thinking block with the reasoning so far
        // and, when continuing the content, close it and start the text section.
        const auto & partial = inputs.continue_msg;

        std::string resumed = gen_prefix + think_open + partial.reasoning_content;
        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
            resumed += think_close + text_open + partial.render_content();
        }

        data.generation_prompt = resumed;
        data.prompt += data.generation_prompt;
    }

    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        auto prefix = p.literal(gen_prefix);
        auto eof    = p.end();

        // The thinking block is always there (the generation prompt forces <|START_THINKING|>).
        // Without reasoning extraction the whole block, markers included, stays inline as content
        // (the reasoning_format=NONE convention); otherwise only its body is captured as reasoning.
        common_peg_parser thoughts = p.eps();
        if (!extract_reasoning) {
            thoughts = p.optional(p.content(p.literal(think_open) +
                                            p.until_one_of({ think_close, text_open, action_open }) +
                                            p.optional(p.literal(think_close))));
        } else {
            thoughts = p.optional(p.literal(think_open) +
                                  p.reasoning(p.until_one_of({ think_close, text_open, action_open })) +
                                  p.optional(p.literal(think_close)));
        }

        auto text_section = p.literal(text_open) + p.content(p.until(text_close)) + p.optional(p.literal(text_close));

        if (!tools_active) {
            return prefix + thoughts + text_section + p.optional(p.literal(turn_close)) + eof;
        }

        // <|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ... ]<|END_ACTION|>
        auto action_section = p.standard_json_tools(action_open, action_close, inputs.tools, inputs.parallel_tool_calls,
                                                    /* force_tool_calls = */ true,
                                                    /* name_key = */ "tool_name",
                                                    /* args_key = */ "parameters",
                                                    /* array_wrapped = */ true,
                                                    /* function_is_key = */ false,
                                                    /* call_id_key = */ "",
                                                    /* gen_call_id_key = */ "tool_call_id",
                                                    /* parameters_order = */ { "tool_call_id", "tool_name", "parameters" });

        // This format never mixes content and tool calls: it is one or the other.
        const bool tools_mandatory = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        common_peg_parser payload  = tools_mandatory ? action_section : p.choice({ action_section, text_section });

        return prefix + thoughts + payload + p.optional(p.literal(turn_close)) + eof;
    });

    data.parser = parser.save();

    if (!tools_active) {
        return data;
    }

    data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
    data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
        foreach_function(inputs.tools, [&](const json & tool) {
            auto params_schema = tool.at("function").at("parameters");
            builder.resolve_refs(params_schema);
        });
        parser.build_grammar(builder, data.grammar_lazy);
    });
    data.grammar_triggers = {
        { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, action_open }
    };

    return data;
}
namespace workaround {
static void map_developer_role_to_system(json & messages) {
@@ -2380,149 +2253,6 @@ static void func_args_not_string(json & messages) {
}
// MiniCPM5 format:
// - Reasoning: <think>{reasoning}</think> (optional)
// - Tool calls: <function name="foo"><param name="bar">value</param></function>
//
// Builds the chat params for a MiniCPM5 template: renders the prompt and generation prompt,
// declares the preserved tokens, thinking tags and per-role message delimiters, builds the PEG
// parser for the assistant output and, when a response format or usable tools are present,
// the matching grammar with its trigger word.
static common_chat_params common_chat_params_init_minicpm5(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<function",
"<param",
"</function>",
"</param>",
"<think>",
"</think>",
};
data.thinking_start_tag = "<think>";
data.thinking_end_tag = "</think>";
// The tool delimiter begins with the user delimiter ("<|im_start|>user"), so it is listed first.
// NOTE(review): presumably delimiters are matched in list order - confirm against the role splitter.
data.message_delimiters = {
{ COMMON_CHAT_ROLE_ASSISTANT, "<|im_start|>assistant" },
{ COMMON_CHAT_ROLE_TOOL, "<|im_start|>user\n<tool_response>" },
{ COMMON_CHAT_ROLE_USER, "<|im_start|>user" },
{ COMMON_CHAT_ROLE_SYSTEM, "<|im_start|>system" },
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
// A grammar is built for a response format, or for tools unless tool_choice is NONE
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
// Continuation of a partial assistant message: rebuild the generation prompt with an open
// <think> block holding the reasoning so far; when continuing the content, close the block
// and append the rendered content. The result is appended to the prompt.
if (inputs.has_continuation()) {
const auto & msg = inputs.continue_msg;
data.generation_prompt = "<|im_start|>assistant\n<think>\n" + msg.reasoning_content;
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
data.generation_prompt += "\n</think>\n\n" + msg.render_content();
}
data.prompt += data.generation_prompt;
}
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.literal("<|im_start|>assistant\n");
auto reasoning = p.eps();
// When extracting reasoning the <think>...</think> block is not wrapped in p.optional(),
// i.e. the parser expects it to be present.
if (extract_reasoning) {
reasoning = ("<think>" << p.reasoning(p.until("</think>")) << "</think>") + p.space();
}
// Response format parser (checked first, so it takes precedence over tool calls)
if (has_response_format) {
return generation_prompt + reasoning + p.content(p.schema(p.json(), "response-format", inputs.json_schema));
}
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
// CDATA lets a value carry characters that would otherwise close the tag (e.g.
// </param>); capture the inner text only, excluding the CDATA markers.
// Second alternative: a plain string value running up to the closing </param>.
auto string_value = p.choice({
p.literal("<![CDATA[") + p.ac(p.tool_arg_string_value(p.until("]]>")) + p.literal("]]>"), "]]>") + p.tool_arg_close(p.literal("</param>")),
p.negate(p.literal("<![CDATA[")) + p.ac(p.tool_arg_string_value(p.until("</param>")) + p.tool_arg_close(p.literal("</param>")), "</param>")
});
// One alternative per declared tool is accumulated into tool_choice
auto tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
const std::string name = function.at("name");
// A tool without "parameters" is treated as having an empty schema
auto params = function.contains("parameters") ? function.at("parameters") : json::object();
auto args = p.eps();
if (params.contains("properties") && params.at("properties").is_object() && !params.at("properties").empty()) {
auto schema_info = common_schema_info();
schema_info.resolve_refs(params);
auto arg_choice = p.choice();
for (const auto & [prop_name, prop_schema] : params.at("properties").items()) {
auto value_parser = p.eps();
// String-typed properties are taken as raw text (optionally CDATA-wrapped);
// every other property is parsed as JSON against its schema.
if (schema_info.resolves_to_string(prop_schema)) {
value_parser = string_value;
} else {
value_parser = p.tool_arg_json_value(
p.schema(p.json(), "tool-" + name + "-arg-" + prop_name + "-schema", prop_schema, false)
) + p.tool_arg_close(p.literal("</param>"));
}
auto arg_rule = p.tool_arg(
p.tool_arg_open(p.literal("<param name=\"") + p.tool_arg_name(p.literal(prop_name)) + p.literal("\">")) +
value_parser
);
arg_choice |= arg_rule;
}
// Any number of the known arguments, in any order; none is enforced as required here
args = p.zero_or_more(arg_choice + p.space());
}
auto tool_parser = p.tool(
p.tool_open(p.literal("<function name=\"") + p.tool_name(p.literal(name)) + p.literal("\">"))
<< p.tool_args(args)
<< p.tool_close(p.literal("</function>")));
tool_choice |= p.rule("tool-" + name, tool_parser);
});
// At least one call; a single call when parallel tool calls are disabled.
// NOTE(review): -1 presumably means "no upper bound" for p.repeat - confirm.
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_calls = p.trigger_rule("tool-call", p.repeat(tool_choice + p.space(), 1, max_calls));
// Free-form content runs up to the first "<function"
auto content = p.content(p.until("<function"));
return generation_prompt + reasoning + content + tool_calls + p.end();
}
// No response format and no usable tools: the rest of the output is plain content
return generation_prompt + reasoning + p.content(p.rest()) + p.end();
});
data.parser = parser.save();
if (include_grammar) {
// The grammar is lazy unless a response format is set or tool use is required
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
builder.resolve_refs(schema);
});
if (has_response_format) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);
}
parser.build_grammar(builder, data.grammar_lazy);
});
// Word trigger on the start of a tool call
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function" },
};
}
return data;
}
static json common_chat_extra_context() {
json ctx = json::object();
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
@@ -2566,25 +2296,16 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_kimi_k2(tmpl, params);
}
// Cohere2 MoE / North Code - marker-wrapped format with <|START_TEXT|> content and
// <|START_ACTION|> JSON tool calls. <|START_TEXT|> is unique to this template (the older
// Command-R templates use <|START_RESPONSE|>).
if (src.find("<|START_TEXT|>") != std::string::npos &&
src.find("<|START_ACTION|>") != std::string::npos) {
LOG_DBG("Using specialized template: Cohere2 MoE\n");
return common_chat_params_init_cohere2moe(tmpl, params);
}
if (is_lfm2_template(src)) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true);
return common_chat_params_init_lfm2(tmpl, params);
}
// LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens
if (src.find("List of tools: [") != std::string::npos &&
src.find("<|tool_list_start|>") == std::string::npos) {
LOG_DBG("Using specialized template: LFM2.5\n");
return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ false);
return common_chat_params_init_lfm2_5(tmpl, params);
}
// GigaChatV3 format detection
@@ -2615,14 +2336,6 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_gemma4(tmpl, params);
}
// MiniCPM5 - XML tool calls with <function name="..."><param name="...">...</param></function>
if (src.find("Tool usage guidelines:") != std::string::npos &&
src.find("<function name=\"") != std::string::npos &&
src.find("<param name=\"") != std::string::npos) {
LOG_DBG("Using specialized template: MiniCPM5\n");
return common_chat_params_init_minicpm5(tmpl, params);
}
return std::nullopt;
}
@@ -2733,15 +2446,17 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
autoparser.analyze_template(tmpl);
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
common_chat_msg_delimiters delimiters;
std::vector<common_chat_msg_delimiter> delimiters;
if (!autoparser.assistant_start.empty()) {
delimiters.add(COMMON_CHAT_ROLE_ASSISTANT, autoparser.assistant_start);
delimiters.push_back({ "assistant", autoparser.assistant_start });
}
if (!autoparser.user_start.empty()) {
delimiters.add(COMMON_CHAT_ROLE_USER, autoparser.user_start);
delimiters.push_back({ "user", autoparser.user_start });
}
auto_params.message_delimiters = std::move(delimiters);
if (!delimiters.empty()) {
auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
}
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
if (auto_params.supports_thinking) {
@@ -2883,9 +2598,8 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
}
return msg;
}
LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format), effective_input.substr(result.end).c_str());
LOG_DBG("%s: full %s output triggering error:\n=== BEGIN ===\n%s\n=== END ===\n", __func__, common_chat_format_name(params.format), effective_input.c_str());
throw std::runtime_error(std::string("The model produced output that does not match the expected ") + common_chat_format_name(params.format) + " format");
throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
effective_input.substr(result.end));
}
common_chat_msg msg;
@@ -2913,9 +2627,5 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
// Return the capability flags of the loaded chat templates as a map from name to bool.
// The caps of the tool-use template are reported when that template is present, otherwise the
// caps of the default template. Both `chat_templates` and its default template must be non-null.
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
    GGML_ASSERT(chat_templates != nullptr);
    GGML_ASSERT(chat_templates->template_default != nullptr);

    // take the more expressive template when available
    const bool has_tool_use = chat_templates->template_tool_use != nullptr;

    return has_tool_use ? chat_templates->template_tool_use->caps.to_map()
                        : chat_templates->template_default->caps.to_map();
}
+6 -65
View File
@@ -143,75 +143,15 @@ struct common_chat_msg_diff {
}
};
// Role of a chat message. UNKNOWN is the first enumerator (value 0).
enum common_chat_role {
    COMMON_CHAT_ROLE_UNKNOWN   = 0,
    COMMON_CHAT_ROLE_SYSTEM    = 1,
    COMMON_CHAT_ROLE_ASSISTANT = 2,
    COMMON_CHAT_ROLE_USER      = 3,
    COMMON_CHAT_ROLE_TOOL      = 4,
};
common_chat_role common_chat_role_from_string(const std::string & role);
const char * common_chat_role_to_string(common_chat_role role);
struct common_chat_msg_span {
common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
std::string role;
std::size_t pos = 0;
std::size_t len = 0;
bool valid() const {
return role != COMMON_CHAT_ROLE_UNKNOWN;
}
};
struct common_chat_msg_spans {
std::vector<common_chat_msg_span> spans;
void add(common_chat_role role, size_t pos, size_t len) {
spans.push_back({ role, pos, len });
}
bool is_user_start(int32_t pos) const {
for (auto it = spans.begin(); it != spans.end(); ++it) {
if (it->role == COMMON_CHAT_ROLE_USER && pos == (int32_t) it->pos) {
return true;
}
}
return false;
}
int32_t last_user_message_pos() const {
for (auto it = spans.rbegin(); it != spans.rend(); ++it) {
if (it->role == COMMON_CHAT_ROLE_USER) {
return (int32_t) it->pos;
}
}
return -1;
}
};
struct common_chat_msg_delimiter {
common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
std::string delimiter;
llama_tokens tokens = {};
};
struct common_chat_msg_delimiters {
std::vector<common_chat_msg_delimiter> delimiters;
common_chat_msg_delimiters() = default;
common_chat_msg_delimiters(std::initializer_list<common_chat_msg_delimiter> delims) : delimiters(delims) {}
void add(common_chat_role role, const std::string & delimiter) {
delimiters.push_back({ role, delimiter });
}
void tokenize(const llama_vocab * vocab);
// split tokens into message spans. skips maps a start index to a length of a region to jump over without matching
common_chat_msg_spans split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips = {}) const;
nlohmann::ordered_json to_json() const;
std::string role;
std::string delimiter;
};
struct common_chat_tool {
@@ -279,7 +219,7 @@ struct common_chat_params {
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
std::string parser;
common_chat_msg_delimiters message_delimiters;
std::vector<common_chat_msg_span> message_spans;
};
// per-message parsing syntax
@@ -385,4 +325,5 @@ struct common_chat_prompt_preset {
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
common_chat_msg_delimiters common_chat_msg_delimiters_parse(const nlohmann::ordered_json & delimiters);
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
+49 -63
View File
@@ -225,7 +225,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
}
if (!SetPriorityClass(GetCurrentProcess(), p)) {
COM_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
return false;
}
@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
}
if (setpriority(PRIO_PROCESS, 0, p) != 0) {
COM_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
return false;
}
return true;
@@ -284,14 +284,14 @@ void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_para
if (n_set && n_set < cpuparams.n_threads) {
// Not enough set bits, may experience performance issues.
COM_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
}
}
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
size_t dash_loc = range.find('-');
if (dash_loc == std::string::npos) {
COM_ERR("%s", "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
return false;
}
@@ -303,7 +303,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
} else {
start_i = std::stoull(range.substr(0, dash_loc));
if (start_i >= GGML_MAX_N_THREADS) {
COM_ERR("%s", "Start index out of bounds!\n");
LOG_ERR("Start index out of bounds!\n");
return false;
}
}
@@ -313,7 +313,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
} else {
end_i = std::stoull(range.substr(dash_loc + 1));
if (end_i >= GGML_MAX_N_THREADS) {
COM_ERR("%s", "End index out of bounds!\n");
LOG_ERR("End index out of bounds!\n");
return false;
}
}
@@ -333,7 +333,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
}
size_t num_digits = mask.length() - start_i;
num_digits = std::min<size_t>(num_digits, 128);
if (num_digits > 128) num_digits = 128;
size_t end_i = num_digits + start_i;
@@ -348,7 +348,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
} else if (c >= 'A' && c <= 'F') {
id -= 'A' - 10;
} else {
COM_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
return false;
}
@@ -379,21 +379,21 @@ void common_params_print_info(const common_params & params, bool print_devices)
#else
const char * build_type = " (debug)";
#endif
COM_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
COM_INF("%s: verbosity = %d (adjust with the `-lv N` CLI arg)\n", __func__, common_log_get_verbosity_thold());
LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
// device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device
if (print_devices) {
COM_TRC("%s", "device_info:\n");
LOG_INF("device_info:\n");
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i);
size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);
COM_TRC(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
}
}
COM_TRC("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}
std::string common_params_get_system_info(const common_params & params) {
@@ -660,7 +660,7 @@ void string_process_escapes(std::string & input) {
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
const char * sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) {
COM_ERR("%s: malformed KV override '%s'\n", __func__, data);
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
return false;
}
llama_model_kv_override kvo;
@@ -683,20 +683,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
} else if (std::strcmp(sep, "false") == 0) {
kvo.val_bool = false;
} else {
COM_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
return false;
}
} else if (strncmp(sep, "str:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
if (strlen(sep) > 127) {
COM_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
return false;
}
strncpy(kvo.val_str, sep, 127);
kvo.val_str[127] = '\0';
} else {
COM_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
return false;
}
overrides.emplace_back(std::move(kvo));
@@ -1074,18 +1074,6 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
return files;
}
// Open `fname` for reading with the given open mode and return the stream.
// On Windows the name is converted from UTF-8 to a wide-character path before opening; when the
// required length cannot be determined, a default-constructed (not open) stream is returned.
// Callers are expected to check the returned stream (e.g. is_open()).
std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode) {
#ifdef _WIN32
    // first call only queries the required buffer size (including the terminating NUL)
    const int n_wide = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
    if (n_wide == 0) {
        return std::ifstream();
    }

    std::vector<wchar_t> wide_path(n_wide);
    (void) MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wide_path.data(), n_wide);

    std::ifstream stream(wide_path.data(), mode);
    return stream;
#else
    std::ifstream stream(fname, mode);
    return stream;
#endif
}
//
// TTY utils
//
@@ -1160,7 +1148,7 @@ static void common_init_sampler_from_model(
if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
if (!sampler_names.empty()) {
sparams.samplers = common_sampler_types_from_names(sampler_names);
sparams.samplers = common_sampler_types_from_names(sampler_names, true);
}
}
}
@@ -1199,8 +1187,8 @@ common_init_result::common_init_result(common_params & params, bool model_only)
auto cparams = common_context_params_to_llama(params);
if (params.fit_params) {
COM_TRC("%s", "fitting params to device memory ...\n");
COM_TRC("%s", "(for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n");
LOG_INF("%s: fitting params to device memory ...\n", __func__);
LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
common_fit_params(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split,
params.tensor_buft_overrides.data(),
@@ -1227,7 +1215,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
if (lora == nullptr) {
COM_ERR("failed to load lora adapter '%s'\n", la.path.c_str());
LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
pimpl->model.reset(model);
return;
}
@@ -1246,14 +1234,14 @@ common_init_result::common_init_result(common_params & params, bool model_only)
common_init_sampler_from_model(model, params.sampling);
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
COM_WRN("%s", "vocab does not have an EOS token, ignoring --ignore-eos\n");
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sampling.ignore_eos = false;
}
// initialize once
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
COM_TRC("added %s logit bias = %f\n", common_token_to_piece(vocab, i).c_str(), -INFINITY);
LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
}
}
@@ -1291,7 +1279,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)
llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
COM_ERR("failed to create context with model '%s'\n", params.model.path.c_str());
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
return;
}
@@ -1328,7 +1316,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
llama_model * model = res->model();
if (model == NULL) {
COM_ERR("failed to load model '%s'\n", params.model.path.c_str());
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return res;
}
@@ -1338,14 +1326,14 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
llama_context * lctx = res->context();
if (lctx == NULL) {
COM_ERR("failed to create context with model '%s'\n", params.model.path.c_str());
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
return res;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
COM_WRN("%s", "KV cache shifting is not supported for this context, disabling KV cache shifting\n");
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}
@@ -1374,7 +1362,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
bool ok = true;
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
COM_WRN("%s", "vocab does not have a BOS token, reranking will not work\n");
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
ok = false;
}
@@ -1383,10 +1371,10 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
if (!has_eos && !has_sep && !has_rerank_prompt) {
COM_WRN("%s", "vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n");
LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
COM_WRN("%s", "vocab does not have an EOS token, using SEP token as fallback\n");
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
}
if (!ok) {
@@ -1399,7 +1387,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
}
if (params.warmup) {
COM_TRC("%s", "warming up the model with an empty run - please wait ... (--no-warmup to disable)\n");
LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
std::vector<llama_token> tmp;
llama_token bos = llama_vocab_bos(vocab);
@@ -1473,20 +1461,20 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
if (ret != 0) {
COM_ERR("llama_decode() failed: %d\n", ret);
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
goto done;
}
if (llama_n_rs_seq(ctx) > 0) {
COM_TRC("%s", "the context supports bounded partial sequence removal\n");
LOG_INF("%s: the context supports bounded partial sequence removal\n", __func__);
res = COMMON_CONTEXT_SEQ_RM_TYPE_RS;
goto done;
}
// try to remove the last tokens
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
COM_TRC("%s", "the context does not support partial sequence removal\n");
LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
goto done;
}
@@ -1803,13 +1791,13 @@ static common_control_vector_data common_control_vector_load_one(const common_co
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) {
COM_ERR("failed to load control vector file from %s\n", load_info.fname.c_str());
LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
return result;
}
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) {
COM_WRN("no direction tensors found in %s\n", load_info.fname.c_str());
LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
}
for (int i = 0; i < n_tensors; i++) {
@@ -1827,23 +1815,23 @@ static common_control_vector_data common_control_vector_load_one(const common_co
}
}
if (layer_idx < 0) {
COM_ERR("invalid/unparsable direction tensor layer index in %s\n", load_info.fname.c_str());
LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
} else if (layer_idx == 0) {
COM_ERR("invalid (zero) direction tensor layer index in %s\n", load_info.fname.c_str());
LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
if (tensor->type != GGML_TYPE_F32) {
COM_ERR("invalid (non-F32) direction tensor type in %s\n", load_info.fname.c_str());
LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (ggml_n_dims(tensor) != 1) {
COM_ERR("invalid (non-1D) direction tensor shape in %s\n", load_info.fname.c_str());
LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
@@ -1851,7 +1839,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor);
} else if (ggml_nelements(tensor) != result.n_embd) {
COM_ERR("direction tensor in %s does not match previous dimensions\n", load_info.fname.c_str());
LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
@@ -1868,7 +1856,7 @@ static common_control_vector_data common_control_vector_load_one(const common_co
}
if (result.n_embd == -1) {
COM_WRN("skipping %s due to invalid direction tensors\n", load_info.fname.c_str());
LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
result.data.clear();
}
@@ -1889,7 +1877,7 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
break;
}
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
COM_ERR("control vectors in %s does not match previous dimensions\n", info.fname.c_str());
LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
result.n_embd = -1;
break;
}
@@ -1905,7 +1893,7 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
}
if (result.n_embd == -1) {
COM_ERR("%s", "no valid control vector files passed\n");
LOG_ERR("%s: no valid control vector files passed\n", __func__);
result.data.clear();
}
@@ -2016,13 +2004,13 @@ bool common_prompt_batch_decode(
// memory, so we can't just remove the last token from the memory and replay the last token which
// is the reason for this logic.
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_tokens_before_last))) {
COM_ERR("%s", "failed to eval\n");
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
n_past += n_tokens_before_last;
llama_state_save_file(ctx, state_path.data(), all_tokens.data(), all_tokens.size());
COM_INF("saved session before last token to %s, n_new = %zu\n", state_path.data(), all_tokens.size());
LOG_INF("saved session before last token to %s, n_new = %zu\n", state_path.data(), all_tokens.size());
llama_token last_token = all_tokens.back();
llama_batch batch = llama_batch_get_one(&last_token, 1);
@@ -2030,13 +2018,13 @@ bool common_prompt_batch_decode(
batch.pos = &pos;
if (llama_decode(ctx, batch)) {
COM_ERR("%s", "failed to eval last token\n");
LOG_ERR("%s : failed to eval last token\n", __func__);
return false;
}
n_past++;
} else {
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_new))) {
COM_ERR("%s", "failed to eval\n");
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
n_past += n_new;
@@ -2046,7 +2034,7 @@ bool common_prompt_batch_decode(
}
size_t common_prompt_checkpoint::size() const {
return data_tgt.size() + data_dft.size() + data_spec.size();
return data_tgt.size() + data_dft.size();
}
bool common_prompt_checkpoint::empty() const {
@@ -2061,7 +2049,6 @@ void common_prompt_checkpoint::clear() {
data_tgt.clear();
data_dft.clear();
data_spec.clear();
}
void common_prompt_checkpoint::update_pos(
@@ -2151,5 +2138,4 @@ void common_prompt_checkpoint::clear_tgt() {
void common_prompt_checkpoint::clear_dft() {
data_dft.clear();
data_spec.clear();
}
+21 -46
View File
@@ -25,13 +25,6 @@
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32
#define COM_DBG(fmt, ...) LOG_DBG("cmn %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define COM_TRC(fmt, ...) LOG_TRC("cmn %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define COM_INF(fmt, ...) LOG_INF("cmn %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define COM_WRN(fmt, ...) LOG_WRN("cmn %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define COM_ERR(fmt, ...) LOG_ERR("cmn %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define COM_CNT(fmt, ...) LOG_CNT("" fmt, __VA_ARGS__)
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
@@ -103,7 +96,6 @@ enum llama_example {
LLAMA_EXAMPLE_FIT_PARAMS,
LLAMA_EXAMPLE_RESULTS,
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
LLAMA_EXAMPLE_DOWNLOAD,
LLAMA_EXAMPLE_COUNT,
};
@@ -169,7 +161,6 @@ enum common_speculative_type {
COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, // standalone draft model speculative decoding
COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, // Eagle3 speculative decoding
COMMON_SPECULATIVE_TYPE_DRAFT_MTP, // Multi-token prediction
COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, // DFlash speculative decoding
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding based on n-grams
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
@@ -299,25 +290,12 @@ struct common_params_sampling {
};
struct common_params_model {
std::string path = ""; // model local path
std::string url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string docker_repo = ""; // Docker repo
std::string get_name() const {
if (!hf_repo.empty()) {
return hf_repo;
}
if (!docker_repo.empty()) {
return docker_repo;
}
return path;
}
bool empty() const {
return get_name().empty();
}
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};
// draft-model-based speculative decoding parameters
@@ -380,12 +358,12 @@ struct common_params_speculative {
common_params_speculative_ngram_cache ngram_cache;
bool has_dft() const {
return !draft.mparams.empty();
return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
}
uint32_t need_n_rs_seq() const {
bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3 || t == COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH;
return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
});
return needs_rs_seq ? draft.n_max : 0u;
@@ -511,7 +489,6 @@ struct common_params {
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string path_prompts_log_dir = ""; // directory with logged prompts // NOLINT
// llama-debug specific options
std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
@@ -532,6 +509,7 @@ struct common_params {
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
bool offline = false;
bool skip_download = false; // skip model file downloading
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -593,10 +571,9 @@ struct common_params {
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
std::vector<std::string> image; // path to image file(s)
int image_min_tokens = -1;
int image_max_tokens = -1;
int mtmd_batch_max_tokens = 1024;
// finetune
struct lr_opt lr;
@@ -621,7 +598,7 @@ struct common_params {
bool cache_prompt = true; // whether to enable prompt caching
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
int32_t checkpoint_min_step = 8192; // minimum spacing between context checkpoints
int32_t checkpoint_min_step = 256; // minimum spacing between context checkpoints
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
std::string hostname = "127.0.0.1";
@@ -645,6 +622,12 @@ struct common_params {
// UI configs
bool ui = true;
// Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
bool webui = ui;
bool webui_mcp_proxy = false;
std::string webui_config_json;
bool ui_mcp_proxy = false;
std::string ui_config_json;
@@ -657,11 +640,10 @@ struct common_params {
std::vector<std::string> server_tools;
// router server configs
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
std::string models_preset_hf = ""; // show a warning about remote presets on router loaded (if not empty)
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
bool log_json = false;
@@ -863,9 +845,6 @@ struct common_file_info {
};
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
// fs open, also handle UTF8 on Windows
std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode);
//
// TTY utils
//
@@ -1083,10 +1062,6 @@ struct common_prompt_checkpoint {
std::vector<uint8_t> data_tgt;
std::vector<uint8_t> data_dft;
// (optional) speculative-decoding implementation state stashed with the checkpoint
// (e.g. eagle3's deferred-boundary g_embd row)
std::vector<uint8_t> data_spec;
size_t size() const;
bool empty() const;
+105 -114
View File
@@ -292,6 +292,10 @@ static int common_download_file_single_online(const std::string & url,
const bool file_exists = std::filesystem::exists(path);
if (!file_exists && opts.skip_download) {
return -2; // file is missing and download is disabled
}
if (file_exists && skip_etag) {
LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
@@ -358,6 +362,9 @@ static int common_download_file_single_online(const std::string & url,
return 304; // 304 Not Modified - fake cached response
}
// pass this point, the file exists but is different from the server version, so we need to redownload it
if (opts.skip_download) {
return -2; // special code to indicate that the download was skipped due to etag mismatch
}
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return -1;
@@ -684,8 +691,18 @@ static void list_available_gguf_files(const hf_cache::hf_files & files) {
}
}
common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts) {
common_download_hf_plan plan;
struct hf_plan {
hf_cache::hf_file primary;
hf_cache::hf_files model_files;
hf_cache::hf_file mmproj;
hf_cache::hf_file mtp;
};
static hf_plan get_hf_plan(const common_params_model & model,
const common_download_opts & opts,
bool download_mmproj,
bool download_mtp) {
hf_plan plan;
hf_cache::hf_files all;
auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
@@ -700,14 +717,6 @@ common_download_hf_plan common_download_get_hf_plan(const common_params_model &
return plan;
}
// if preset.ini exists in the repo root, download only that file
for (const auto & f : all) {
if (f.path == "preset.ini") {
plan.preset = f;
return plan;
}
}
hf_cache::hf_file primary;
if (!model.hf_file.empty()) {
@@ -734,49 +743,115 @@ common_download_hf_plan common_download_get_hf_plan(const common_params_model &
plan.primary = primary;
plan.model_files = get_split_files(all, primary);
if (opts.download_mmproj) {
if (download_mmproj) {
plan.mmproj = find_best_mmproj(all, primary.path);
}
if (opts.download_mtp) {
if (download_mtp) {
plan.mtp = find_best_mtp(all, primary.path);
}
return plan;
}
void common_download_run_tasks(const std::vector<common_download_task> & tasks) {
struct download_task {
std::string url;
std::string path;
};
static std::vector<download_task> get_url_tasks(const common_params_model & model) {
auto split = get_gguf_split_info(model.url);
if (split.count <= 1) {
return {{model.url, model.path}};
}
auto filename = split.prefix;
if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) {
filename = split.prefix.substr(pos + 1);
}
auto parent_path = std::filesystem::path(model.path).parent_path();
auto prefix_path = (parent_path / filename).string();
std::vector<download_task> tasks;
for (int i = 1; i <= split.count; i++) {
auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
tasks.push_back({split.prefix + suffix, prefix_path + suffix});
}
return tasks;
}
common_download_model_result common_download_model(const common_params_model & model,
const common_download_opts & opts) {
common_download_model_result result;
std::vector<download_task> tasks;
hf_plan hf;
bool download_mmproj = opts.download_mmproj;
bool download_mtp = opts.download_mtp;
bool is_hf = !model.hf_repo.empty();
if (is_hf) {
hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
for (const auto & f : hf.model_files) {
tasks.push_back({f.url, f.local_path});
}
if (!hf.mmproj.path.empty()) {
tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
}
if (!hf.mtp.path.empty()) {
tasks.push_back({hf.mtp.url, hf.mtp.local_path});
}
} else if (!model.url.empty()) {
tasks = get_url_tasks(model);
} else {
result.model_path = model.path;
return result;
}
if (tasks.empty()) {
return result;
}
std::vector<std::future<int>> futures;
for (const auto & task : tasks) {
futures.push_back(std::async(std::launch::async,
[&task]() {
return common_download_file_single(task.url, task.local_path, task.opts, task.is_hf);
[&task, &opts, is_hf]() {
return common_download_file_single(task.url, task.path, opts, is_hf);
}
));
}
for (size_t i = 0; i < futures.size(); ++i) {
std::string url = tasks[i].url;
int status = futures[i].get();
for (auto & f : futures) {
int status = f.get();
if (status == -2 && opts.skip_download) {
throw common_skip_download_exception();
}
bool is_ok = is_http_status_ok(status);
if (!is_ok) {
throw std::runtime_error(string_format("Download '%s' failed with status code: %d", url.c_str(), status));
return {};
}
}
}
std::vector<std::string> common_download_get_all_parts(const std::string & url) {
auto split = get_gguf_split_info(url);
if (is_hf) {
for (const auto & f : hf.model_files) {
hf_cache::finalize_file(f);
}
result.model_path = hf.primary.final_path;
if (split.count <= 1) {
return {url};
if (!hf.mmproj.path.empty()) {
result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
}
if (!hf.mtp.path.empty()) {
result.mtp_path = hf_cache::finalize_file(hf.mtp);
}
} else {
result.model_path = model.path;
}
std::vector<std::string> parts;
for (int i = 1; i <= split.count; i++) {
auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
parts.push_back(split.prefix + suffix);
}
return parts;
return result;
}
//
@@ -922,87 +997,3 @@ std::vector<common_cached_model_info> common_list_cached_models() {
return result;
}
bool common_download_remove(const std::string & hf_repo_with_tag) {
namespace fs = std::filesystem;
auto [repo_id, tag] = common_download_split_repo_tag(hf_repo_with_tag);
if (tag.empty()) {
return hf_cache::remove_cached_repo(repo_id);
}
std::string tag_upper = tag;
for (char & c : tag_upper) {
c = (char) std::toupper((unsigned char) c);
}
auto files = hf_cache::get_cached_files(repo_id);
if (files.empty()) {
return false;
}
// collect snapshot entries whose tag matches
std::vector<fs::path> to_remove;
for (const auto & f : files) {
auto split = get_gguf_split_info(f.path);
if (split.tag == tag_upper) {
to_remove.emplace_back(f.local_path);
}
}
if (to_remove.empty()) {
return false;
}
// resolve blob paths from symlinks before deleting snapshot entries
std::vector<fs::path> blobs_to_check;
for (const auto & p : to_remove) {
std::error_code ec;
if (fs::is_symlink(p, ec)) {
auto target = fs::read_symlink(p, ec);
if (!ec) {
blobs_to_check.push_back((p.parent_path() / target).lexically_normal());
}
}
}
// remove snapshot entries
for (const auto & p : to_remove) {
std::error_code ec;
fs::remove(p, ec);
if (ec) {
LOG_WRN("%s: failed to remove %s: %s\n", __func__, p.string().c_str(), ec.message().c_str());
}
}
if (blobs_to_check.empty()) {
return true;
}
// collect blobs still referenced by remaining snapshot entries
std::unordered_set<std::string> still_referenced;
for (const auto & f : hf_cache::get_cached_files(repo_id)) {
fs::path p(f.local_path);
std::error_code ec;
if (fs::is_symlink(p, ec)) {
auto target = fs::read_symlink(p, ec);
if (!ec) {
still_referenced.insert((p.parent_path() / target).lexically_normal().string());
}
}
}
// remove orphaned blobs
for (const auto & blob : blobs_to_check) {
if (still_referenced.find(blob.string()) == still_referenced.end()) {
std::error_code ec;
fs::remove(blob, ec);
if (ec) {
LOG_WRN("%s: failed to remove blob %s: %s\n", __func__, blob.string().c_str(), ec.message().c_str());
}
}
}
return true;
}
+41 -35
View File
@@ -1,10 +1,7 @@
#pragma once
#include "hf-cache.h"
#include <string>
#include <vector>
#include <functional>
struct common_params_model;
@@ -50,40 +47,65 @@ struct common_cached_model_info {
}
};
// Options for common_download_file_single
// Options for common_download_model and common_download_file_single
struct common_download_opts {
std::string bearer_token;
common_header_list headers;
bool offline = false;
bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
bool download_mmproj = false;
bool download_mtp = false;
common_download_callback * callback = nullptr;
};
struct common_download_task {
common_download_opts opts;
std::string url;
std::string local_path;
std::function<void()> on_done;
bool is_hf = false;
common_download_task() = default;
common_download_task(hf_cache::hf_file f,
const common_download_opts & opts,
std::function<void()> on_done = nullptr)
: opts(opts), url(f.url), local_path(f.local_path), on_done(on_done), is_hf(true) {}
// Result of common_download_model
struct common_download_model_result {
std::string model_path;
std::string mmproj_path;
std::string mtp_path;
};
void common_download_run_tasks(const std::vector<common_download_task> & tasks);
// throw if the file is missing or invalid (e.g. ETag check failed)
struct common_skip_download_exception : public std::runtime_error {
common_skip_download_exception() : std::runtime_error("skip download") {}
};
// if url is a multi-part GGUF file, returns all parts, otherwise returns the single file
std::vector<std::string> common_download_get_all_parts(const std::string & url);
// Download model from HuggingFace repo or URL
//
// input (via model struct):
// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag
// - model.hf_file: specific file in the repo (requires hf_repo)
// - model.url: simple download (used if hf_repo is empty)
// - model.path: local file path
//
// tag matching (for HF repos without model.hf_file):
// - if tag is specified, searches for GGUF matching that quantization
// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF
//
// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically
// detected and all parts are downloaded
//
// caching:
// - HF repos: uses HuggingFace cache
// - URLs: uses ETag-based caching
//
// when opts.offline=true, no network requests are made
// when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
// then with the closest quantization bits
// when download_mtp=true, applies the same sibling search for an MTP-head GGUF
//
// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
common_download_model_result common_download_model(
const common_params_model & model,
const common_download_opts & opts = {}
);
// returns list of cached models
std::vector<common_cached_model_info> common_list_cached_models();
// download single file from url to local path
// returns status code or -1 on error
// returns -2 if the download was skipped due to ETag mismatch (file outdated, skip_download=true)
// skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
int common_download_file_single(const std::string & url,
const std::string & path,
@@ -93,19 +115,3 @@ int common_download_file_single(const std::string & url,
// resolve and download model from Docker registry
// return local path to downloaded model file
std::string common_docker_resolve_model(const std::string & docker);
// Remove a cached model from disk
// input format: "user/model" or "user/model:tag"
// - if tag is omitted, removes the entire repo cache directory
// - if tag is present, removes only files matching that tag (and orphaned blobs)
// returns true if anything was removed
bool common_download_remove(const std::string & hf_repo_with_tag);
struct common_download_hf_plan {
hf_cache::hf_file primary;
hf_cache::hf_files model_files;
hf_cache::hf_file mmproj;
hf_cache::hf_file mtp;
hf_cache::hf_file preset; // if set, only this file is downloaded
};
common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts);
+7 -30
View File
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
using std::runtime_error::runtime_error;
};
static std::vector<llama_device_memory_data> common_get_device_memory_data_impl(
std::vector<llama_device_memory_data> common_get_device_memory_data(
const char * path_model,
const llama_model_params * mparams,
const llama_context_params * cparams,
@@ -150,29 +150,6 @@ static std::vector<llama_device_memory_data> common_get_device_memory_data_impl(
return ret;
}
common_device_memory_data_vec common_get_device_memory_data(
const char * path_model,
const llama_model_params * mparams,
const llama_context_params * cparams,
std::vector<ggml_backend_dev_t> & devs,
uint32_t & hp_ngl,
uint32_t & hp_n_ctx_train,
uint32_t & hp_n_expert,
ggml_log_level log_level) {
std::vector<llama_device_memory_data> impl = common_get_device_memory_data_impl(
path_model, mparams, cparams, devs, hp_ngl, hp_n_ctx_train, hp_n_expert, log_level);
common_device_memory_data_vec ret(impl.size());
for (size_t i = 0; i < impl.size(); i++) {
ret[i].total = impl[i].total;
ret[i].free = impl[i].free;
ret[i].model = impl[i].mb.model;
ret[i].context = impl[i].mb.context;
ret[i].compute = impl[i].mb.compute;
}
return ret;
}
static void common_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
@@ -192,7 +169,7 @@ static void common_params_fit_impl(
// step 1: get data for default parameters and check whether any changes are necessary in the first place
LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
const dmds_t dmds_full = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
const size_t nd = devs.size(); // number of devices
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
@@ -233,7 +210,7 @@ static void common_params_fit_impl(
sum_projected_used = dmds_full.back().mb.total();
sum_free = dmds_full.back().total;
sum_projected_free = sum_free - sum_projected_used;
LOG_TRC("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (sum_projected_free >= margins[0]) {
LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
@@ -327,7 +304,7 @@ static void common_params_fit_impl(
int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
if (nd == 0) {
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
} else {
@@ -505,7 +482,7 @@ static void common_params_fit_impl(
llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
const dmds_t dmd_nl = common_get_device_memory_data_impl(
const dmds_t dmd_nl = common_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
LOG_TRC("%s: memory for test allocation by device:\n", func_name);
@@ -533,7 +510,7 @@ static void common_params_fit_impl(
mparams->tensor_buft_overrides = tensor_buft_overrides;
LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
const dmds_t dmds_cpu_moe = common_get_device_memory_data_impl(
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (size_t id = 0; id < nd; id++) {
@@ -963,7 +940,7 @@ void common_fit_print(
uint32_t hp_nct = 0; // hparams.n_ctx_train
uint32_t hp_nex = 0; // hparams.n_expert
auto dmd = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
GGML_ASSERT(dmd.size() == devs.size() + 1);
for (size_t id = 0; id < devs.size(); id++) {
+24 -32
View File
@@ -1,7 +1,9 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include "llama.h"
#include "../src/llama-ext.h"
#include <vector>
@@ -16,41 +18,31 @@ enum common_params_fit_status {
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified
// with the exception of the context size which is modified if and only if equal to 0
common_params_fit_status common_fit_params(
const char * path_model,
llama_model_params * mparams,
llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
enum common_params_fit_status common_fit_params(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
// print estimated memory to stdout
void common_fit_print(
const char * path_model,
llama_model_params * mparams,
llama_context_params * cparams);
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams);
void common_memory_breakdown_print(const llama_context * ctx);
// One entry of the memory breakdown returned (as a vector) by
// common_get_device_memory_data().
// NOTE(review): all fields are presumably byte counts - confirm against the implementation.
// Note that total/free are signed (int64_t) while the usage fields are size_t.
struct common_device_memory_data {
// total memory reported for the device
int64_t total;
// memory currently free on the device
int64_t free;
// memory attributed to the model - presumably the weights; TODO confirm
size_t model;
// memory attributed to the context - presumably KV cache and similar state; TODO confirm
size_t context;
// memory attributed to compute - presumably compute/scratch buffers; TODO confirm
size_t compute;
};
using common_device_memory_data_vec = std::vector<common_device_memory_data>;
void common_memory_breakdown_print(const struct llama_context * ctx);
// Load a model + context with no_alloc and return the per-device memory breakdown.
common_device_memory_data_vec common_get_device_memory_data(
const char * path_model,
const llama_model_params * mparams,
const llama_context_params * cparams,
std::vector<ggml_backend_dev_t> & devs,
uint32_t & hp_ngl,
uint32_t & hp_n_ctx_train,
uint32_t & hp_n_expert,
ggml_log_level log_level);
std::vector<llama_device_memory_data> common_get_device_memory_data(
const char * path_model,
const struct llama_model_params * mparams,
const struct llama_context_params * cparams,
std::vector<ggml_backend_dev_t> & devs,
uint32_t & hp_ngl,
uint32_t & hp_n_ctx_train,
uint32_t & hp_n_expert,
enum ggml_log_level log_level);
-15
View File
@@ -495,19 +495,4 @@ std::string finalize_file(const hf_file & file) {
return file.final_path;
}
bool remove_cached_repo(const std::string & repo_id) {
if (!is_valid_repo_id(repo_id)) {
LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
return false;
}
fs::path repo_path = get_repo_path(repo_id);
std::error_code ec;
auto removed = fs::remove_all(repo_path, ec);
if (ec) {
LOG_ERR("%s: failed to remove repo cache %s: %s\n", __func__, repo_path.string().c_str(), ec.message().c_str());
return false;
}
return removed > 0;
}
} // namespace hf_cache
-3
View File
@@ -29,7 +29,4 @@ hf_files get_cached_files(const std::string & repo_id = {});
// Create snapshot path (link or move/copy) and return it
std::string finalize_file(const hf_file & file);
// Remove the entire cached directory for a repo, returns true if removed
bool remove_cached_repo(const std::string & repo_id);
} // namespace hf_cache
+23 -44
View File
@@ -16,34 +16,22 @@ using json = nlohmann::ordered_json;
namespace jinja {
using caps_json_fn = std::function<json()>;
using caps_ctx_fn = std::function<void(context &)>;
using caps_analyze_fn = std::function<void(bool, value &, value &, const std::string &)>;
// Publish the "preserve reasoning" switch to a template context.
//
// Three variables are set: `preserve_thinking` receives `enabled`, while
// `clear_thinking` and `truncate_history_thinking` receive its negation.
void caps_apply_preserve_reasoning(jinja::context & ctx, bool enabled) {
    const bool discard = !enabled;

    ctx.set_val("preserve_thinking",         mk_val<value_bool>(enabled));
    ctx.set_val("clear_thinking",            mk_val<value_bool>(discard));
    ctx.set_val("truncate_history_thinking", mk_val<value_bool>(discard));
}
using caps_analyze_fn = std::function<void(bool, value &, value &)>;
static void caps_try_execute(jinja::program & prog,
const caps_json_fn & messages_fn,
const caps_ctx_fn & ctx_fn,
const caps_json_fn & tools_fn,
const caps_analyze_fn & analyze_fn) {
context ctx;
ctx.is_get_stats = true;
jinja::global_from_json(ctx, json{
{"messages", messages_fn()},
{"tools", tools_fn ? tools_fn() : json::array()},
{"tools", tools_fn()},
{"bos_token", ""},
{"eos_token", ""},
{"add_generation_prompt", true}
}, true);
if (ctx_fn) {
ctx_fn(ctx);
}
auto messages = ctx.get_val("messages");
auto tools = ctx.get_val("tools");
@@ -61,7 +49,7 @@ static void caps_try_execute(jinja::program & prog,
// ignore exceptions during capability analysis
}
analyze_fn(success, messages, tools, result);
analyze_fn(success, messages, tools);
}
// for debugging only
@@ -121,9 +109,11 @@ caps caps_get(jinja::program & prog) {
}
});
},
nullptr, // ctx_fn
nullptr, // tools_fn
[&](bool success, value & messages, value &, const std::string &) {
[&]() {
// tools
return json{nullptr};
},
[&](bool success, value & messages, value &) {
auto & content = messages->at(0)->at("content");
caps_print_stats(content, "messages[0].content");
if (has_op(content, "selectattr") || has_op(content, "array_access")) {
@@ -155,9 +145,11 @@ caps caps_get(jinja::program & prog) {
},
});
},
nullptr, // ctx_fn
nullptr, // tools_fn
[&](bool, value & messages, value &, const std::string &) {
[&]() {
// tools
return json::array();
},
[&](bool, value & messages, value &) {
auto & content = messages->at(0)->at("content");
caps_print_stats(content, "messages[0].content");
if (!content->stats.used) {
@@ -209,7 +201,6 @@ caps caps_get(jinja::program & prog) {
},
});
},
nullptr, // ctx_fn
[&]() {
// tools
return json::array({
@@ -233,7 +224,7 @@ caps caps_get(jinja::program & prog) {
},
});
},
[&](bool success, value & messages, value & tools, const std::string &) {
[&](bool success, value & messages, value & tools) {
if (!success) {
return; // Nothing can be inferred
}
@@ -302,7 +293,6 @@ caps caps_get(jinja::program & prog) {
},
});
},
nullptr, // ctx_fn
[&]() {
// tools
return json::array({
@@ -326,7 +316,7 @@ caps caps_get(jinja::program & prog) {
},
});
},
[&](bool success, value & messages, value & tools, const std::string &) {
[&](bool success, value & messages, value & tools) {
if (!success) {
result.supports_tool_calls = false;
result.supports_tools = false;
@@ -404,7 +394,6 @@ caps caps_get(jinja::program & prog) {
},
});
},
nullptr, // ctx_fn
[&]() {
// tools
return json::array({
@@ -428,7 +417,7 @@ caps caps_get(jinja::program & prog) {
},
});
},
[&](bool success, value & messages, value &, const std::string &) {
[&](bool success, value & messages, value & /*tools*/) {
if (!success) {
result.supports_parallel_tool_calls = false;
return;
@@ -449,22 +438,11 @@ caps caps_get(jinja::program & prog) {
JJ_DEBUG("%s\n", ">>> Running capability check: preserve reasoning");
// case: preserve reasoning content in chat history
const std::string reasoning_placeholder = "<REASONING_CONTENT_PLACEHOLDER>";
caps_try_execute(
prog,
[&]() {
// messages
return json::array({
{
{"role", "user"},
{"content", "User message"}
},
{
{"role", "assistant"},
{"content", "Assistant message"},
// check for reasoning_content deeper in the history, not just in the last assistant message
{"reasoning_content", reasoning_placeholder}
},
{
{"role", "user"},
{"content", "User message"}
@@ -480,13 +458,14 @@ caps caps_get(jinja::program & prog) {
},
});
},
[&](context & ctx) {
caps_apply_preserve_reasoning(ctx, true);
[&]() {
// tools
return json::array();
},
nullptr, // tools_fn
[&](bool, value &, value &, const std::string & output) {
// note: we cannot use stats here because the reasoning_content may be used for "if" condition test, but not actually outputted in the final result
if (output.find(reasoning_placeholder) != std::string::npos) {
[&](bool, value & messages, value &) {
auto & content = messages->at(1)->at("reasoning_content");
caps_print_stats(content, "messages[1].reasoning_content");
if (content->stats.used) {
result.supports_preserve_reasoning = true;
}
}
+1 -5
View File
@@ -12,9 +12,7 @@ struct caps {
bool supports_tool_calls = true;
bool supports_system_role = true;
bool supports_parallel_tool_calls = true;
// supports preserving the reasoning trace in the full history, not just in the last assistant message
bool supports_preserve_reasoning = false;
bool supports_preserve_reasoning = false; // support assistant message with reasoning_content
// one of the 2 content capabilities must be true
bool supports_string_content = true;
@@ -31,6 +29,4 @@ struct caps {
caps caps_get(jinja::program & prog);
void caps_apply_preserve_reasoning(jinja::context & ctx, bool enabled);
} // namespace jinja
+54 -151
View File
@@ -316,22 +316,12 @@ value filter_expression::execute_impl(context & ctx) {
JJ_DEBUG("Applying filter to %s", input->type().c_str());
auto set_filter_alias = [](auto & filter_id) {
if (filter_id == "count") {
filter_id = "length";
} else if (filter_id == "d") {
filter_id = "default";
} else if (filter_id == "e") {
filter_id = "escape";
} else if (filter_id == "trim") {
filter_id = "strip";
}
};
if (is_stmt<identifier>(filter)) {
auto filter_id = cast_stmt<identifier>(filter)->val;
set_filter_alias(filter_id);
if (filter_id == "trim") {
filter_id = "strip"; // alias
}
JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str());
// TODO: Refactor filters so this coercion can be done automatically
if (!input->is_undefined() && !is_val<value_string>(input) && (
@@ -355,7 +345,9 @@ value filter_expression::execute_impl(context & ctx) {
}
auto filter_id = cast_stmt<identifier>(call->callee)->val;
set_filter_alias(filter_id);
if (filter_id == "trim") {
filter_id = "strip"; // alias
}
JJ_DEBUG("Applying filter '%s' with arguments to %s", filter_id.c_str(), input->type().c_str());
func_args args(ctx);
for (const auto & arg_expr : call->args) {
@@ -686,62 +678,59 @@ value set_statement::execute_impl(context & ctx) {
return mk_val<value_undefined>();
}
// Bind the declared parameters of a callable (`this_args`) to the values of an
// invocation (`args`) and store them in `ctx`.
//
//   name      - name of the callable, only used in debug output and error messages
//   this_args - declared parameters: plain identifiers or `key=default` expressions
//   args      - arguments supplied by the caller
//   ctx       - scope that receives the bound parameters
//
// Throws std::runtime_error when a declaration is neither an identifier nor a
// keyword argument, when a keyword key is not an identifier, or when a
// parameter without a default value received no argument.
static inline void bind_parameters(const std::string & name, const statements & this_args, const func_args & args, context & ctx) {
    const size_t n_declared = this_args.size();
    const size_t n_supplied = args.count();

    JJ_DEBUG("Invoking '%s' with %zu input arguments (expected %zu)", name.c_str(), n_supplied, n_declared);

    // extracts the parameter name from a `key=default` declaration
    const auto kwarg_param_name = [&name](const auto * kwarg) -> std::string {
        if (!is_stmt<identifier>(kwarg->key)) {
            throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
        }
        return cast_stmt<identifier>(kwarg->key)->val;
    };

    for (size_t idx = 0; idx < n_declared; ++idx) {
        const auto & decl = this_args[idx];

        if (idx >= n_supplied) {
            // nothing was supplied for this slot: only a declared default can fill it
            if (!is_stmt<keyword_argument_expression>(decl)) {
                throw std::runtime_error("Not enough arguments provided to '" + name + "'");
            }
            auto kwarg = cast_stmt<keyword_argument_expression>(decl);
            const std::string param_name = kwarg_param_name(kwarg);
            JJ_DEBUG(" Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
            // the default expression is evaluated in the context carried by the arguments
            ctx.set_val(param_name, kwarg->val->execute(args.ctx));
            continue;
        }

        // an argument is available: resolve the parameter name from the declaration
        std::string param_name;
        if (is_stmt<identifier>(decl)) {
            // normal parameter
            param_name = cast_stmt<identifier>(decl)->val;
        } else if (is_stmt<keyword_argument_expression>(decl)) {
            // parameter declared with a default, but used as a normal parameter
            param_name = kwarg_param_name(cast_stmt<keyword_argument_expression>(decl));
        } else {
            throw std::runtime_error("Invalid parameter type in '" + name + "'");
        }

        value param_value = args.get_kwarg_or_pos(param_name, idx);
        JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
        ctx.set_val(param_name, param_value);
    }
}
value macro_statement::execute_impl(context & ctx) {
if (!is_stmt<identifier>(this->name)) {
throw std::runtime_error("Macro name must be an identifier");
}
std::string name = cast_stmt<identifier>(this->name)->val;
const func_handler func = [this, name](const func_args & args) -> value {
context macro_ctx(args.ctx); // new scope for macro execution
const func_handler func = [this, name, &ctx](const func_args & args) -> value {
size_t expected_count = this->args.size();
size_t input_count = args.count();
bind_parameters(name, this->args, args, macro_ctx);
JJ_DEBUG("Invoking macro '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
context macro_ctx(ctx); // new scope for macro execution
// bind parameters
for (size_t i = 0; i < expected_count; ++i) {
if (i < input_count) {
if (is_stmt<identifier>(this->args[i])) {
// normal parameter
std::string param_name = cast_stmt<identifier>(this->args[i])->val;
value param_value = args.get_kwarg_or_pos(param_name, i);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
macro_ctx.set_val(param_name, param_value);
} else if (is_stmt<keyword_argument_expression>(this->args[i])) {
// default argument used as normal parameter
auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
if (!is_stmt<identifier>(kwarg->key)) {
throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
}
std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
value param_value = args.get_kwarg_or_pos(param_name, i);
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
macro_ctx.set_val(param_name, param_value);
} else {
throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
}
} else {
auto & default_arg = this->args[i];
if (is_stmt<keyword_argument_expression>(default_arg)) {
auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
if (!is_stmt<identifier>(kwarg->key)) {
throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
}
std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
JJ_DEBUG(" Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
macro_ctx.set_val(param_name, kwarg->val->execute(ctx));
} else {
throw std::runtime_error("Not enough arguments provided to macro '" + name + "'");
}
//std::string param_name = cast_stmt<identifier>(default_args[i])->val;
//JJ_DEBUG(" Binding parameter '%s' to default", param_name.c_str());
//macro_ctx.var[param_name] = default_args[i]->execute(ctx);
}
}
// execute macro body
JJ_DEBUG("Executing macro '%s' body with %zu statements", name.c_str(), this->body.size());
@@ -755,46 +744,6 @@ value macro_statement::execute_impl(context & ctx) {
return mk_val<value_undefined>();
}
// Execute a call statement: invoke the callee with a function named "caller"
// made available in the invocation context. Invoking "caller" binds
// `caller_args` and executes this statement's body.
// Throws std::runtime_error when `call` is not a call expression or when the
// callee does not evaluate to a function value.
// Returns whatever the callee's invoke() returns.
value call_statement::execute_impl(context & ctx) {
// the wrapped statement must be a call expression (cast_stmt yields null otherwise)
auto call_expr = cast_stmt<call_expression>(this->call);
if (!call_expr) {
throw std::runtime_error("Call statement requires a valid call expression");
}
// resolve the callee in the current scope; it has to be a function value
value callee_val = call_expr->callee->execute(ctx);
if (!is_val<value_func>(callee_val)) {
throw std::runtime_error("Callee is not a function: got " + callee_val->type());
}
auto * callee_func = cast_val<value_func>(callee_val);
context caller_ctx(ctx); // new scope for caller execution
// handler exposed as "caller": it owns caller_ctx (moved into the closure) and
// captures `this`, so it must not be invoked after this statement node is destroyed
const func_handler func = [this, caller_ctx = std::move(caller_ctx)](const func_args & args) -> value {
context block_ctx(caller_ctx); // new scope for block execution
// bind the arguments passed to caller(...) to the declared caller_args
bind_parameters("caller", this->caller_args, args, block_ctx);
JJ_DEBUG("Executing call body with %zu statements", this->body.size());
auto res = exec_statements(this->body, block_ctx);
JJ_DEBUG("Call body execution complete, result: %s", res->val_str.str().c_str());
return res;
};
// "caller" is set on a separate context created from ctx, not on ctx itself
context call_ctx(ctx);
call_ctx.set_val("caller", mk_val<value_func>("caller", func));
func_args args(call_ctx);
// argument expressions are evaluated against ctx, while args carries call_ctx
for (const auto & arg_expr : call_expr->args) {
auto arg_val = arg_expr->execute(ctx);
JJ_DEBUG(" Argument type: %s", arg_val->type().c_str());
args.push_back(arg_val);
}
JJ_DEBUG("Calling macro '%s' with %zu arguments", callee_func->name.c_str(), args.count());
return callee_func->invoke(args);
}
value member_expression::execute_impl(context & ctx) {
value object = this->object->execute(ctx);
@@ -812,9 +761,9 @@ value member_expression::execute_impl(context & ctx) {
if (is_stmt<slice_expression>(this->property)) {
auto s = cast_stmt<slice_expression>(this->property);
value start_val = s->start_expr ? s->start_expr->execute(ctx) : mk_val<value_int>(0);
value stop_val = s->stop_expr ? s->stop_expr->execute(ctx) : mk_val<value_int>(arr_size);
value step_val = s->step_expr ? s->step_expr->execute(ctx) : mk_val<value_int>(1);
value start_val = s->start_expr ? s->start_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val<value_int>(arr_size - 1) : mk_val<value_int>(0));
value stop_val = s->stop_expr ? s->stop_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val<value_int>(-1) : mk_val<value_int>(arr_size));
// translate to function call: obj.slice(start, stop, step)
JJ_DEBUG("Member expression is a slice: start %s, stop %s, step %s",
@@ -954,50 +903,4 @@ value keyword_argument_expression::execute_impl(context & ctx) {
return mk_val<value_kwarg>(k, v);
}
std::string runtime::debug_dump_program(const program & prog, const std::string & src) {
std::ostringstream oss;
size_t lvl = 0;
context ctx;
ctx.src.reset(new std::string(src));
auto indent = [](size_t lvl) -> std::string {
return std::string(lvl * 2, ' ');
};
ctx.visitor = [&](bool is_leaf, statement * node, std::vector<visitor_pair> children) {
oss << indent(lvl) << node->type() << ":\n";
lvl++;
if (is_leaf) {
const auto & pos = node->pos;
oss << indent(lvl) << "(leaf) at " << get_line_col(src, pos) << " in source:\n";
std::string snippet = peak_source(src, pos);
string_replace_all(snippet, "\n", "\n" + indent(lvl));
oss << indent(lvl) << snippet << "\n";
} else {
for (auto & [label, children_vec] : children) {
oss << indent(lvl) << label << ":\n";
lvl++;
if (children_vec.empty()) {
oss << indent(lvl) << "<empty>\n\n";
} else {
for (auto * child : children_vec) {
if (!child) {
continue;
}
child->visit(ctx);
}
}
lvl--;
}
}
lvl--;
};
for (const auto & stmt : prog.body) {
stmt->visit(ctx);
}
return oss.str();
}
} // namespace jinja
-128
View File
@@ -47,19 +47,12 @@ const T * cast_stmt(const statement_ptr & ptr) {
// not thread-safe
void enable_debug(bool enable);
// for visiting AST nodes
// function signature: void(bool is_leaf, statement * node, pair of <label, children>)
using visitor_pair = std::pair<std::string, std::vector<statement *>>;
using visitor_fn = std::function<void(bool, statement *, std::vector<visitor_pair>)>;
struct context {
std::shared_ptr<std::string> src; // for debugging; use shared_ptr to avoid copying on scope creation
std::time_t current_time; // for functions that need current time
bool is_get_stats = false; // whether to collect stats
visitor_fn visitor;
// src is optional, used for error reporting
context(std::string src = "") : src(std::make_shared<std::string>(std::move(src))) {
env = mk_val<value_object>();
@@ -106,15 +99,6 @@ private:
value_object env;
};
// utils for visiting AST nodes
static std::vector<statement *> stmts_to_ptr(const statements & stmts) {
std::vector<statement *> children;
for (const auto & stmt : stmts) {
children.push_back(stmt.get());
}
return children;
}
/**
* Base class for all nodes in the AST.
*/
@@ -122,7 +106,6 @@ struct statement {
size_t pos; // position in source, for debugging
virtual ~statement() = default;
virtual std::string type() const { return "Statement"; }
virtual void visit(context & ctx) { ctx.visitor(true, this, {}); }
// execute_impl must be overridden by derived classes
virtual value execute_impl(context &) { throw_exec_error(); }
@@ -183,13 +166,6 @@ struct if_statement : public statement {
std::string type() const override { return "If"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"test", {test.get()}},
{"body", stmts_to_ptr(body)},
{"alternate", stmts_to_ptr(alternate)}
});
}
};
struct identifier;
@@ -214,14 +190,6 @@ struct for_statement : public statement {
std::string type() const override { return "For"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"loopvar", {loopvar.get()}},
{"iterable", {iterable.get()}},
{"body", stmts_to_ptr(body)},
{"default_block", stmts_to_ptr(default_block)}
});
}
};
struct break_statement : public statement {
@@ -273,13 +241,6 @@ struct set_statement : public statement {
std::string type() const override { return "Set"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"assignee", {assignee.get()}},
{"value", {val.get()}},
{"body", stmts_to_ptr(body)}
});
}
};
struct macro_statement : public statement {
@@ -295,13 +256,6 @@ struct macro_statement : public statement {
std::string type() const override { return "Macro"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"name", {name.get()}},
{"args", stmts_to_ptr(args)},
{"body", stmts_to_ptr(body)}
});
}
};
struct comment_statement : public statement {
@@ -335,12 +289,6 @@ struct member_expression : public expression {
}
std::string type() const override { return "MemberExpression"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"object", {object.get()}},
{"property", {property.get()}}
});
}
};
struct call_expression : public expression {
@@ -354,12 +302,6 @@ struct call_expression : public expression {
}
std::string type() const override { return "CallExpression"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"callee", {callee.get()}},
{"args", stmts_to_ptr(args)}
});
}
};
/**
@@ -463,12 +405,6 @@ struct binary_expression : public expression {
}
std::string type() const override { return "BinaryExpression"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"left", {left.get()}},
{"right", {right.get()}}
});
}
};
/**
@@ -495,12 +431,6 @@ struct filter_expression : public expression {
std::string type() const override { return "FilterExpression"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"operand", {operand.get()}},
{"filter", {filter.get()}}
});
}
};
struct filter_statement : public statement {
@@ -513,12 +443,6 @@ struct filter_statement : public statement {
}
std::string type() const override { return "FilterStatement"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"filter", {filter.get()}},
{"body", stmts_to_ptr(body)}
});
}
};
/**
@@ -544,12 +468,6 @@ struct select_expression : public expression {
}
return lhs->execute_impl(ctx);
}
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"lhs", {lhs.get()}},
{"test", {test.get()}}
});
}
};
/**
@@ -568,12 +486,6 @@ struct test_expression : public expression {
}
std::string type() const override { return "TestExpression"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"operand", {operand.get()}},
{"test", {test.get()}}
});
}
};
/**
@@ -589,11 +501,6 @@ struct unary_expression : public expression {
}
std::string type() const override { return "UnaryExpression"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"argument", {argument.get()}}
});
}
};
struct slice_expression : public expression {
@@ -611,13 +518,6 @@ struct slice_expression : public expression {
[[noreturn]] value execute_impl(context &) override {
throw std::runtime_error("must be handled by MemberExpression");
}
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"start_expr", {start_expr.get()}},
{"stop_expr", {stop_expr.get()}},
{"step_expr", {step_expr.get()}}
});
}
};
struct keyword_argument_expression : public expression {
@@ -631,12 +531,6 @@ struct keyword_argument_expression : public expression {
}
std::string type() const override { return "KeywordArgumentExpression"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"key", {key.get()}},
{"val", {val.get()}}
});
}
};
struct spread_expression : public expression {
@@ -645,11 +539,6 @@ struct spread_expression : public expression {
chk_type<expression>(this->argument);
}
std::string type() const override { return "SpreadExpression"; }
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"argument", {argument.get()}}
});
}
};
struct call_statement : public statement {
@@ -663,14 +552,6 @@ struct call_statement : public statement {
for (const auto & arg : this->caller_args) chk_type<expression>(arg);
}
std::string type() const override { return "CallStatement"; }
value execute_impl(context & ctx) override;
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"call", {call.get()}},
{"caller_args", stmts_to_ptr(caller_args)},
{"body", stmts_to_ptr(body)}
});
}
};
struct ternary_expression : public expression {
@@ -693,13 +574,6 @@ struct ternary_expression : public expression {
return false_expr->execute(ctx);
}
}
void visit(context & ctx) override {
ctx.visitor(false, this, {
{"condition", {condition.get()}},
{"true_expr", {true_expr.get()}},
{"false_expr", {false_expr.get()}}
});
}
};
struct raised_exception : public std::exception {
@@ -773,8 +647,6 @@ struct runtime {
}
return parts;
}
static std::string debug_dump_program(const program & prog, const std::string & src);
};
} // namespace jinja
+7 -70
View File
@@ -90,14 +90,14 @@ static T slice(const T & array, int64_t start, int64_t stop, int64_t step = 1) {
stop_val = std::min(stop_val, len);
}
} else {
start_val = start;
start_val = len - 1;
if (start_val < 0) {
start_val = std::max(len + start_val, (int64_t)0);
start_val = std::max(len + start_val, (int64_t)-1);
} else {
start_val = std::min(start_val, len - 1);
}
stop_val = stop;
stop_val = -1;
if (stop_val < -1) {
stop_val = std::max(len + stop_val, (int64_t)-1);
} else {
@@ -673,9 +673,6 @@ const func_builtins & value_string_t::get_builtins() const {
std::string str = val_input->as_string().str();
// FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
if (delim.empty()) {
throw raised_exception("empty separator");
}
int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
auto result = mk_val<value_array>();
size_t pos = 0;
@@ -700,9 +697,6 @@ const func_builtins & value_string_t::get_builtins() const {
std::string str = val_input->as_string().str();
// FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace)
std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " ";
if (delim.empty()) {
throw raised_exception("empty separator");
}
int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1;
auto result = mk_val<value_array>();
size_t pos = 0;
@@ -728,23 +722,10 @@ const func_builtins & value_string_t::get_builtins() const {
if (count > 0) {
throw not_implemented_exception("String replace with count argument not implemented");
}
if (old_str != new_str) {
size_t pos = 0;
if (old_str.empty()) {
std::string new_res;
new_res.reserve(str.length() + new_str.length() * (str.length() + 1));
new_res += new_str;
for (const char c : str) {
new_res.push_back(c);
new_res += new_str;
}
str = new_res;
} else {
while ((pos = str.find(old_str, pos)) != std::string::npos) {
str.replace(pos, old_str.length(), new_str);
pos += new_str.length();
}
}
size_t pos = 0;
while ((pos = str.find(old_str, pos)) != std::string::npos) {
str.replace(pos, old_str.length(), new_str);
pos += new_str.length();
}
auto res = mk_val<value_string>(str);
res->val_str.mark_input_based_on(args.get_pos(0)->val_str);
@@ -1108,50 +1089,6 @@ const func_builtins & value_array_t::get_builtins() const {
std::reverse(arr.begin(), arr.end());
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"min", [](const func_args & args) -> value {
args.ensure_count(1, 4);
args.ensure_vals<value_array>();
value val_case = args.get_kwarg_or_pos("case_sensitive", 1);
value attribute = args.get_kwarg_or_pos("attribute", 2);
if (!attribute->is_undefined()) {
throw not_implemented_exception("min: attribute not implemented");
}
// FIXME: min is currently always case sensitive
(void) val_case;
const auto & arr = args.get_pos(0)->as_array();
if (arr.empty()) {
return mk_val<value_undefined>();
}
value result = arr[0];
for (size_t i = 1; i < arr.size(); ++i) {
if (value_compare(arr[i], result, value_compare_op::lt)) {
result = arr[i];
}
}
return result;
}},
{"max", [](const func_args & args) -> value {
args.ensure_count(1, 4);
args.ensure_vals<value_array>();
value val_case = args.get_kwarg_or_pos("case_sensitive", 1);
value attribute = args.get_kwarg_or_pos("attribute", 2);
if (!attribute->is_undefined()) {
throw not_implemented_exception("max: attribute not implemented");
}
// FIXME: max is currently always case sensitive
(void) val_case;
const auto & arr = args.get_pos(0)->as_array();
if (arr.empty()) {
return mk_val<value_undefined>();
}
value result = arr[0];
for (size_t i = 1; i < arr.size(); ++i) {
if (value_compare(arr[i], result, value_compare_op::gt)) {
result = arr[i];
}
}
return result;
}},
{"unique", array_unique_not_implemented},
};
return builtins;
+324
View File
@@ -0,0 +1,324 @@
#include "json-partial.h"
#include "log.h"
#include <nlohmann/json.hpp>
#include <string>
#include <regex>
using json = nlohmann::ordered_json;
// Kind of entry kept on the nesting stack that is maintained while
// SAX-parsing a JSON document (see common_json_stack_element).
enum common_json_stack_element_type {
// pushed when an object starts, popped when it ends
COMMON_JSON_STACK_ELEMENT_OBJECT,
// pushed when an object key is read, popped once the value for that key is complete
COMMON_JSON_STACK_ELEMENT_KEY,
// pushed when an array starts, popped when it ends
COMMON_JSON_STACK_ELEMENT_ARRAY,
};
// One level of JSON nesting recorded during SAX parsing.
struct common_json_stack_element {
// what this level is: an object, a pending key, or an array
common_json_stack_element_type type;
// the key text for KEY elements; pushed as an empty string for OBJECT and ARRAY elements
std::string key;
};
bool common_json_parse(
const std::string & input,
const std::string & healing_marker,
common_json & out)
{
std::string::const_iterator it = input.begin();
const auto end = input.end();
return common_json_parse(it, end, healing_marker, out);
}
bool common_json_parse(
std::string::const_iterator & it,
const std::string::const_iterator & end,
const std::string & healing_marker,
common_json & out)
{
// https://json.nlohmann.me/features/parsing/sax_interface/
// SAX handler that does not build a document: it only tracks the current
// nesting (objects, pending keys, arrays) in `stack` and records where and
// why the first parse error happened.
struct json_error_locator : public nlohmann::json_sax<json> {
// error position as reported by the parser, minus one (see parse_error)
std::size_t position;
// set to true once parse_error() has been called
bool found_error;
// token text reported together with the error
std::string last_token;
// what() of the exception reported together with the error
std::string exception_message;
// nesting levels that were still open at the point parsing stopped
std::vector<common_json_stack_element> stack;
json_error_locator() : position(0), found_error(false) {}
// Record the details of the error and return false.
// NOTE(review): `position - 1` wraps around if the parser ever reports position 0 - confirm it cannot.
bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
this->position = position - 1;
this->found_error = true;
this->last_token = last_token;
this->exception_message = ex.what();
return false;
}
// A value has been completed: if it was the value of a pending key, drop that key.
void close_value() {
if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
stack.pop_back();
}
}
// Scalar values: the value itself is ignored, it only completes a pending key.
bool null() override { // NOLINT
close_value();
return true;
}
bool boolean(bool) override { // NOLINT
close_value();
return true;
}
bool number_integer(number_integer_t) override { // NOLINT
close_value();
return true;
}
bool number_unsigned(number_unsigned_t) override { // NOLINT
close_value();
return true;
}
bool number_float(number_float_t, const string_t &) override { // NOLINT
close_value();
return true;
}
bool string(string_t &) override { // NOLINT
close_value();
return true;
}
bool binary(binary_t &) override { // NOLINT
close_value();
return true;
}
// Containers: push a level on start; on end, pop it and then treat the
// finished container as a completed value.
bool start_object(std::size_t) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
return true;
}
bool end_object() override {
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
stack.pop_back();
close_value();
return true;
}
// A key stays on the stack until its value has been completed (see close_value).
bool key(string_t & key) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
return true;
}
bool start_array(std::size_t) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
return true;
}
bool end_array() override {
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
stack.pop_back();
close_value();
return true;
}
};
json_error_locator err_loc;
auto start = it;
json::sax_parse(it, end, &err_loc);
if (err_loc.found_error) {
it = start;
auto temptative_end = it + err_loc.position;
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
auto input = std::string(it, temptative_end);
try {
out.json = json::parse(input);
// out.json = json::parse(it, temptative_end);
it = temptative_end;
return true;
} catch (const std::exception & ex) {
// No, needs healing.
LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
}
auto can_parse = [](const std::string & str) {
try {
auto _ = json::parse(str); // NOLINT
return true;
} catch (const std::exception &) {
return false;
}
};
if (!healing_marker.empty() && !err_loc.stack.empty()) {
std::string str(it, temptative_end);
auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
if (last_non_sp_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
}
auto last_non_sp_char = str[last_non_sp_pos];
// Used to detect stops on a number, which may not be complete.
auto was_maybe_number = [&]() {
if (!str.empty() && std::isspace(str.back())) {
return false;
}
return std::isdigit(last_non_sp_char) ||
last_non_sp_char == '.' ||
last_non_sp_char == 'e' ||
last_non_sp_char == 'E' ||
last_non_sp_char == '-';
};
std::string closing;
for (size_t i = err_loc.stack.size(); i > 0; i--) {
auto & el = err_loc.stack[i - 1];
if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
closing += "}";
} else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
closing += "]";
} else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
throw std::runtime_error("Unexpected stack element type");
}
}
// Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
auto is_high_surrogate = [&](const std::string & s) {
// Check if a partial of a high surrogate (U+D800-U+DBFF)
return s.length() >= 4 &&
s[0] == '\\' && s[1] == 'u' &&
std::tolower(s[2]) == 'd' &&
(s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
};
// Initialize the unicode marker to a low surrogate to handle the edge case
// where a high surrogate (U+D800-U+DBFF) is immediately followed by a
// backslash (\)
std::string unicode_marker_padding = "udc00";
std::smatch last_unicode_seq;
if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
std::smatch second_last_seq;
std::string prelude = str.substr(0, last_unicode_seq.position());
// Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
if (is_high_surrogate(last_unicode_seq.str())) {
// If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
unicode_marker_padding += "\\udc00";
} else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
if (is_high_surrogate(second_last_seq.str())) {
// If this follows a high surrogate, pad it to be a low surrogate
if (last_unicode_seq.length() == 2) {
unicode_marker_padding = "dc00";
} else if (last_unicode_seq.length() == 3) {
unicode_marker_padding = "c00";
} else {
// The original unicode_marker_padding is already padded with 0s
}
}
}
}
const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
// We're inside an object value
if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
// Was about to create an object value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
} else if (can_parse(str + ": 1" + closing)) {
str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
} else if (last_non_sp_char == '{' && can_parse(str + closing)) {
// Was about to create an object
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + "\"" + closing)) {
// Was inside an object value string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
// Was inside an object value string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
} else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
// Was inside an object value string after a partial unicode escape
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
} else {
// find last :
auto last_pos = str.find_last_of(':');
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
}
// Cutting back to opening : for object value
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
// Was about to create an array value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
} else if (can_parse(str + "\"" + closing)) {
// Was inside an array value string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
// Was inside an array value string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
} else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
// Was inside an array value string after a partial unicode escape
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
} else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
// Had just finished a value
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
} else {
auto last_pos = str.find_last_of("[,");
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
}
// Cutting back to last [ or , for array value
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
(last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
// Was about to create an object key+value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
} else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
// Was about to create an object key+value
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + "\": 1" + closing)) {
// Was inside an object key string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
// Was inside an object key string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
// Was inside an object key string after a partial unicode escape
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
} else {
auto last_pos = str.find_last_of(':');
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
}
// fprintf(stderr, "Cutting back to last : for object key+value\n");
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else {
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
}
// fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
out.json = json::parse(str);
it = temptative_end;
return true;
}
// handle unclosed top-level primitive
if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
std::string str(it, temptative_end);
const auto & magic_seed = out.healing_marker.marker = healing_marker;
if (can_parse(str + "\"")) {
// Was inside an string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
// Was inside an string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
} else {
// TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
// fprintf(stderr, "Closing: TODO\n");
return false;
}
out.json = json::parse(str);
it = temptative_end;
return true;
}
return false;
}
out.json = json::parse(it, end);
it = end;
return true;
}
+39
View File
@@ -0,0 +1,39 @@
#pragma once
// TODO: use json_fwd.hpp when possible
#include <nlohmann/json.hpp>
// Healing marker attached to a parse result.
// Both fields are empty if the JSON was fully parsed / wasn't healed.
struct common_healing_marker {
// Raw marker, i.e. the `healing_marker` string that was passed to the parser.
std::string marker;
// The marker as it appears in the JSON dump (possibly with extra characters around it).
// Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield
// the original partial JSON string (modulo spaces / if it had the same dump format).
std::string json_dump_marker;
};
// Represents a parsed JSON value together with its optional healing marker
// (a JSON dump fragment that can be used to find the position of healing in the JSON dump string).
struct common_json {
// The parsed (and possibly healed) JSON value.
nlohmann::ordered_json json;
// Where / how the value was healed; see common_healing_marker.
common_healing_marker healing_marker;
};
// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
//
// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
// This makes it possible to parse the resulting healed JSON string while still being able to cut it again at the healing marker if needed.
// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
//
// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
bool common_json_parse(
const std::string & input,
const std::string & healing_marker,
common_json & out);
// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
bool common_json_parse(
std::string::const_iterator & it,
const std::string::const_iterator & end,
const std::string & healing_marker,
common_json & out);
+23 -23
View File
@@ -233,27 +233,27 @@ struct BuiltinRule {
};
static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"boolean", {"(\"true\" | \"false\")", {}}},
{"boolean", {"(\"true\" | \"false\") space", {}}},
{"decimal-part", {"[0-9]{1,16}", {}}},
{"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)?", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part)", {"integral-part"}}},
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? space \"}\"", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? space \"]\"", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\"", {}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\"", {"char"}}},
{"null", {"\"null\"", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
static std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
{"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date-time", {"date \"T\" time", {"date", "time"}}},
{"date-string", {"\"\\\"\" date \"\\\"\"", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\"", {"time"}}},
{"date-time-string", {"\"\\\"\" date-time \"\\\"\"", {"date-time"}}}
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
{"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
};
static bool is_reserved_name(const std::string & name) {
@@ -551,16 +551,16 @@ private:
}
return join_seq();
};
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"");
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
}
/*
Returns a rule that matches a JSON string that is none of the provided strings
not_strings({"a"})
-> ["] ( [a] char+ | [^"a] char* )? ["]
-> ["] ( [a] char+ | [^"a] char* )? ["] space
not_strings({"and", "also"})
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["]
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
*/
std::string _not_strings(const std::vector<std::string> & strings) {
@@ -619,7 +619,7 @@ private:
if (!trie.is_end_of_string) {
out << "?";
}
out << " [\"]";
out << " [\"] space";
return out.str();
}
@@ -725,7 +725,7 @@ private:
rule += " )?";
}
rule += " space \"}\"";
rule += " \"}\" space";
return rule;
}
@@ -858,14 +858,14 @@ public:
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
}
if (schema.contains("const")) {
return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
}
if (schema.contains("enum")) {
std::vector<std::string> enum_values;
for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v));
}
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ")");
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
}
if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") ||
@@ -933,7 +933,7 @@ public:
}
}
if (!enum_intersection.empty()) {
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ")");
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
}
}
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
@@ -948,7 +948,7 @@ public:
}
rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i));
}
rule += " space \"]\"";
rule += " \"]\" space";
return _add_rule(rule_name, rule);
}
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
@@ -956,7 +956,7 @@ public:
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " space \"]\"");
return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
}
if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
return _visit_pattern(schema["pattern"], rule_name);
@@ -972,7 +972,7 @@ public:
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\"");
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
}
if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
int64_t min_value = std::numeric_limits<int64_t>::min();
@@ -990,7 +990,7 @@ public:
std::stringstream out;
out << "(";
build_min_max_int(min_value, max_value, out);
out << ")";
out << ") space";
return _add_rule(rule_name, out.str());
}
if (schema.empty() || schema_type == "object") {
+77 -83
View File
@@ -11,13 +11,8 @@
#include <sstream>
#include <thread>
#include <vector>
#include <algorithm>
#if defined(_WIN32)
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <io.h>
# include <windows.h>
# define isatty _isatty
@@ -67,15 +62,16 @@ static const char* g_col[] = {
};
struct common_log_entry {
enum ggml_log_level level {GGML_LOG_LEVEL_INFO};
enum ggml_log_level level;
bool prefix;
int64_t timestamp;
std::vector<char> msg;
int64_t timestamp { 0 };
bool is_end { false }; // signals the worker thread to stop
bool prefix { false };
common_log_entry(size_t size = 256) : msg(size) { }
// signals the worker thread to stop
bool is_end;
void print(FILE * file = nullptr) const {
FILE * fcur = file;
@@ -126,15 +122,22 @@ struct common_log_entry {
};
struct common_log {
// default capacity
common_log(size_t capacity = 512) {
file = nullptr;
prefix = false;
timestamps = false;
running = false;
t_start = t_us();
// default capacity - will be expanded if needed
common_log() : common_log(256) {}
common_log(size_t capacity) {
file = nullptr;
prefix = false;
timestamps = false;
running = false;
t_start = t_us();
// initial message size - will be expanded if longer messages arrive
entries.resize(capacity);
for (auto & entry : entries) {
entry.msg.resize(256);
}
queue.resize(capacity, common_log_entry(256));
head = 0;
tail = 0;
@@ -149,10 +152,9 @@ struct common_log {
}
private:
std::mutex mtx;
std::thread thrd;
std::condition_variable cv_new; // new entry
std::condition_variable cv_full; // wait on full
std::mutex mtx;
std::thread thrd;
std::condition_variable cv;
FILE * file;
@@ -162,53 +164,24 @@ private:
int64_t t_start;
// queue of entries
std::vector<common_log_entry> queue;
// ring buffer of entries
std::vector<common_log_entry> entries;
size_t head;
size_t tail;
bool print_entry(const common_log_entry & e) const {
if (e.is_end) return true;
e.print();
if (file) {
e.print(file);
}
return false;
}
bool flush_queue(size_t start_head, size_t end_tail, size_t & out_head) const {
bool stop = false;
size_t h = start_head;
while (h != end_tail && !stop) {
stop = print_entry(queue[h]);
h = (h + 1) % queue.size();
}
out_head = h;
return stop;
}
// worker thread copies into this
common_log_entry cur;
public:
bool is_full() const {
return ((tail + 1) % queue.size()) == head;
}
bool is_empty() const {
return head == tail;
}
void add(enum ggml_log_level level, const char * fmt, va_list args) {
std::unique_lock<std::mutex> lock(mtx);
// block if the queue is full
cv_full.wait(lock, [this]() { return !running || !is_full(); });
std::lock_guard<std::mutex> lock(mtx);
if (!running) {
// discard messages while the worker thread is paused
return;
}
auto & entry = queue[tail];
auto & entry = entries[tail];
{
// cannot use args twice, so make a copy in case we need to expand the buffer
@@ -243,16 +216,38 @@ public:
va_end(args_copy);
}
entry.is_end = false;
entry.level = level;
entry.prefix = prefix;
entry.level = level;
entry.prefix = prefix;
entry.timestamp = 0;
if (timestamps) {
entry.timestamp = t_us() - t_start;
}
entry.is_end = false;
tail = (tail + 1) % queue.size();
cv_new.notify_one();
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
std::vector<common_log_entry> new_entries(2*entries.size());
size_t new_tail = 0;
do {
new_entries[new_tail] = std::move(entries[head]);
head = (head + 1) % entries.size();
new_tail = (new_tail + 1);
} while (head != tail);
head = 0;
tail = new_tail;
for (size_t i = tail; i < new_entries.size(); i++) {
new_entries[i].msg.resize(256);
}
entries = std::move(new_entries);
}
cv.notify_one();
}
void resume() {
@@ -266,24 +261,23 @@ public:
thrd = std::thread([this]() {
while (true) {
std::unique_lock<std::mutex> lock(mtx);
cv_new.wait(lock, [this]() { return !is_empty(); });
{
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return head != tail; });
cur = entries[head];
size_t cached_head = head;
size_t cached_tail = tail;
head = (head + 1) % entries.size();
}
lock.unlock(); // drop the lock during flush
size_t next_head;
bool stop = flush_queue(cached_head, cached_tail, next_head);
lock.lock();
head = next_head;
cv_full.notify_all();
if (stop) {
if (cur.is_end) {
break;
}
cur.print(); // stdout and stderr
if (file) {
cur.print(file);
}
}
});
}
@@ -299,13 +293,13 @@ public:
running = false;
// push an entry to signal the worker thread to stop
auto & entry = queue[tail];
entry.is_end = true;
tail = (tail + 1) % queue.size();
{
auto & entry = entries[tail];
entry.is_end = true;
// wakeup everyone
cv_new.notify_one();
cv_full.notify_all();
tail = (tail + 1) % entries.size();
}
cv.notify_one();
}
thrd.join();
+78 -206
View File
@@ -6,14 +6,13 @@
#include "unicode.h"
#include <algorithm>
#include <deque>
#include <initializer_list>
#include <map>
#include <memory>
#include <nlohmann/json.hpp>
#include <regex>
#include <set>
#include <stdexcept>
#include <unordered_set>
// Trick to catch missing branches
template <typename T>
@@ -89,7 +88,40 @@ struct trie {
return match_result{match_result::NO_MATCH};
}
// One entry produced by collect_prefix_and_next(): a path through the trie
// plus the characters that can follow it.
struct prefix_and_next {
// Characters on the path leading to a node (empty for the starting node).
std::vector<uint32_t> prefix;
// Keys of that node's `children`, i.e. the characters that extend the prefix.
std::vector<uint32_t> next_chars;
};
// Walks the trie starting at node 0 and returns one (prefix, next characters)
// entry for every non-word node that has children.
std::vector<prefix_and_next> collect_prefix_and_next() {
    std::vector<prefix_and_next> collected;
    std::vector<uint32_t> path;
    collect_prefix_and_next(0, path, collected);
    return collected;
}
private:
// Recursive worker for collect_prefix_and_next().
//
// `index`  - node currently being visited
// `prefix` - characters on the path to `index`; extended while descending and
//            restored before returning
// `out`    - receives one entry per visited non-word node that has children
void collect_prefix_and_next(size_t index, std::vector<uint32_t> & prefix, std::vector<prefix_and_next> & out) {
    const auto & children = nodes[index].children;

    // Word nodes and leaf nodes do not contribute an entry of their own.
    if (!nodes[index].is_word && !children.empty()) {
        std::vector<uint32_t> next;
        next.reserve(children.size());
        for (const auto & edge : children) {
            next.push_back(edge.first);
        }
        out.emplace_back(prefix_and_next{prefix, next});
    }

    // Descend into every child, including those below word nodes.
    for (const auto & edge : children) {
        prefix.push_back(edge.first);
        collect_prefix_and_next(edge.second, prefix, out);
        prefix.pop_back();
    }
}
size_t create_node() {
size_t index = nodes.size();
nodes.emplace_back();
@@ -121,65 +153,6 @@ struct trie {
}
};
// Aho-Corasick automaton
//
// Built on top of a `trie` of the given strings. The constructor computes, for
// every trie node, a failure link (`fail`), whether the node is a match state
// (`terminal`), and the set of all characters that label a trie edge (`alphabet`).
// States are trie node indices; state 0 is the start state.
struct aho_corasick {
trie t;
std::vector<size_t> fail; // failure links
std::vector<size_t> order; // states in BFS order
std::vector<bool> terminal; // match states (directly or via a suffix link)
std::set<uint32_t> alphabet; // every character with a transition
// Builds the trie from `strings`, then derives `fail`, `order`, `terminal` and `alphabet`.
aho_corasick(const std::vector<std::string> & strings) : t(strings) {
const auto & nodes = t.nodes;
const size_t n = nodes.size();
// All failure links start at 0; children of node 0 keep that value.
fail.assign(n, 0);
order.reserve(n);
// Breadth-first traversal from node 0, recording the visit order.
std::deque<size_t> queue{ 0 };
while (!queue.empty()) {
size_t u = queue.front();
queue.pop_front();
order.push_back(u);
for (const auto & [ch, v] : nodes[u].children) {
if (u != 0) {
// Follow the parent's failure chain until a node with a `ch` transition
// is found or node 0 is reached.
size_t f = fail[u];
while (f && nodes[f].children.find(ch) == nodes[f].children.end()) {
f = fail[f];
}
auto it = nodes[f].children.find(ch);
// Fall back to 0 when there is no such transition (or it would point at `v` itself).
fail[v] = (it != nodes[f].children.end() && it->second != v) ? it->second : 0;
}
queue.push_back(v);
}
}
terminal.assign(n, false);
// Visited in BFS order; presumably the failure target of `u` always appears
// earlier in `order`, so its flag is already final when read here.
for (size_t u : order) {
terminal[u] = nodes[u].is_word || (u != 0 && terminal[fail[u]]);
}
// Collect every edge label of the trie.
for (const auto & node : nodes) {
for (const auto & [ch, v] : node.children) {
alphabet.insert(ch);
}
}
}
// Number of automaton states (one per trie node).
size_t num_states() const { return t.nodes.size(); }
// True if state `s` was flagged as a match state by the constructor.
bool is_terminal(size_t s) const { return terminal[s]; }
// follow failure links until a transition on `ch` exists.
// Returns the target of that transition, or 0 if state 0 has no transition on `ch`.
size_t next(size_t state, uint32_t ch) const {
const auto & nodes = t.nodes;
while (state && nodes[state].children.find(ch) == nodes[state].children.end()) {
state = fail[state];
}
auto it = nodes[state].children.find(ch);
return it != nodes[state].children.end() ? it->second : 0;
}
};
static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
if (pos + hex_count > str.length()) {
return {0, 0};
@@ -921,10 +894,6 @@ struct parser_executor {
common_peg_parse_result operator()(const common_peg_gbnf_parser & p) {
return arena.parse(p.child, ctx, start_pos);
}
common_peg_parse_result operator()(const common_peg_ac_parser & p) {
return arena.parse(p.child, ctx, start_pos);
}
};
common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
@@ -993,8 +962,7 @@ void common_peg_arena::resolve_refs() {
std::is_same_v<T, common_peg_not_parser> ||
std::is_same_v<T, common_peg_tag_parser> ||
std::is_same_v<T, common_peg_atomic_parser> ||
std::is_same_v<T, common_peg_gbnf_parser> ||
std::is_same_v<T, common_peg_ac_parser>) {
std::is_same_v<T, common_peg_gbnf_parser>) {
p.child = resolve_ref(p.child);
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
p.child = resolve_ref(p.child);
@@ -1024,12 +992,12 @@ void common_peg_arena::resolve_refs() {
}
std::string common_peg_arena::dump(common_peg_parser_id id) const {
std::set<common_peg_parser_id> visited;
std::unordered_set<common_peg_parser_id> visited;
return dump_impl(id, visited);
}
std::string common_peg_arena::dump_impl(common_peg_parser_id id,
std::set<common_peg_parser_id> & visited) const {
std::unordered_set<common_peg_parser_id> & visited) const {
// Check for cycles
if (visited.count(id)) {
return "[cycle]";
@@ -1075,8 +1043,6 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id
return "Atomic(" + dump_impl(p.child, visited) + ")";
} else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
return "Gbnf(" + p.grammar + ", " + dump_impl(p.child, visited) + ")";
} else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
return "Ac(" + string_join(p.delimiters, " | ") + ", " + dump_impl(p.child, visited) + ")";
} else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
return "Any";
} else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
@@ -1306,13 +1272,13 @@ common_peg_parser common_peg_parser_builder::string_content(char delimiter) {
common_peg_parser common_peg_parser_builder::double_quoted_string() {
return rule("double-quoted-string", [this]() {
return sequence({literal("\""), string_content('"'), literal("\"")});
return sequence({literal("\""), string_content('"'), literal("\""), space()});
});
}
common_peg_parser common_peg_parser_builder::single_quoted_string() {
return rule("single-quoted-string", [this]() {
return sequence({literal("'"), string_content('\''), literal("'")});
return sequence({literal("'"), string_content('\''), literal("'"), space()});
});
}
@@ -1335,25 +1301,25 @@ common_peg_parser common_peg_parser_builder::json_number() {
// At EOF in partial mode, chars returns NEED_MORE → negate propagates NEED_MORE → number not committed.
// This prevents premature commits of partial numbers (e.g. "3" when "3.14" is incoming).
auto not_number_continuation = negate(chars("[0-9.eE+-]", 1, 1));
return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation });
return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation, space() });
});
}
common_peg_parser common_peg_parser_builder::json_string() {
return rule("json-string", [this]() {
return sequence({literal("\""), string_content('"'), literal("\"")});
return sequence({literal("\""), string_content('"'), literal("\""), space()});
});
}
common_peg_parser common_peg_parser_builder::json_bool() {
return rule("json-bool", [this]() {
return choice({literal("true"), literal("false")});
return sequence({choice({literal("true"), literal("false")}), space()});
});
}
common_peg_parser common_peg_parser_builder::json_null() {
return rule("json-null", [this]() {
return literal("null");
return sequence({literal("null"), space()});
});
}
@@ -1368,7 +1334,8 @@ common_peg_parser common_peg_parser_builder::json_object() {
choice({
literal("}"),
sequence({members, ws, literal("}")})
})
}),
ws
});
});
}
@@ -1376,14 +1343,15 @@ common_peg_parser common_peg_parser_builder::json_object() {
common_peg_parser common_peg_parser_builder::json_array() {
return rule("json-array", [this]() {
auto ws = space();
auto elements = sequence({json(), zero_or_more(sequence({ws, literal(","), ws, json()}))});
auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
return sequence({
literal("["),
ws,
choice({
literal("]"),
sequence({elements, ws, literal("]")})
})
}),
ws
});
});
}
@@ -1413,13 +1381,16 @@ common_peg_parser common_peg_parser_builder::python_number() {
common_peg_parser common_peg_parser_builder::python_bool() {
return rule("python-bool", [this]() {
return choice({literal("True"), literal("False")});
return sequence({
choice({literal("True"), literal("False")}),
space()
});
});
}
common_peg_parser common_peg_parser_builder::python_null() {
return rule("python-none", [this]() {
return literal("None");
return sequence({literal("None"), space()});
});
}
@@ -1486,13 +1457,6 @@ common_peg_parser common_peg_parser_builder::json_member(const std::string & key
});
}
// Wraps parser `p` in a common_peg_ac_parser carrying `delimiters` and registers it via add().
// Throws std::runtime_error if `delimiters` is empty.
common_peg_parser common_peg_parser_builder::ac(const common_peg_parser & p, const std::vector<std::string> & delimiters) {
if (delimiters.empty()) {
throw std::runtime_error("ac parser requires at least one delimiter");
}
return add(common_peg_ac_parser{p, delimiters});
}
static std::string gbnf_escape_char_class(uint32_t c) {
if (c == '-' || c == ']' || c == '[' || c == '\\') {
return "\\" + std::string(1, (char) c);
@@ -1543,118 +1507,41 @@ static std::string gbnf_escape_char_class(uint32_t c) {
return std::string(buf);
}
// Formats `chars` as a GBNF character class: "[...]", or "[^...]" when `negate`
// is set. Each character is escaped with gbnf_escape_char_class().
static std::string gbnf_char_class(const std::vector<uint32_t> & chars, bool negate) {
    std::string cls(negate ? "[^" : "[");
    for (const uint32_t cp : chars) {
        cls.append(gbnf_escape_char_class(cp));
    }
    cls.push_back(']');
    return cls;
}
static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
trie matcher(strings);
auto pieces = matcher.collect_prefix_and_next();
static std::string gbnf_ac_grammar(
const common_grammar_builder & builder,
const std::string & prefix,
const std::vector<std::string> & strings,
const std::function<std::string(const std::vector<uint32_t> &,
const std::map<size_t, std::vector<uint32_t>> &,
const std::vector<uint32_t> &,
const std::function<std::string(size_t)> &)> & build_rule) {
aho_corasick ac(strings);
auto state_name = [&](size_t s) -> std::string {
if (s == 0) {
return prefix;
}
std::string num = std::to_string(s);
num = num.size() == 1 ? ("0" + num) : num;
return prefix + "-" + num;
};
for (size_t q = 0; q < ac.num_states(); q++) {
if (ac.is_terminal(q)) {
continue; // match states
std::string pattern;
for (size_t i = 0; i < pieces.size(); ++i) {
if (i > 0) {
pattern += " | ";
}
std::map<size_t, std::vector<uint32_t>> buckets;
std::vector<uint32_t> completing; // chars that complete a delimiter
std::vector<uint32_t> specific; // chars with an explicit transition
for (uint32_t c : ac.alphabet) {
size_t d = ac.next(q, c);
if (ac.is_terminal(d)) {
completing.push_back(c);
specific.push_back(c);
} else if (d != 0) {
buckets[d].push_back(c); // specific non-root destination
specific.push_back(c);
}
const auto & pre = pieces[i].prefix;
const auto & chars = pieces[i].next_chars;
std::string cls;
cls.reserve(chars.size());
for (uint32_t ch : chars) {
cls += gbnf_escape_char_class(ch);
}
builder.add_rule(state_name(q), build_rule(completing, buckets, specific, state_name));
if (!pre.empty()) {
pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
} else {
pattern += "[^" + cls + "]";
}
}
// An empty delimiter makes the start state terminal. Emit an entry rule
// that matches the empty string so the returned reference stays valid.
if (ac.is_terminal(0)) {
builder.add_rule(prefix, "|");
}
return state_name(0);
return "(" + pattern + ")*";
}
// Builds GBNF rules that accept any text in which none of `strings` occurs as
// a substring. The rules are produced by gbnf_ac_grammar() from the
// Aho-Corasick automaton of `strings`; the returned value is the name of the
// rule for the start state.
//
// ref: https://github.com/ggml-org/llama.cpp/pull/24839
static std::string gbnf_excluding_grammar(const common_grammar_builder & builder,
                                          const std::string & prefix,
                                          const std::vector<std::string> & strings) {
    const auto build_rule = [](const std::vector<uint32_t> & /*completing*/,
                               const std::map<size_t, std::vector<uint32_t>> & buckets,
                               const std::vector<uint32_t> & specific,
                               const std::function<std::string(size_t)> & state_name) -> std::string {
        // The leading empty alternative makes every state accepting. Chars
        // that would complete a delimiter get no alternative at all, so a
        // forbidden string can never be produced.
        std::string rhs("|");
        for (const auto & bucket : buckets) {
            rhs.append(" ");
            rhs.append(gbnf_char_class(bucket.second, false));
            rhs.append(" ");
            rhs.append(state_name(bucket.first));
            rhs.append(" |");
        }
        // Any char without an explicit transition falls back to the start state.
        rhs.append(" ");
        rhs.append(gbnf_char_class(specific, true));
        rhs.append(" ");
        rhs.append(state_name(0));
        return rhs;
    };
    return gbnf_ac_grammar(builder, prefix, strings, build_rule);
}
// Builds GBNF rules that accept text up to and including the first occurrence
// of any entry of `strings`. The rules are produced by gbnf_ac_grammar() from
// the Aho-Corasick automaton of `strings`; the returned value is the name of
// the rule for the start state.
static std::string gbnf_including_grammar(const common_grammar_builder & builder,
                                          const std::string & prefix,
                                          const std::vector<std::string> & strings) {
    const auto build_rule = [](const std::vector<uint32_t> & completing,
                               const std::map<size_t, std::vector<uint32_t>> & buckets,
                               const std::vector<uint32_t> & specific,
                               const std::function<std::string(size_t)> & state_name) -> std::string {
        std::vector<std::string> alternatives;

        // A char that completes a delimiter ends the rule (no follow-up state).
        if (!completing.empty()) {
            alternatives.push_back(gbnf_char_class(completing, false));
        }

        // Chars with an explicit, non-root destination move to that state.
        for (const auto & bucket : buckets) {
            std::string alt = gbnf_char_class(bucket.second, false);
            alt += " ";
            alt += state_name(bucket.first);
            alternatives.push_back(alt);
        }

        // Everything else resumes scanning from the start state.
        std::string fallback = gbnf_char_class(specific, true);
        fallback += " ";
        fallback += state_name(0);
        alternatives.push_back(fallback);

        return string_join(alternatives, " | ");
    };
    return gbnf_ac_grammar(builder, prefix, strings, build_rule);
}
static std::set<std::string> collect_reachable_rules(
static std::unordered_set<std::string> collect_reachable_rules(
const common_peg_arena & arena,
const common_peg_parser_id & rule
) {
std::set<std::string> reachable;
std::set<std::string> visited;
std::unordered_set<std::string> reachable;
std::unordered_set<std::string> visited;
std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
const auto & parser = arena.get(id);
@@ -1686,7 +1573,6 @@ static std::set<std::string> collect_reachable_rules(
std::is_same_v<T, common_peg_tag_parser> ||
std::is_same_v<T, common_peg_atomic_parser> ||
std::is_same_v<T, common_peg_gbnf_parser> ||
std::is_same_v<T, common_peg_ac_parser> ||
std::is_same_v<T, common_peg_schema_parser>) {
visit(p.child);
} else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
@@ -1864,7 +1750,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
if (p.delimiters.empty()) {
return ".*";
}
return gbnf_excluding_grammar(builder, "until-" + std::to_string(id), p.delimiters);
return gbnf_excluding_pattern(p.delimiters);
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
if (schema_delegates(p)) {
return to_gbnf(p.child);
@@ -1881,8 +1767,6 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
return to_gbnf(p.child);
} else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
return p.grammar;
} else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
return gbnf_including_grammar(builder, "ac-" + std::to_string(id), p.delimiters);
} else {
static_assert(is_always_false_v<T>);
}
@@ -1890,7 +1774,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
};
// Collect reachable rules
std::set<std::string> reachable_rules;
std::unordered_set<std::string> reachable_rules;
if (lazy) {
// Collect rules reachable from trigger rules
@@ -2019,8 +1903,6 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant &
};
} else if constexpr (std::is_same_v<T, common_peg_gbnf_parser>) {
return json{{"type", "gbnf"}, {"child", p.child}, {"grammar", p.grammar}};
} else if constexpr (std::is_same_v<T, common_peg_ac_parser>) {
return json{{"type", "ac"}, {"child", p.child}, {"delimiters", p.delimiters}};
}
}, variant);
}
@@ -2193,16 +2075,6 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json
};
}
if (type == "ac") {
if (!j.contains("child") || !j.contains("delimiters") || !j["delimiters"].is_array() || j["delimiters"].empty()) {
throw std::runtime_error("ac parser requires 'child' and a non-empty 'delimiters' array");
}
return common_peg_ac_parser{
j["child"].get<common_peg_parser_id>(),
j["delimiters"].get<std::vector<std::string>>(),
};
}
throw std::runtime_error("Unknown parser type: " + type);
}
+3 -16
View File
@@ -3,8 +3,8 @@
#include <nlohmann/json_fwd.hpp>
#include <memory>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <string>
#include <string_view>
#include <functional>
@@ -275,11 +275,6 @@ struct common_peg_gbnf_parser {
std::string grammar;
};
// Parser node for the `ac` combinator: pairs a child parser with the list of
// delimiter strings used when generating its grammar.
struct common_peg_ac_parser {
// id of the wrapped child parser
common_peg_parser_id child;
// delimiter strings; the builder and the deserializer both reject an empty list
std::vector<std::string> delimiters;
};
// Variant holding all parser types
using common_peg_parser_variant = std::variant<
common_peg_epsilon_parser,
@@ -301,8 +296,7 @@ using common_peg_parser_variant = std::variant<
common_peg_ref_parser,
common_peg_atomic_parser,
common_peg_tag_parser,
common_peg_gbnf_parser,
common_peg_ac_parser
common_peg_gbnf_parser
>;
class common_peg_arena {
@@ -341,7 +335,7 @@ class common_peg_arena {
friend class common_peg_parser_builder;
private:
std::string dump_impl(common_peg_parser_id id, std::set<common_peg_parser_id> & visited) const;
std::string dump_impl(common_peg_parser_id id, std::unordered_set<common_peg_parser_id> & visited) const;
common_peg_parser_id add_parser(common_peg_parser_variant parser);
void add_rule(const std::string & name, common_peg_parser_id id);
@@ -520,13 +514,6 @@ class common_peg_parser_builder {
// the child's grammar. Parsing delegates entirely to the child.
common_peg_parser gbnf(const common_peg_parser & p, const std::string & grammar) { return add(common_peg_gbnf_parser{p, grammar}); }
// Wraps a child parser but emits a GBNF grammar built from the Aho-Corasick
// automaton of `delimiters`, matching everything up to and including the
// first delimiter. Parsing delegates entirely to the child, which is
// responsible for consuming the delimiter (e.g. until(D) + literal(D)).
common_peg_parser ac(const common_peg_parser & p, const std::vector<std::string> & delimiters);
common_peg_parser ac(const common_peg_parser & p, const std::string & delimiter) { return ac(p, std::vector<std::string>{delimiter}); }
void set_root(const common_peg_parser & p);
common_peg_arena build();
+49 -26
View File
@@ -7,7 +7,6 @@
#include <fstream>
#include <sstream>
#include <filesystem>
#include <regex>
static std::string rm_leading_dashes(const std::string & str) {
size_t pos = 0;
@@ -17,21 +16,46 @@ static std::string rm_leading_dashes(const std::string & str) {
return str.substr(pos);
}
static std::string canonical_tag(const std::string & tag) {
static const std::regex re_tag("[-.]([A-Z0-9_]+)$", std::regex::icase);
std::smatch m;
if (std::regex_search(tag, m, re_tag)) {
std::string canon = m[1].str();
for (char & c : canon) {
c = (char) std::toupper((unsigned char) c);
// only allow a subset of args for remote presets for security reasons
// do not add more args unless absolutely necessary
// args that output to files are strictly prohibited
static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
static const std::set<std::string> allowed_options = {
"model-url",
"hf-repo",
"hf-repo-draft",
"hf-repo-v", // vocoder
"hf-file-v", // vocoder
"mmproj-url",
"pooling",
"jinja",
"batch-size",
"ubatch-size",
"cache-reuse",
"chat-template-kwargs",
"mmap",
// note: sampling params are automatically allowed by default
// negated args will be added automatically if the positive arg is specified above
};
std::set<std::string> allowed_keys;
for (const auto & it : key_to_opt) {
const std::string & key = it.first;
const common_arg & opt = it.second;
if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
allowed_keys.insert(key);
// also add variant keys (args without leading dashes and env vars)
for (const auto & arg : opt.get_args()) {
allowed_keys.insert(rm_leading_dashes(arg));
}
for (const auto & env : opt.get_env()) {
allowed_keys.insert(env);
}
}
return canon;
}
std::string upper = tag;
for (char & c : upper) {
c = (char) std::toupper((unsigned char) c);
}
return upper;
return allowed_keys;
}
std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
@@ -276,10 +300,16 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
return value;
}
common_preset_context::common_preset_context(llama_example ex)
common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
: ctx_params(common_params_parser_init(default_params, ex)) {
common_params_add_preset_options(ctx_params.options);
key_to_opt = get_map_key_opt(ctx_params);
// setup allowed keys if only_remote_allowed is true
if (only_remote_allowed) {
filter_allowed_keys = true;
allowed_keys = get_remote_preset_whitelist(key_to_opt);
}
}
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
@@ -288,18 +318,11 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
for (auto section : ini_data) {
common_preset preset;
std::string section_name = section.first.empty() ? std::string(COMMON_PRESET_DEFAULT_NAME) : section.first;
if (section_name != "*" && section_name != COMMON_PRESET_DEFAULT_NAME) {
auto colon_idx = section_name.rfind(':');
if (colon_idx != std::string::npos) {
std::string tag = section_name.substr(colon_idx + 1);
std::string canon_tag = canonical_tag(tag);
if (canon_tag != tag) {
section_name = section_name.substr(0, colon_idx + 1) + canon_tag;
}
}
if (section.first.empty()) {
preset.name = COMMON_PRESET_DEFAULT_NAME;
} else {
preset.name = section.first;
}
preset.name = section_name;
LOG_DBG("loading preset: %s\n", preset.name.c_str());
for (const auto & [key, value] : section.second) {
if (key == "version") {
+1 -1
View File
@@ -60,7 +60,7 @@ struct common_preset_context {
std::set<std::string> allowed_keys;
// if only_remote_allowed is true, only accept whitelisted keys
common_preset_context(llama_example ex);
common_preset_context(llama_example ex, bool only_remote_allowed = false);
// load presets from INI file
common_presets load_from_ini(const std::string & path, common_preset & global) const;
+10 -10
View File
@@ -65,12 +65,12 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
if (ctx->start_matcher.advance(token)) {
ctx->state = REASONING_BUDGET_COUNTING;
ctx->remaining = ctx->budget;
COM_TRC("activated, budget=%d tokens\n", ctx->budget);
LOG_INF("reasoning-budget: activated, budget=%d tokens\n", ctx->budget);
if (ctx->remaining <= 0) {
ctx->state = REASONING_BUDGET_FORCING;
ctx->force_pos = 0;
COM_TRC("%s", "budget=0, forcing immediately\n");
LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
}
}
break;
@@ -80,7 +80,7 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
{
if (ctx->end_matcher.advance(token)) {
ctx->state = REASONING_BUDGET_DONE;
COM_TRC("%s", "deactivated (natural end)\n");
LOG_INF("reasoning-budget: deactivated (natural end)\n");
break;
}
@@ -95,7 +95,7 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
ctx->state = REASONING_BUDGET_FORCING;
ctx->force_pos = 0;
ctx->end_matcher.reset();
COM_TRC("%s", "UTF-8 complete, now forcing end sequence\n");
LOG_INF("reasoning-budget: UTF-8 complete, now forcing end sequence\n");
}
} else if (ctx->state == REASONING_BUDGET_COUNTING) {
ctx->remaining--;
@@ -104,11 +104,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
ctx->state = REASONING_BUDGET_FORCING;
ctx->force_pos = 0;
ctx->end_matcher.reset();
COM_TRC("%s", "budget exhausted, forcing end sequence\n");
LOG_INF("reasoning-budget: budget exhausted, forcing end sequence\n");
} else {
ctx->state = REASONING_BUDGET_WAITING_UTF8;
ctx->end_matcher.reset();
COM_TRC("%s", "budget exhausted, waiting for UTF-8 completion\n");
LOG_INF("reasoning-budget: budget exhausted, waiting for UTF-8 completion\n");
}
}
}
@@ -118,7 +118,7 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
ctx->force_pos++;
if (ctx->force_pos >= ctx->forced_tokens.size()) {
ctx->state = REASONING_BUDGET_DONE;
COM_TRC("%s", "forced sequence complete, done\n");
LOG_INF("reasoning-budget: forced sequence complete, done\n");
}
break;
case REASONING_BUDGET_DONE:
@@ -128,12 +128,12 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
ctx->state = REASONING_BUDGET_COUNTING;
ctx->remaining = ctx->budget;
ctx->end_matcher.reset();
COM_TRC("re-activated on new start tag, budget=%d tokens\n", ctx->budget);
LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
if (ctx->remaining <= 0) {
ctx->state = REASONING_BUDGET_FORCING;
ctx->force_pos = 0;
COM_TRC("%s", "budget=0, forcing immediately\n");
LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
}
}
break;
@@ -264,7 +264,7 @@ bool common_reasoning_budget_force(struct llama_sampler * smpl) {
ctx->state = REASONING_BUDGET_FORCING;
ctx->force_pos = 0;
ctx->end_matcher.reset();
COM_TRC("%s", "forced into forcing state (manual transition)\n");
LOG_INF("reasoning-budget: forced into forcing state (manual transition)\n");
return true;
}
+204
View File
@@ -0,0 +1,204 @@
#include "regex-partial.h"
#include "common.h"
#include <functional>
#include <optional>
// Stores `pattern` and compiles it twice: once as-is (`rx`) and once as the
// reversed partial-match pattern produced by regex_to_reversed_partial_regex()
// (`rx_reversed_partial`). Either std::regex construction or the pattern
// transformation may throw on an invalid pattern.
common_regex::common_regex(const std::string & pattern) :
pattern(pattern),
rx(pattern),
rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
// Matches the pattern against `input`, starting at byte offset `pos`.
//
// - as_match == false: searches for the pattern anywhere in [pos, end).
// - as_match == true:  the pattern must match the whole of [pos, end).
//
// Returns:
// - COMMON_REGEX_MATCH_TYPE_FULL with one range per regex group (group 0 is
//   the whole match), all expressed as offsets into `input`;
// - COMMON_REGEX_MATCH_TYPE_PARTIAL with a single range [begin, input.size())
//   when the end of the input could be the beginning of a match;
// - a default-constructed result (COMMON_REGEX_MATCH_TYPE_NONE) otherwise.
//
// Throws std::runtime_error if `pos` is past the end of `input`.
common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
    if (pos > input.size()) {
        throw std::runtime_error("Position out of bounds");
    }

    const auto start = input.begin() + pos;

    std::smatch match;
    const bool found = as_match
        ? std::regex_match(start, input.end(), match, rx)
        : std::regex_search(start, input.end(), match, rx);
    if (found) {
        common_regex_match res;
        res.type = COMMON_REGEX_MATCH_TYPE_FULL;
        res.groups.reserve(match.size());
        for (size_t i = 0; i < match.size(); ++i) {
            // match positions are relative to `start`, convert to offsets into `input`
            const size_t begin = pos + match.position(i);
            res.groups.emplace_back(begin, begin + match.length(i));
        }
        return res;
    }

    // No full match: run the reversed partial pattern over the reversed input
    // (restricted to [pos, end)) to see whether the input ends with a prefix
    // of a potential match.
    std::match_results<std::string::const_reverse_iterator> srmatch;
    if (std::regex_search(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial, std::regex_constants::match_continuous)) {
        if (srmatch[1].length() != 0) {
            // end of the reversed capture == start of the partial match in `input`
            const auto it = srmatch[1].second.base();
            // In as_match mode the partial match has to cover everything from
            // the search start, i.e. `pos` (not the beginning of the string),
            // mirroring the std::regex_match call above.
            if (!as_match || it == start) {
                common_regex_match res;
                res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
                const size_t begin = std::distance(input.begin(), it);
                const size_t end   = input.size();
                // common_string_range's constructor rejects begin > end
                res.groups.emplace_back(begin, end);
                return res;
            }
        }
    }
    return {};
}
/*
Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
to see if a string ends with a partial regex match, but it's not in std::regex yet.
Instead, we'll transform the regex into a partial match regex operating as a full match on the reverse iterators of the input.
- /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
- /a|b/ -> ^(a|b)
- /a*?/ -> error, could match ""
- /a*b/ -> ^((?:b)?a*+) (final repetitions become eager)
- /.*?ab/ -> ^((?:b)?a) (omit .*)
- /a.*?b/ -> ^((?:b)?.*?a) (keep reluctant matches)
- /a(bc)d/ -> ^((?:(?:d)?(?:(?:c)?b))?a)
- /a(bc|de)/ -> ^((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
- /ab{2,4}c/ -> ^cbbb?b?a -> ^((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a)
The regex will match a reversed string fully, and the end of the first (and only) capturing group will indicate the reversed start of the original partial pattern.
All other groups are turned into non-capturing groups, and reluctant quantifiers are ignored.
*/
// Converts `pattern` into a regex that, applied to the *reversed* input,
// matches any non-empty prefix of a string `pattern` could match. The result
// has the form "^(...)": one capturing group, with every group of the original
// pattern turned into a non-capturing group.
//
// Throws std::runtime_error on malformed patterns (unmatched '[', '{', '(' or
// ')', quantifier/repetition without a preceding element, invalid repetition
// range). Non-numeric repetition bounds propagate the std::stoi exception.
std::string regex_to_reversed_partial_regex(const std::string & pattern) {
    auto it = pattern.begin();
    const auto end = pattern.end();

    // Parses one alternation level (up to the closing ')' or the end of the
    // pattern) and returns its reversed, progressively-optional form.
    std::function<std::string()> process = [&]() {
        std::vector<std::vector<std::string>> alternatives(1);
        std::vector<std::string> * sequence = &alternatives.back();

        while (it != end) {
            if (*it == '[') {
                // character class: copied verbatim as a single element
                auto start = it;
                ++it;
                while (it != end && *it != ']') {
                    if (*it == '\\') {
                        ++it; // skip the backslash ...
                        if (it == end) {
                            break; // ... but never step past the end of the pattern
                        }
                    }
                    ++it;
                }
                if (it == end) {
                    throw std::runtime_error("Unmatched '[' in pattern");
                }
                ++it;
                sequence->push_back(std::string(start, it));
            } else if (*it == '*' || *it == '?' || *it == '+') {
                if (sequence->empty()) {
                    throw std::runtime_error("Quantifier without preceding element");
                }
                sequence->back() += *it;
                auto is_star = *it == '*';
                ++it;
                if (is_star) {
                    // drop the reluctant marker of `*?`
                    if (it != end && *it == '?') {
                        ++it;
                    }
                }
            } else if (*it == '{') {
                if (sequence->empty()) {
                    throw std::runtime_error("Repetition without preceding element");
                }
                ++it;
                auto start = it;
                while (it != end && *it != '}') {
                    ++it;
                }
                if (it == end) {
                    throw std::runtime_error("Unmatched '{' in pattern");
                }
                auto parts = string_split(std::string(start, it), ",");
                ++it;
                if (parts.empty() || parts.size() > 2) {
                    throw std::runtime_error("Invalid repetition range in pattern");
                }

                auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
                    if (s.empty()) {
                        return def;
                    }
                    return std::stoi(s);
                };
                auto min = parseOptInt(parts[0], 0);
                auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
                if (min && max && *max < *min) {
                    throw std::runtime_error("Invalid repetition range in pattern");
                }
                // Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
                auto part = sequence->back();
                sequence->pop_back();
                for (int i = 0; i < *min; i++) {
                    sequence->push_back(part);
                }
                if (max) {
                    for (int i = *min; i < *max; i++) {
                        sequence->push_back(part + "?");
                    }
                } else {
                    sequence->push_back(part + "*");
                }
            } else if (*it == '(') {
                ++it;
                if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
                    it += 2;
                }
                auto sub = process();
                // process() returns either at a ')' or at the end of the
                // pattern; the latter means the group was never closed
                if (it == end || *it != ')') {
                    throw std::runtime_error("Unmatched '(' in pattern");
                }
                ++it;
                auto & part = sequence->emplace_back("(?:");
                part += sub;
                part += ")";
            } else if (*it == ')') {
                // end of the current group: handled by the caller
                break;
            } else if (*it == '|') {
                ++it;
                alternatives.emplace_back();
                sequence = &alternatives.back();
            } else if (*it == '\\') {
                ++it;
                if (it != end) {
                    sequence->push_back(std::string("\\") + *it);
                    ++it;
                }
                // a lone trailing backslash is dropped
            } else {
                sequence->push_back(std::string(1, *it));
                ++it;
            }
        }

        // /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
        // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
        // We'll do the outermost capturing group and final .* in the enclosing function.
        std::vector<std::string> res_alts;
        for (const auto & parts : alternatives) {
            auto & res = res_alts.emplace_back();
            // starts at 1 so that an empty alternative opens no group
            // (parts.size() - 1 would wrap around for an empty vector)
            for (size_t i = 1; i < parts.size(); i++) {
                res += "(?:";
            }
            for (auto rit = parts.rbegin(); rit != parts.rend(); ++rit) {
                res += *rit;
                if (rit + 1 != parts.rend()) {
                    res += ")?";
                }
            }
        }
        return string_join(res_alts, "|");
    };

    auto res = process();
    if (it != end) {
        // the top-level call only stops early on a ')' that has no opening '('
        throw std::runtime_error("Unmatched ')' in pattern");
    }

    return "^(" + res + ")";
}
+56
View File
@@ -0,0 +1,56 @@
#pragma once
#include <regex>
#include <string>
// Outcome of a common_regex::search() call.
enum common_regex_match_type {
// default value of common_regex_match::type (nothing matched)
COMMON_REGEX_MATCH_TYPE_NONE,
// the end of the input matched the reversed partial pattern
COMMON_REGEX_MATCH_TYPE_PARTIAL,
// the pattern itself matched
COMMON_REGEX_MATCH_TYPE_FULL,
};
// Pair of offsets [begin, end) with the invariant begin <= end enforced at
// construction time.
struct common_string_range {
    size_t begin;
    size_t end;

    // Throws std::runtime_error when `from` is greater than `to`.
    common_string_range(size_t from, size_t to) : begin(from), end(to) {
        if (to < from) {
            throw std::runtime_error("Invalid range");
        }
    }

    // a range always needs explicit bounds
    common_string_range() = delete;

    // True when the range covers no characters.
    bool empty() const {
        return end == begin;
    }

    bool operator==(const common_string_range & other) const {
        if (begin != other.begin) {
            return false;
        }
        return end == other.end;
    }
};
// Result of a regex search: the kind of match plus the matched ranges.
struct common_regex_match {
    common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
    std::vector<common_string_range> groups;

    // Two results are equal when both the type and all ranges agree.
    bool operator==(const common_regex_match & other) const {
        if (type != other.type) {
            return false;
        }
        return groups == other.groups;
    }

    bool operator!=(const common_regex_match & other) const {
        return !operator==(other);
    }
};
// Regex wrapper that keeps the original pattern text alongside two compiled
// regexes, and exposes a search that can report full or partial matches.
class common_regex {
// pattern text as passed to the constructor (returned by str())
std::string pattern;
// regex compiled from the pattern
std::regex rx;
// second regex, applied to reversed input to detect partial matches
std::regex rx_reversed_partial;
public:
explicit common_regex(const std::string & pattern);
// Searches `input` from offset `pos`; `as_match` defaults to false.
common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;
// Returns the original pattern text.
const std::string & str() const { return pattern; }
};
// Returns the pattern string derived from `pattern` for partial matching.
// Exposed for testing only (pretty print of failures).
std::string regex_to_reversed_partial_regex(const std::string & pattern);
+40 -52
View File
@@ -259,9 +259,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
}
if (!grmr && !grammar_str.empty()) {
throw std::runtime_error("failed to parse grammar");
}
// Compute prefill tokens from the generation prompt
std::vector<llama_token> prefill_tokens;
@@ -772,63 +769,54 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
}
}
std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names) {
// sampler names can be written multiple ways; generate aliases from canonical names
static const auto sampler_name_map = []{
// canonical sampler name mapping
std::unordered_map<std::string, common_sampler_type> canonical_name_map {
{ "dry", COMMON_SAMPLER_TYPE_DRY },
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
{ "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P }
};
std::unordered_map<std::string, common_sampler_type> alias_name_map;
for (const auto & entry : canonical_name_map) {
const std::string & canonical = entry.first;
if (canonical.find('_') == std::string::npos) {
continue;
}
// kebab-case: "top-k", "min-p", etc.
{
std::string kebab_case = canonical;
std::replace(kebab_case.begin(), kebab_case.end(), '_', '-');
alias_name_map.insert({kebab_case, entry.second});
}
// no dash: "topk", "minp", etc.
{
std::string no_dash = canonical;
no_dash.erase(std::remove(no_dash.begin(), no_dash.end(), '_'), no_dash.end());
alias_name_map.insert({no_dash, entry.second});
}
}
// misc. aliases
alias_name_map.insert({"nucleus", COMMON_SAMPLER_TYPE_TOP_P});
alias_name_map.insert({"temp", COMMON_SAMPLER_TYPE_TEMPERATURE});
alias_name_map.insert({"typ", COMMON_SAMPLER_TYPE_TYPICAL_P});
// include aliases + canonical names in the complete mapping
alias_name_map.merge(canonical_name_map);
return alias_name_map;
}();
std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
{ "dry", COMMON_SAMPLER_TYPE_DRY },
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
{ "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
};
// since samplers names are written multiple ways
// make it ready for both system names and input names
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
};
std::vector<common_sampler_type> samplers;
samplers.reserve(names.size());
for (const auto & name : names) {
std::string name_lower = name;
std::transform(name_lower.begin(), name_lower.end(), name_lower.begin(), ::tolower);
auto sampler = sampler_name_map.find(name_lower);
if (sampler != sampler_name_map.end()) {
auto sampler = sampler_canonical_name_map.find(name);
if (sampler != sampler_canonical_name_map.end()) {
samplers.push_back(sampler->second);
continue;
}
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name_lower.c_str());
if (allow_alt_names) {
sampler = sampler_alt_name_map.find(name);
if (sampler != sampler_alt_name_map.end()) {
samplers.push_back(sampler->second);
continue;
}
}
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
}
return samplers;
+1 -1
View File
@@ -109,7 +109,7 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx,
char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names);
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+101 -1021
View File
File diff suppressed because it is too large Load Diff
-4
View File
@@ -68,10 +68,6 @@ void common_speculative_draft(common_speculative * spec);
// informs the speculative context that n_accepted tokens were accepted by the target model
void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);
// (optional) get/set internal state
bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data);
void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data);
// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);
-14
View File
@@ -40,18 +40,14 @@ TEXT_MODEL_MAP: dict[str, str] = {
"ChatGLMModel": "chatglm",
"CodeShellForCausalLM": "codeshell",
"CogVLMForCausalLM": "cogvlm",
"Cohere2MoeForCausalLM": "command_r",
"Cohere2ForCausalLM": "command_r",
"CohereForCausalLM": "command_r",
"DbrxForCausalLM": "dbrx",
"DeciLMForCausalLM": "deci",
"DeepseekForCausalLM": "deepseek",
"DeepseekOCRForCausalLM": "deepseek",
"DeepseekV2ForCausalLM": "deepseek",
"DeepseekV3ForCausalLM": "deepseek",
"DeepseekV32ForCausalLM": "deepseek",
"DFlashDraftModel": "qwen",
"DeepseekV4ForCausalLM": "deepseek",
"DistilBertForMaskedLM": "bert",
"DistilBertForSequenceClassification": "bert",
"DistilBertModel": "bert",
@@ -99,7 +95,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
"GraniteMoeHybridForCausalLM": "granite",
"GraniteMoeSharedForCausalLM": "granite",
"GraniteSpeechForConditionalGeneration": "granite",
"GraniteSpeechPlusForConditionalGeneration": "granite",
"Grok1ForCausalLM": "grok",
"GrokForCausalLM": "grok",
"GroveMoeForCausalLM": "grovemoe",
@@ -127,7 +122,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
"LLaDAModelLM": "llada",
"LLaMAForCausalLM": "llama",
"Lfm25AudioTokenizer": "lfm2",
"Lfm2BidirectionalModel": "lfm2",
"Lfm2ForCausalLM": "lfm2",
"Lfm2Model": "lfm2",
"Lfm2MoeForCausalLM": "lfm2",
@@ -136,10 +130,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
"LlamaBidirectionalModel": "llama",
"LlamaForCausalLM": "llama",
"LlamaModel": "llama",
"Eagle3DraftModel": "llama",
"Eagle3Speculator": "llama",
"Eagle3LlamaForCausalLM": "llama",
"LlamaForCausalLMEagle3": "llama",
"LlavaForConditionalGeneration": "llama",
"LlavaStableLMEpochForCausalLM": "stablelm",
"MPTForCausalLM": "mpt",
@@ -237,7 +227,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
"UMT5ForConditionalGeneration": "t5",
"UMT5Model": "t5",
"UltravoxModel": "ultravox",
"UnlimitedOCRForCausalLM": "deepseek",
"VLlama3ForCausalLM": "llama",
"VoxtralForConditionalGeneration": "llama",
"WavTokenizerDec": "wavtokenizer",
@@ -266,9 +255,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
"Glm4vMoeForConditionalGeneration": "qwen3vl",
"GlmOcrForConditionalGeneration": "qwen3vl",
"GlmasrModel": "ultravox",
"Granite4VisionForConditionalGeneration": "granite",
"GraniteSpeechForConditionalGeneration": "granite",
"GraniteSpeechPlusForConditionalGeneration": "granite",
"HunYuanVLForConditionalGeneration": "hunyuan",
"Idefics3ForConditionalGeneration": "smolvlm",
"InternVisionModel": "internvl",
@@ -304,7 +291,6 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
"StepVLForConditionalGeneration": "step3",
"Step3p7ForConditionalGeneration": "step3",
"UltravoxModel": "ultravox",
"UnlimitedOCRForCausalLM": "deepseek",
"VoxtralForConditionalGeneration": "ultravox",
"YoutuVLForConditionalGeneration": "youtuvl",
}
+1 -1
View File
@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
if (rope_dim := hparams.get("head_dim")) is None:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+4 -30
View File
@@ -94,7 +94,6 @@ class ModelBase:
metadata: gguf.Metadata
dir_model_card: Path
remote_hf_model_id: str | None
target_model_dir: Path | None
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
@@ -120,7 +119,6 @@ class ModelBase:
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
disable_mistral_community_chat_template: bool = False,
sentence_transformers_dense_modules: bool = False,
target_model_dir: Path | None = None,
fuse_gate_up_exps: bool = False,
fp8_as_q8: bool = False):
if type(self) is ModelBase or \
@@ -141,7 +139,6 @@ class ModelBase:
self.dry_run = dry_run
self.remote_hf_model_id = remote_hf_model_id
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
self.target_model_dir = target_model_dir
self.fuse_gate_up_exps = fuse_gate_up_exps
self._gate_exp_buffer: dict[int, Tensor] = {}
self._up_exp_buffer: dict[int, Tensor] = {}
@@ -1119,10 +1116,8 @@ class TextModel(ModelBase):
rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)
# Ensure global params are mirrored in rope_parameters
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
if local_rope_theta is not None:
self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@@ -1130,10 +1125,6 @@ class TextModel(ModelBase):
self.rope_parameters["rope_theta"] = rope_theta
if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
self.rope_parameters["rope_type"] = rope_type
if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings
@classmethod
def __init_subclass__(cls):
@@ -1201,7 +1192,7 @@ class TextModel(ModelBase):
self.gguf_writer.add_embedding_length(n_embd)
logger.info(f"gguf: embedding length = {n_embd}")
if (n_ff := self.find_hparam(["prefix_dense_intermediate_size", "intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff)
logger.info(f"gguf: feed forward length = {n_ff}")
@@ -1273,7 +1264,7 @@ class TextModel(ModelBase):
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
if (n_experts := self.find_hparam(["num_local_experts", "num_experts", "n_routed_experts"], optional=True)) is not None:
if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
self.gguf_writer.add_expert_count(n_experts)
logger.info(f"gguf: expert count = {n_experts}")
if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None:
@@ -1286,13 +1277,11 @@ class TextModel(ModelBase):
self.gguf_writer.add_expert_group_used_count(n_group_used)
logger.info(f"gguf: expert groups used count = {n_group_used}")
if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func", "expert_selection_fn"], optional=True)) is not None:
if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None:
if score_func == "sigmoid":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif score_func == "softmax":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
elif score_func == "sqrtsoftplus":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SQRTSOFTPLUS)
else:
raise ValueError(f"Unsupported expert score gating function value: {score_func}")
logger.info(f"gguf: expert score gating function = {score_func}")
@@ -1503,9 +1492,6 @@ class TextModel(ModelBase):
if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1":
# ref: https://huggingface.co/CohereLabs/tiny-aya-base
res = "tiny_aya"
if chkhsh == "52df12b4c8d4176e7481aab4b6e8454d1fd0a210a04a574f6d4e067d10e23c3e":
# ref: https://huggingface.co/CohereLabs/North-Mini-Code-1.0
res = "cohere2moe"
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
res = "qwen2"
@@ -2495,7 +2481,6 @@ class LazyTorchTensor(gguf.LazyBase):
torch.float16: np.float16,
torch.float32: np.float32,
torch.uint8: np.uint8,
torch.int64: np.int64,
}
# only used when byteswapping data. Only correct size is needed
@@ -2602,17 +2587,6 @@ class LazyTorchTensor(gguf.LazyBase):
return cls._wrap_fn(func)(*args, **kwargs)
# Register the "F8_E8M0" safetensors dtype string with LazyTorchTensor.
if not hasattr(torch, "float8_e8m0fnu"):
    # Older torch builds do not expose F8_E8M0. Keep the raw bytes so callers
    # that know the format can decode them explicitly.
    LazyTorchTensor._dtype_str_map["F8_E8M0"] = torch.uint8
else:
    # Native dtype is available: map the string to it and treat it as one
    # byte per element on the numpy side (plain and byteswap maps alike).
    _torch_float8_e8m0 = torch.float8_e8m0fnu
    LazyTorchTensor._dtype_str_map["F8_E8M0"] = _torch_float8_e8m0
    LazyTorchTensor._dtype_byteswap_map[_torch_float8_e8m0] = np.uint8
    LazyTorchTensor._dtype_map[_torch_float8_e8m0] = np.uint8
def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
# TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
# maybe we should fallback to text model's arch in that case, since not many models have both
+1 -1
View File
@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
rope_dim = self.hparams["attention_dim"]
else:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_add_bos_token(False)
rope_freq = 10000
if "rope_ratio" in self.hparams:
-120
View File
@@ -1,6 +1,5 @@
from __future__ import annotations
import re
from typing import Iterable, TYPE_CHECKING
import torch
@@ -56,122 +55,3 @@ class Cohere2Model(TextModel):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Cohere2MoeForCausalLM")
class Cohere2MoeModel(TextModel):
    """Converter for ``Cohere2MoeForCausalLM`` checkpoints.

    Per-expert FFN weights are buffered per layer and emitted as one stacked
    tensor per projection. Layers whose index is at or above
    ``num_hidden_layers`` are treated as NextN/MTP layers.
    """

    model_arch = gguf.MODEL_ARCH.COHERE2MOE

    # Number of regular (non-MTP) layers. index_tensors() stores it on both the
    # instance and the class because filter_tensors() is a classmethod.
    _n_main_layers: int | None = None

    _expert_tensor_re = re.compile(
        r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(down_proj|gate_proj|up_proj)\.weight"
    )
    # Compiled once instead of re-parsing the pattern for every tensor name.
    _layer_index_re = re.compile(r"model\.layers\.(\d+)\.")

    def __init__(self, *args, **kwargs):
        """Extend the block count with NextN layers (unless MTP is disabled)."""
        super().__init__(*args, **kwargs)
        if (n_nextn := self._n_nextn_layers()) > 0 and not self.no_mtp:
            self.block_count += n_nextn
            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        # One buffer per block: source tensor name -> expert weight.
        self._experts: list[dict[str, Tensor]] = [{} for _ in range(self.block_count)]

    def _n_nextn_layers(self) -> int:
        """Return ``num_nextn_predict_layers`` as an int, treating missing/null as 0."""
        return int(self.hparams.get("num_nextn_predict_layers", 0) or 0)

    def _set_vocab_gpt2(self) -> None:
        """Write a GPT-2 style vocabulary, loading merges into the special vocab."""
        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        """Write Cohere2 MoE hyper-parameters on top of the base ones.

        Raises:
            ValueError: if shared experts are present and
                ``shared_expert_combination_strategy`` is not ``"average"``.
        """
        hparams = self.hparams
        expert_intermediate_size = hparams["intermediate_size"]

        # Leading dense block count: the index of the first non-"dense" entry in
        # mlp_layer_types when that list exists, else first_k_dense_replace.
        mlp_layer_types = hparams.get("mlp_layer_types")
        n_dense_lead = hparams.get("first_k_dense_replace", 0)
        if mlp_layer_types is not None:
            n_dense_lead = next((i for i, t in enumerate(mlp_layer_types) if t != "dense"), len(mlp_layer_types))

        super().set_gguf_parameters()

        self.gguf_writer.add_logit_scale(hparams["logit_scale"])
        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
        self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
        self.gguf_writer.add_leading_dense_block_count(n_dense_lead)
        self.gguf_writer.add_expert_weights_norm(hparams.get("norm_topk_prob", False))

        # `or 0` keeps an explicit null in config.json from raising TypeError
        # in the comparison below.
        num_shared_experts = int(hparams.get("num_shared_experts", 0) or 0)
        if num_shared_experts > 0:
            if hparams.get("shared_expert_combination_strategy", "average") != "average":
                raise ValueError("Cohere2 MoE only supports average shared expert combination")
            self.gguf_writer.add_expert_shared_count(num_shared_experts)
            self.gguf_writer.add_expert_shared_feed_forward_length(expert_intermediate_size * num_shared_experts)

        # Same null-tolerant read as __init__, so both stay in agreement.
        if (n_nextn := self._n_nextn_layers()) > 0 and not self.no_mtp:
            self.gguf_writer.add_nextn_predict_layers(n_nextn)

        self.gguf_writer.add_rope_dimension_count(hparams["head_dim"])
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)

    def index_tensors(self, remote_hf_model_id: str | None = None):
        """Record the main layer count, then defer to the base implementation.

        ``text_config`` entries take precedence over top-level hparams when
        looking up ``num_hidden_layers``.
        """
        hparams = {**self.hparams, **self.hparams.get("text_config", {})}
        self._n_main_layers = hparams.get("num_hidden_layers")
        type(self)._n_main_layers = self._n_main_layers
        return super().index_tensors(remote_hf_model_id=remote_hf_model_id)

    @classmethod
    def filter_tensors(cls, item):
        """Drop MTP tensors under ``no_mtp``; keep only MTP + shared tensors under ``mtp_only``.

        Returns the ``(name, generator)`` pair, or ``None`` when the tensor is skipped.
        """
        if (titem := super().filter_tensors(item)) is None:
            return None
        name, gen = titem
        if cls._n_main_layers is not None:
            m = cls._layer_index_re.match(name)
            is_mtp = m is not None and int(m.group(1)) >= cls._n_main_layers
            if is_mtp and cls.no_mtp:
                return None
            if cls.mtp_only and not is_mtp and name not in (
                "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
            ):
                return None
        return name, gen

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Skip all-zero biases and merge per-expert weights into stacked tensors.

        Raises:
            ValueError: if a ``.bias`` tensor contains a non-zero value.
        """
        if name.endswith(".bias"):
            if torch.any(data_torch != 0):
                raise ValueError(f"Bias tensor {name!r} is not zero.")
            logger.debug(f"Skipping bias tensor {name!r}.")
            return

        if (m := self._expert_tensor_re.fullmatch(name)) is not None:
            n_experts = self.hparams["num_experts"]
            layer_idx = int(m.group(1))
            assert bid is None or bid == layer_idx
            self._experts[layer_idx][name] = data_torch

            # Emit only once every expert's three projections for this layer
            # have been buffered.
            expected = {
                f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
                for xid in range(n_experts)
                for w_name in ("down_proj", "gate_proj", "up_proj")
            }
            if expected.issubset(self._experts[layer_idx]):
                for w_name in ["down_proj", "gate_proj", "up_proj"]:
                    datas: list[Tensor] = []
                    for xid in range(n_experts):
                        ename = f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight"
                        datas.append(self._experts[layer_idx][ename])
                        del self._experts[layer_idx][ename]
                    data_torch = torch.stack(datas, dim=0)
                    merged_name = f"model.layers.{layer_idx}.mlp.experts.{w_name}.weight"
                    yield from super().modify_tensors(data_torch, merged_name, layer_idx)
            # Per-expert tensors are never forwarded under their own name.
            return

        yield from super().modify_tensors(data_torch, name, bid)

    def prepare_tensors(self):
        """Run the base preparation, then fail if any expert tensor was left unmerged.

        Raises:
            ValueError: if buffered expert tensors remain after conversion.
        """
        super().prepare_tensors()
        experts = [k for d in self._experts for k in d.keys()]
        if len(experts) > 0:
            raise ValueError(f"Unprocessed experts: {experts}")
+1 -1
View File
@@ -161,7 +161,7 @@ class DeciModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
+3 -318
View File
@@ -1,23 +1,20 @@
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Any, Callable, Iterable, TYPE_CHECKING
import numpy as np
import torch
if TYPE_CHECKING:
from torch import Tensor
from .base import LazyTorchTensor, MmprojModel, ModelBase, TextModel, gguf, logger
from .base import MmprojModel, ModelBase, TextModel, gguf, logger
from .qwen import QwenModel
@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -208,8 +205,6 @@ class DeepseekModel(TextModel):
@ModelBase.register(
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"DeepseekOCRForCausalLM",
"UnlimitedOCRForCausalLM",
"KimiVLForConditionalGeneration",
"KimiK25ForConditionalGeneration",
"YoutuForCausalLM",
@@ -229,7 +224,7 @@ class DeepseekV2Model(TextModel):
self.origin_hf_arch = hparams.get('architectures', [None])[0]
# special handling for Deepseek OCR
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
@@ -355,12 +350,6 @@ class DeepseekV2Model(TextModel):
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
# Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
if is_ocr:
sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
if sliding_window:
self.gguf_writer.add_sliding_window(sliding_window)
if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
@@ -470,307 +459,3 @@ class DeepseekV32Model(DeepseekV2Model):
self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
@ModelBase.register("DeepseekV4ForCausalLM")
class DeepseekV4Model(TextModel):
    """Converter for ``DeepseekV4ForCausalLM`` checkpoints.

    Tensors prefixed ``mtp.`` are skipped, FP8 weights that have a matching
    ``.scale`` tensor are dequantized, routed-expert weights are repacked into
    MXFP4 blocks and written directly, and hash-routing tables are written as
    int32 tensors.
    """

    model_arch = gguf.MODEL_ARCH.DEEPSEEK4
    # Class-level counter incremented by filter_tensors() (a classmethod);
    # reset at the start of every __init__.
    _skipped_mtp_tensors = 0

    def __init__(self, *args, **kwargs):
        """Initialise the converter and collect per-tensor source dtypes."""
        type(self)._skipped_mtp_tensors = 0
        super().__init__(*args, **kwargs)

        # Fill in any config.json key that is not already in self.hparams;
        # existing values are left untouched (setdefault).
        with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
            raw_hparams = json.load(f)
        for key, value in raw_hparams.items():
            self.hparams.setdefault(key, value)

        self.block_count = self.hparams["num_hidden_layers"]
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        # Names of weights replaced by a dequantizing closure in dequant_model().
        self._dsv4_fp8_dequantized: set[str] = set()
        # Names of tensors whose source dtype is bfloat16 / float32.
        self._dsv4_bf16_tensors: set[str] = set()
        self._dsv4_f32_tensors: set[str] = set()
        # Guards generate_extra_tensors() so it writes its tensors only once.
        self._dsv4_mxfp4_generated = False
        self._collect_source_dtypes()
        if type(self)._skipped_mtp_tensors:
            logger.info("Skipping %d DeepSeek-V4 MTP tensor(s) for conversion v0", type(self)._skipped_mtp_tensors)

        # add a default chat template; if the model has a built-in template, it will be overridden later
        template_path = Path(__file__).parent.parent / "models" / "templates" / "deepseek-ai-DeepSeek-V4.jinja"
        if template_path.is_file():
            with open(template_path, "r", encoding="utf-8") as f:
                self.gguf_writer.add_chat_template(f.read())

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Drop (and count) tensors whose name starts with ``mtp.``; defer the rest to the base class."""
        name, _ = item
        if name.startswith("mtp."):
            cls._skipped_mtp_tensors += 1
            return None
        return super().filter_tensors(item)

    @staticmethod
    def _float8_dtypes() -> tuple[torch.dtype, ...]:
        """Return the ``float8_e4m3fn`` / ``float8_e5m2`` dtypes that this torch build exposes."""
        return tuple(
            dtype for dtype in (
                getattr(torch, "float8_e4m3fn", None),
                getattr(torch, "float8_e5m2", None),
            ) if dtype is not None
        )

    @staticmethod
    def _e8m0_to_float(scale: Tensor) -> Tensor:
        """Convert an E8M0 scale tensor to float32.

        Uses torch's own conversion when the tensor already has the
        ``float8_e8m0fnu`` dtype; otherwise reinterprets the bytes as uint8 and
        computes ``2 ** (byte - 127)``.
        """
        torch_float8_e8m0 = getattr(torch, "float8_e8m0fnu", None)
        if torch_float8_e8m0 is not None and scale.dtype == torch_float8_e8m0:
            return scale.float()
        bits = scale.view(torch.uint8).float()
        return torch.exp2(bits - 127.0)

    def _collect_source_dtypes(self) -> None:
        """Record which source tensors are bfloat16 and which are float32."""
        for name, gen in self.model_tensors.items():
            dtype = gen().dtype
            if dtype == torch.bfloat16:
                self._dsv4_bf16_tensors.add(name)
            elif dtype == torch.float32:
                self._dsv4_f32_tensors.add(name)

    def set_gguf_parameters(self):
        """Write DeepSeek-V4 hyper-parameters on top of the base ones.

        Every key read with ``hparams[...]`` below is required; a missing key
        raises ``KeyError``.
        """
        super().set_gguf_parameters()
        hparams = self.hparams

        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
        self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
        self.gguf_writer.add_sliding_window(hparams["sliding_window"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
        # The single swiglu_limit value is replicated once per block.
        self.gguf_writer.add_swiglu_clamp_exp([hparams["swiglu_limit"]] * self.block_count)
        self.gguf_writer.add_swiglu_clamp_shexp([hparams["swiglu_limit"]] * self.block_count)
        self.gguf_writer.add_indexer_head_count(hparams["index_n_heads"])
        self.gguf_writer.add_indexer_key_length(hparams["index_head_dim"])
        self.gguf_writer.add_indexer_top_k(hparams["index_topk"])
        self.gguf_writer.add_attention_output_group_count(hparams["o_groups"])
        self.gguf_writer.add_attention_output_lora_rank(hparams["o_lora_rank"])
        self.gguf_writer.add_attention_compress_ratios(hparams["compress_ratios"])
        self.gguf_writer.add_attention_compress_rope_freq_base(hparams["compress_rope_theta"])
        self.gguf_writer.add_hyper_connection_count(hparams["hc_mult"])
        self.gguf_writer.add_hyper_connection_sinkhorn_iterations(hparams["hc_sinkhorn_iters"])
        self.gguf_writer.add_hyper_connection_epsilon(hparams["hc_eps"])
        self.gguf_writer.add_hash_layer_count(hparams["num_hash_layers"])

    def dequant_model(self):
        """Replace FP8 weights that have a sibling ``.scale`` tensor with dequantizing closures.

        Only ``X.scale`` entries whose ``X.weight`` exists and has a float8
        dtype are handled; their scale tensors are removed afterwards. Other
        ``.scale`` tensors stay in ``self.model_tensors``.
        """
        fp8_dtypes = self._float8_dtypes()
        tensors_to_remove: list[str] = []

        def dequant_fp8_weight(weight: Tensor, scale: Tensor) -> Tensor:
            # Each scale entry is repeated 128 times along both axes and then
            # cropped to the weight shape, i.e. one scale per 128x128 tile.
            out_features, in_features = weight.shape
            scale_f = self._e8m0_to_float(scale)
            scale_f = scale_f.repeat_interleave(128, 0)[:out_features]
            scale_f = scale_f.repeat_interleave(128, 1)[:, :in_features]
            return weight.float() * scale_f

        # Iterate over a snapshot of the keys because entries are reassigned below.
        for name in list(self.model_tensors.keys()):
            if not name.endswith(".scale"):
                continue
            weight_name = name.removesuffix(".scale") + ".weight"
            if weight_name not in self.model_tensors:
                continue

            weight = self.model_tensors[weight_name]
            scale = self.model_tensors[name]
            if weight().dtype not in fp8_dtypes:
                continue

            # Default arguments bind this iteration's generators to the closure.
            self.model_tensors[weight_name] = lambda w=weight, s=scale: dequant_fp8_weight(w(), s())
            self._dsv4_fp8_dequantized.add(weight_name)
            tensors_to_remove.append(name)

        for name in tensors_to_remove:
            del self.model_tensors[name]

    @staticmethod
    def _pack_mxfp4_blocks(weight: Tensor, scale: Tensor) -> np.ndarray:
        """Repack one expert's packed 4-bit weight and per-block scales into raw MXFP4 rows.

        Both inputs are reinterpreted as uint8. Each 32-value block becomes
        17 bytes (1 scale byte followed by 16 nibble-packed bytes); the result
        has shape ``(out_features, n_blocks * 17)``.

        Raises:
            ValueError: if the logical row length is not a multiple of 32, or
                the scale shape is not ``(out_features, n_blocks)``.
        """
        packed = weight.contiguous().view(torch.uint8)
        scale_u8 = scale.contiguous().view(torch.uint8)
        out_features, packed_cols = packed.shape
        # Two 4-bit values per source byte.
        logical_cols = packed_cols * 2
        if logical_cols % 32 != 0:
            raise ValueError(f"MXFP4 source row has {logical_cols} values, expected a multiple of 32")
        n_blocks = logical_cols // 32
        if tuple(scale_u8.shape) != (out_features, n_blocks):
            raise ValueError(f"MXFP4 scale shape {tuple(scale_u8.shape)} does not match {(out_features, n_blocks)}")

        src = packed.reshape(out_features, n_blocks, 16)
        low = src & 0x0F
        high = (src >> 4) & 0x0F
        # The safetensors bytes store adjacent values as low/high nibbles.
        # ggml MXFP4 blocks store values 0..15 in low nibbles and 16..31 in high nibbles.
        vals = torch.stack((low, high), dim=-1).reshape(out_features, n_blocks, 32)
        qs = vals[:, :, :16] | (vals[:, :, 16:] << 4)
        raw = torch.cat((scale_u8.unsqueeze(-1), qs.to(torch.uint8)), dim=-1)
        return raw.reshape(out_features, n_blocks * 17).cpu().numpy()

    def _write_mxfp4_expert_tensor(self, bid: int, proj: str, tensor_key: gguf.MODEL_TENSOR) -> list[str]:
        """Pack projection ``proj`` of every routed expert in block ``bid`` and write it as one MXFP4 tensor.

        Returns the source tensor names (weights and scales) that were consumed.

        Raises:
            KeyError: if any expert's weight or scale tensor is missing.
        """
        n_experts = self.hparams["n_routed_experts"]
        data: np.ndarray | None = None
        consumed: list[str] = []

        for eid in range(n_experts):
            weight_name = f"layers.{bid}.ffn.experts.{eid}.{proj}.weight"
            scale_name = f"layers.{bid}.ffn.experts.{eid}.{proj}.scale"
            if weight_name not in self.model_tensors or scale_name not in self.model_tensors:
                raise KeyError(f"Missing routed expert tensors for {weight_name}")

            weight = LazyTorchTensor.to_eager(self.model_tensors[weight_name]())
            scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
            packed = self._pack_mxfp4_blocks(weight, scale)
            # The output buffer is allocated once the first expert's packed shape is known.
            if data is None:
                data = np.empty((n_experts, *packed.shape), dtype=packed.dtype)
            data[eid] = packed
            consumed.extend((weight_name, scale_name))

        assert data is not None
        new_name = self.format_tensor_name(tensor_key, bid)
        shape = gguf.quant_shape_from_byte_shape(data.shape, gguf.GGMLQuantizationType.MXFP4)
        logger.info(f"{new_name}: repacked routed experts to MXFP4, shape = {{{', '.join(str(n) for n in reversed(shape))}}}")
        self.gguf_writer.add_tensor(new_name, data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
        return consumed

    def _write_hash_routing_tensors(self) -> list[str]:
        """Write ``ffn.gate.tid2eid`` of the first ``num_hash_layers`` blocks as int32 tensors.

        Returns the source tensor names that were consumed.

        Raises:
            KeyError: if a hash routing tensor is missing.
        """
        consumed: list[str] = []
        for bid in range(self.hparams["num_hash_layers"]):
            name = f"layers.{bid}.ffn.gate.tid2eid"
            if name not in self.model_tensors:
                raise KeyError(f"Missing hash routing tensor {name}")
            data_torch = LazyTorchTensor.to_eager(self.model_tensors[name]())
            data = data_torch.to(torch.int32).cpu().numpy()
            new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_TID2EID, bid, ".weight")
            logger.info(f"{new_name}: converted hash routing table to I32, shape = {{{', '.join(str(n) for n in reversed(data.shape))}}}")
            self.gguf_writer.add_tensor(new_name, data)
            consumed.append(name)
        return consumed

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        """Write hash-routing and MXFP4 expert tensors directly to the writer (first call only).

        Always returns an empty tuple: the tensors are added through
        ``self.gguf_writer`` rather than yielded, and the consumed source
        entries are removed from ``self.model_tensors``.
        """
        if self._dsv4_mxfp4_generated:
            return ()

        consumed: list[str] = self._write_hash_routing_tensors()
        for bid in range(self.block_count):
            # w1 -> gate, w2 -> down, w3 -> up.
            consumed.extend(self._write_mxfp4_expert_tensor(bid, "w1", gguf.MODEL_TENSOR.FFN_GATE_EXP))
            consumed.extend(self._write_mxfp4_expert_tensor(bid, "w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP))
            consumed.extend(self._write_mxfp4_expert_tensor(bid, "w3", gguf.MODEL_TENSOR.FFN_UP_EXP))

        for name in consumed:
            del self.model_tensors[name]

        self._dsv4_mxfp4_generated = True
        return ()

    def _format_dsv4_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> str:
        """Thin wrapper around ``format_tensor_name``."""
        return self.format_tensor_name(key, bid, suffix)

    def _map_dsv4_tensor_name(self, name: str, bid: int | None) -> tuple[gguf.MODEL_TENSOR, str]:
        """Map a source tensor name to its ``(MODEL_TENSOR, suffix)`` pair.

        Raises:
            ValueError: if the name is neither a known root tensor nor a known
                ``layers.<n>.…`` tensor, or if ``bid`` differs from the layer
                index parsed from the name.
        """
        # Tensors that live outside any layer.
        root_map: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
            "embed.weight": (gguf.MODEL_TENSOR.TOKEN_EMBD, ".weight"),
            "norm.weight": (gguf.MODEL_TENSOR.OUTPUT_NORM, ".weight"),
            "head.weight": (gguf.MODEL_TENSOR.OUTPUT, ".weight"),
            "hc_head_fn": (gguf.MODEL_TENSOR.HC_HEAD_FN, ".weight"),
            "hc_head_base": (gguf.MODEL_TENSOR.HC_HEAD_BASE, ".weight"),
            "hc_head_scale": (gguf.MODEL_TENSOR.HC_HEAD_SCALE, ".weight"),
        }
        if name in root_map:
            return root_map[name]

        match = re.match(r"layers\.(\d+)\.(.+)$", name)
        if match is None:
            raise ValueError(f"Unsupported DeepSeek-V4 tensor {name!r}")

        layer = int(match.group(1))
        if bid != layer:
            raise ValueError(f"Tensor {name!r} parsed bid {bid} but layer name has {layer}")

        # Per-layer tensors, keyed by the part of the name after "layers.<n>.".
        layer_map: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
            "hc_attn_fn": (gguf.MODEL_TENSOR.HC_ATTN_FN, ".weight"),
            "hc_attn_base": (gguf.MODEL_TENSOR.HC_ATTN_BASE, ".weight"),
            "hc_attn_scale": (gguf.MODEL_TENSOR.HC_ATTN_SCALE, ".weight"),
            "hc_ffn_fn": (gguf.MODEL_TENSOR.HC_FFN_FN, ".weight"),
            "hc_ffn_base": (gguf.MODEL_TENSOR.HC_FFN_BASE, ".weight"),
            "hc_ffn_scale": (gguf.MODEL_TENSOR.HC_FFN_SCALE, ".weight"),
            "attn.attn_sink": (gguf.MODEL_TENSOR.ATTN_SINKS, ".weight"),
            "attn.wq_a.weight": (gguf.MODEL_TENSOR.ATTN_Q_A, ".weight"),
            "attn.wq_b.weight": (gguf.MODEL_TENSOR.ATTN_Q_B, ".weight"),
            "attn.q_norm.weight": (gguf.MODEL_TENSOR.ATTN_Q_A_NORM, ".weight"),
            "attn.wkv.weight": (gguf.MODEL_TENSOR.ATTN_KV, ".weight"),
            "attn.kv_norm.weight": (gguf.MODEL_TENSOR.ATTN_KV_NORM, ".weight"),
            "attn.wo_a.weight": (gguf.MODEL_TENSOR.ATTN_OUT_A, ".weight"),
            "attn.wo_b.weight": (gguf.MODEL_TENSOR.ATTN_OUT_B, ".weight"),
            "attn.compressor.ape": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_APE, ".weight"),
            "attn.compressor.wkv.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_WKV, ".weight"),
            "attn.compressor.wgate.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_WGATE, ".weight"),
            "attn.compressor.norm.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_NORM, ".weight"),
            "attn.indexer.wq_b.weight": (gguf.MODEL_TENSOR.INDEXER_ATTN_Q_B, ".weight"),
            "attn.indexer.weights_proj.weight": (gguf.MODEL_TENSOR.INDEXER_PROJ, ".weight"),
            "attn.indexer.compressor.ape": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_APE, ".weight"),
            "attn.indexer.compressor.wkv.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_WKV, ".weight"),
            "attn.indexer.compressor.wgate.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_WGATE, ".weight"),
            "attn.indexer.compressor.norm.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_NORM, ".weight"),
            "attn_norm.weight": (gguf.MODEL_TENSOR.ATTN_NORM, ".weight"),
            "ffn_norm.weight": (gguf.MODEL_TENSOR.FFN_NORM, ".weight"),
            "ffn.gate.weight": (gguf.MODEL_TENSOR.FFN_GATE_INP, ".weight"),
            "ffn.gate.bias": (gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, ".bias"),
            "ffn.gate.tid2eid": (gguf.MODEL_TENSOR.FFN_GATE_TID2EID, ".weight"),
            "ffn.shared_experts.w1.weight": (gguf.MODEL_TENSOR.FFN_GATE_SHEXP, ".weight"),
            "ffn.shared_experts.w2.weight": (gguf.MODEL_TENSOR.FFN_DOWN_SHEXP, ".weight"),
            "ffn.shared_experts.w3.weight": (gguf.MODEL_TENSOR.FFN_UP_SHEXP, ".weight"),
        }

        tensor_name = match.group(2)
        if tensor_name in layer_map:
            return layer_map[tensor_name]

        # NOTE(review): modify_tensors() returns before calling this method for
        # routed-expert names, so this branch looks unreachable from there — confirm.
        if re.match(r"ffn\.experts\.\d+\.w[123]\.(weight|scale)$", tensor_name):
            return gguf.MODEL_TENSOR.FFN_GATE_EXP, ".weight"

        raise ValueError(f"Unsupported DeepSeek-V4 tensor {name!r}")

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename a tensor to its GGUF name.

        Routed-expert weights/scales and ``tid2eid`` tables yield nothing here;
        those are written by generate_extra_tensors().
        """
        if re.match(r"layers\.\d+\.ffn\.experts\.\d+\.w[123]\.(weight|scale)$", name):
            return []

        tensor_key, suffix = self._map_dsv4_tensor_name(name, bid)
        if tensor_key == gguf.MODEL_TENSOR.FFN_GATE_TID2EID:
            return []

        return [(self._format_dsv4_tensor_name(tensor_key, bid, suffix), data_torch)]

    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
        """Choose an output type from how the source tensor was stored.

        Dequantized FP8 weights with at least 2 dims -> Q8_0; source float32
        tensors -> F32; source bfloat16 tensors with at least 2 dims -> BF16.
        Otherwise returns ``False`` (presumably "no forced type" — confirm
        against the base class).
        """
        del new_name, bid  # unused

        if name in self._dsv4_fp8_dequantized and n_dims >= 2:
            return gguf.GGMLQuantizationType.Q8_0
        if name in self._dsv4_f32_tensors:
            return gguf.GGMLQuantizationType.F32
        if name in self._dsv4_bf16_tensors and n_dims >= 2:
            return gguf.GGMLQuantizationType.BF16
        return False

    def prepare_tensors(self):
        """Run the base tensor preparation, then mark the output as MXFP4 MoE."""
        super().prepare_tensors()
        # NOTE(review): these are set only after the base call; presumably read
        # later when the file type is written — confirm against the base class.
        self._is_mxfp4 = True
        self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE
+3 -3
View File
@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):
assert (hparams["activation_function"] == "silu")
rotary_factor = self.rope_parameters.get("partial_rotary_factor")
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
factor = rope_params.get("factor", 16.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
+5 -16
View File
@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
self.gguf_writer.add_head_count_kv(value_arr)
# handle n_rot differently for global vs swa layers
partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
self.gguf_writer.add_rope_dimension_count(n_rot_full)
@@ -789,16 +789,6 @@ class Gemma4UnifiedModel(Gemma4Model):
class Gemma4AssistantModel(Gemma4Model):
model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
    """Skip any tensor whose name contains ``masked_embedding``; defer all others to the parent class."""
    name, _ = item
    if "masked_embedding" not in name:
        return super().filter_tensors(item)

    logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
    return None
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])
@@ -832,11 +822,10 @@ class Gemma4VisionAudioModel(MmprojModel):
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
# audio params
if self.has_audio_encoder:
assert self.hparams_audio is not None
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
assert self.hparams_audio is not None
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
def is_audio_tensor(self, name: str) -> bool:
return "audio_tower" in name or "embed_audio" in name
+2 -2
View File
@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
)
self.gguf_writer.add_rope_dimension_count(
int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
)
# MoE parameters - Use only routed expert count (shared experts handled separately)
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
super().set_gguf_parameters()
rope_dim = self.hparams["qk_rope_head_dim"]
partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))
# NextN/MTP prediction layers
+4 -182
View File
@@ -1,6 +1,5 @@
from __future__ import annotations
import re
from typing import Any, Callable, Iterable, TYPE_CHECKING
import torch
@@ -14,7 +13,7 @@ from .llama import LlamaModel
from .mamba import Mamba2Model
@ModelBase.register("GraniteForCausalLM")
@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
class GraniteModel(LlamaModel):
"""Conversion for IBM's GraniteForCausalLM"""
model_arch = gguf.MODEL_ARCH.GRANITE
@@ -47,29 +46,11 @@ class GraniteModel(LlamaModel):
self.gguf_writer.add_logit_scale(logits_scale)
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
# If being used as the base for Granite4 Vision, add deepstack_layer_arr
if self.hparams.get("spatial_target_layers") or self.hparams.get("deepstack_layer_map"):
normalized_projector_map = Granite4VisionMmprojModel.get_normalized_projector_map(self.hparams)
deepstack_mapping_arr = [-1 for _ in range(self.block_count)] # Populate with -1 sentinels
for proj_idx, (_, llm_layer, _, _) in enumerate(normalized_projector_map):
# Skip the first projector which is handled as the base embedding
# stream like normal
if proj_idx == 0:
continue
deepstack_mapping_arr[llm_layer] = proj_idx
self.gguf_writer.add_deepstack_mapping(deepstack_mapping_arr)
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
# Skip multimodal tensors
if (
name.startswith(("encoder."))
or "image_" in name
or "layerwise_projectors" in name
or "spatial_projectors" in name
):
return
if name.startswith("encoder."):
return None
return super().filter_tensors(item)
@@ -260,8 +241,7 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
def set_vocab(self):
# For models with no ssm layers, don't pad for mamba2
self.hparams["pad_vocab_size_multiple"] = 8 if self._ssm_layers else 1
self.hparams["pad_vocab_size_multiple"] = 8
Mamba2Model.set_vocab(self)
@@ -346,161 +326,3 @@ class GraniteSpeechMmprojModel(MmprojModel):
data_torch = data_torch.squeeze(1)
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
    """Conversion for GraniteSpeechPlus.

    Builds on the GraniteSpeech mmproj conversion and additionally records
    which encoder hidden layers are concatenated into the projector input.
    """
    has_vision_encoder = False
    has_audio_encoder = True

    def set_gguf_parameters(self):
        """Write the base audio parameters, then the optional feature-layer list.

        Raises:
            ValueError: if ``cat_hidden_layers`` is set and the projector's
                ``encoder_hidden_size`` differs from
                ``hidden_dim * (len(cat_hidden_layers) + 1)``.
        """
        assert self.hparams_audio is not None
        super().set_gguf_parameters()

        # Nothing more to do when the encoder config lists no feature layers
        feature_layers = self.hparams_audio.get("cat_hidden_layers")
        if not feature_layers:
            return

        self.gguf_writer.add_audio_feature_layers(feature_layers)
        logger.info(f"gguf: audio feature_layers = {feature_layers}")

        # The projector consumes the concatenation of the listed hidden layers
        # plus one more encoder output, so its input width must match.
        hidden_dim = self.hparams_audio["hidden_dim"]
        expected_dim = hidden_dim * (len(feature_layers) + 1)
        projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
        if projector_dim == expected_dim:
            return

        raise ValueError(
            f"Projector encoder_hidden_size ({projector_dim}) does not match "
            f"expected concatenated dimension ({expected_dim}). "
            f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
        )
@ModelBase.register("Granite4VisionForConditionalGeneration")
class Granite4VisionMmprojModel(MmprojModel):
    """Conversion of the Granite 4 Vision encoder and its stacked projectors."""
    has_vision_encoder = True
    has_audio_encoder = False

    @staticmethod
    def get_normalized_projector_map(global_config: dict) -> list[tuple[int, int, str, int]]:
        """Normalize both deepstack and spatial projector maps to the form:
        (vision_layer, llm_layer, <type>, type_index)

        This is then used to populate the following mappings:
        - vision_feature_layers (mmproj hparam): ordered list of all
          vision_layer values where order corresponds with the order of the
          stacked projector tensors
          NOTE: Values may appear multiple times for spatial projectors
        - tensor_prefix_map (mmproj tensors): mapping from tensor prefixes to
          the index of the corresponding projector in the stacked tensors
        - deepstack_layer_arr (llm hparam): per-text-layer array indicating
          which input vision feature should be injected at that layer
          (-1 if none)

        Negative layer indices (vision or LLM, deepstack or spatial) are
        resolved against the respective layer count so that the final ordering
        by LLM layer reflects true layer positions.

        Args:
            global_config: full model config; must contain
                ``text_config.num_hidden_layers`` and
                ``vision_config.num_hidden_layers``.

        Returns:
            List of ``(vision_layer, llm_layer, <type>, type_index)`` tuples
            sorted by ``llm_layer`` (stable for equal layers).
        """
        deepstack_map = global_config.get("deepstack_layer_map", [])  # [[vis_layer, llm_layer], ...]
        spatial_layers = global_config.get("spatial_target_layers", [])  # [llm_layer, ...]
        n_text_layers = global_config["text_config"]["num_hidden_layers"]
        n_vision_layers = global_config["vision_config"]["num_hidden_layers"]

        normalized_projector_map = []
        if deepstack_map:
            for deepstack_idx, (vision_layer, llm_layer) in enumerate(sorted(deepstack_map)):
                if vision_layer < 0:
                    vision_layer = n_vision_layers + vision_layer
                if llm_layer < 0:
                    llm_layer = n_text_layers + llm_layer
                normalized_projector_map.append((vision_layer, llm_layer, "layerwise", deepstack_idx))
        if spatial_layers:
            spatial_vision_layer = global_config.get("spatial_vision_layer", -1)
            if spatial_vision_layer < 0:
                spatial_vision_layer = n_vision_layers + spatial_vision_layer
            for spatial_idx, llm_layer in enumerate(spatial_layers):
                # Resolve negative LLM layers exactly like the deepstack entries,
                # otherwise they would sort ahead of every real layer index.
                if llm_layer < 0:
                    llm_layer = n_text_layers + llm_layer
                normalized_projector_map.append((spatial_vision_layer, llm_layer, "spatial", spatial_idx))
        return sorted(normalized_projector_map, key=lambda entry: entry[1])

    def __init__(self, *args, **kwargs):
        """Precompute the projector ordering used by the parameter and tensor writers."""
        super().__init__(*args, **kwargs)
        normalized_projector_map = self.get_normalized_projector_map(self.global_config)
        self._n_proj = len(normalized_projector_map)
        # HF tensor-name prefix of each projector -> its index in the stacked order
        self._tensor_prefix_map = {
            f"model.{proj_type}_projectors.{type_idx}": proj_idx
            for proj_idx, (_, _, proj_type, type_idx) in enumerate(normalized_projector_map)
        }
        self._vision_feature_layers = [vision_layer for vision_layer, _, _, _ in normalized_projector_map]
        # Per projector: its index among the spatial projectors, or -1 if layerwise
        self._spatial_offsets = [
            type_idx if proj_type == "spatial" else -1
            for _, _, proj_type, type_idx in normalized_projector_map
        ]

    def set_gguf_parameters(self):
        """Write the vision encoder, preprocessor and projector metadata.

        Raises:
            ValueError: if ``downsample_rate`` is not of the form ``"<int>/<int>"``.
        """
        assert self.hparams_vision is not None
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE4_VISION)

        # SigLIP encoder hparams
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)

        # Preprocessor
        self.gguf_writer.add_vision_preproc_image_size(self.hparams.get("image_size", 384))

        # QFormer projector config
        ds_rate = self.global_config["downsample_rate"]
        ds_parts = ds_rate.split("/")
        # Explicit check instead of `assert`, which is stripped under `python -O`
        if len(ds_parts) != 2:
            raise ValueError(f"Invalid 'downsample_rate' value: {ds_rate}")
        query_side, window_side = [int(p) for p in ds_parts]
        self.gguf_writer.add_vision_projector_query_side(query_side)
        self.gguf_writer.add_vision_projector_window_side(window_side)

        # Set vision feature layers
        self.gguf_writer.add_vision_feature_layers(self._vision_feature_layers)

        # Set the spatial offsets per projector
        self.gguf_writer.add_vision_spatial_offsets(self._spatial_offsets)

        # Add flattened image grid pinpoints (resolution candidates internally)
        if pinpoints := self.global_config.get("image_grid_pinpoints"):
            # Flatten with h, w -> w, h inversion
            pinpoints = [val for h, w in pinpoints for val in (w, h)]
            self.gguf_writer.add_vision_image_grid_pinpoints(pinpoints)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Drop the vision-model head and ``lm_head`` tensors; defer everything else to the parent."""
        name, _ = item
        if "vision_model.head" in name or name.startswith("lm_head"):
            return None
        return super().filter_tensors(item)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rewrite projector tensor names so their block id is the stacked projector index.

        Tensors that do not belong to a projector are passed to the parent unchanged.
        """
        # Detect projector tensors and bin them. The trailing "." is required so
        # that e.g. "...projectors.1" does not also claim "...projectors.10.*".
        projector_idx = None
        for prefix, proj_idx in self._tensor_prefix_map.items():
            if name.startswith(prefix + "."):
                projector_idx = proj_idx
                break

        if projector_idx is None:
            yield from super().modify_tensors(data_torch, name, bid)
            return

        # If this projector tensor has a block id within the projector,
        # alias the bid to projector_idx
        #
        # TODO: currently, none of the Granite 4 Vision models have
        # projectors with multiple QFormer layers, so the `layer.{}` index
        # is always 0. This allows us to simply map to a single `bid` that
        # matches the projector index. If this changes, we'll need a
        # convention that merges the two IDs.
        id_matches = list(re.finditer(r"\.([0-9]+)\.", name))
        all_ids = [int(m.group(1)) for m in id_matches]
        assert len(all_ids) >= 1 and len(all_ids) <= 2, "Must have at least 1 and at most 2 ids in tensor names"

        new_bid = projector_idx
        if len(all_ids) == 1:
            # Only the projector index is present: replace it in place
            new_name = name[:id_matches[0].span(1)[0]] + str(new_bid) + name[id_matches[0].span(1)[1]:]
        else:  # len(all_ids) == 2
            # Drop the projector index and put new_bid where the inner layer id was
            new_name = name[:id_matches[0].span(0)[0]] + name[id_matches[0].span(1)[1]:id_matches[1].span(1)[0]] + str(new_bid) + name[id_matches[1].span(1)[1]:]
        yield from super().modify_tensors(data_torch, new_name, new_bid)
+3 -10
View File
@@ -64,17 +64,11 @@ class LFM2Model(TextModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Lfm2Model", "Lfm2BidirectionalModel")
@ModelBase.register("Lfm2Model")
class LFM2ColBertModel(LFM2Model):
model_arch = gguf.MODEL_ARCH.LFM2
dense_tensor_name = "dense_2"
def set_gguf_parameters(self):
super().set_gguf_parameters()
if self.hf_arch == "Lfm2BidirectionalModel":
self.gguf_writer.add_causal_attention(False)
self._try_set_pooling_type()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith(self.dense_tensor_name):
name = "model." + name
@@ -82,11 +76,10 @@ class LFM2ColBertModel(LFM2Model):
yield from super().modify_tensors(data_torch, name, bid)
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# optional dense tensor is stored in a separate safetensors file
# dense tensor is stored in a separate safetensors file
from safetensors.torch import load_file
tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
if not tensors_file.is_file():
return
assert tensors_file.is_file()
tensor = load_file(tensors_file)["linear.weight"]
self.gguf_writer.add_embedding_length_out(tensor.shape[0])
yield f"{self.dense_tensor_name}.weight", tensor.clone()
+2 -132
View File
@@ -5,13 +5,12 @@ import math
from typing import Callable, Iterable, TYPE_CHECKING
import numpy as np
import torch
if TYPE_CHECKING:
from torch import Tensor
from .base import ModelBase, TextModel, gguf, logger
from .base import ModelBase, TextModel, gguf
@ModelBase.register(
@@ -22,10 +21,6 @@ from .base import ModelBase, TextModel, gguf, logger
"VLlama3ForCausalLM",
"LlavaForConditionalGeneration",
"VoxtralForConditionalGeneration",
"LlamaForCausalLMEagle3",
"Eagle3LlamaForCausalLM",
"Eagle3Speculator",
"Eagle3DraftModel",
"IQuestCoderForCausalLM",
"LlamaModel")
class LlamaModel(TextModel):
@@ -44,61 +39,7 @@ class LlamaModel(TextModel):
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
self.origin_hf_arch = hparams.get('architectures', [None])[0]
# Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
self.is_eagle3 = True
self.model_arch = gguf.MODEL_ARCH.EAGLE3
logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
# Re-initialize tensor_map with eagle3 architecture
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
# Update gguf_writer architecture
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
if self.target_model_dir is None:
raise ValueError(
"EAGLE-3 model requires --target-model-dir to be specified. "
"Please provide the path to the target model directory to read config.json"
)
# Read both eagle3 raw config and target model config
with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
eagle3_raw_config = json.load(f)
with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
target_config = json.load(f)
if "text_config" in target_config:
target_config = {**target_config, **target_config["text_config"]}
self.target_vocab_size = target_config["vocab_size"]
# target_layers: derived from target model layer count (low/mid/high)
target_num_layers = target_config["num_hidden_layers"]
target_layers = [2, target_num_layers // 2, target_num_layers - 3]
logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
self.gguf_writer.add_target_layers(target_layers)
# target_hidden_size: prefer eagle3 config, fallback to target config
if eagle3_raw_config.get("target_hidden_size") is not None:
target_hidden_size = eagle3_raw_config["target_hidden_size"]
src = "EAGLE-3 config"
else:
target_hidden_size = target_config["hidden_size"]
src = "target model config"
logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
self.gguf_writer.add_target_hidden_size(target_hidden_size)
# norm_before_residual (RedHat-style eagle3 specific)
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
self.gguf_writer.add_norm_before_residual(norm_before_residual)
def set_vocab(self):
# eagle3: use tokenizer from target model if provided
original_dir_model = None
if getattr(self, 'is_eagle3', False):
assert self.target_model_dir is not None
logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
original_dir_model = self.dir_model
self.dir_model = self.target_model_dir
if self.origin_hf_arch == "GlmasrModel":
return self._set_vocab_glmedge()
@@ -144,10 +85,6 @@ class LlamaModel(TextModel):
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)
# eagle3: Restore original dir_model
if original_dir_model is not None:
self.dir_model = original_dir_model
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
@@ -192,49 +129,7 @@ class LlamaModel(TextModel):
return super().filter_tensors((name, gen))
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
    """Index the checkpoint tensors, renaming EAGLE-3 draft layers when detected.

    If the hparams carry a nested ``transformer_layer_config``, its entries
    are merged into the top-level hparams first. A checkpoint is treated as
    an EAGLE-3 draft when it defines ``draft_vocab_size`` and has exactly one
    hidden layer; in that case ``midlayer.*`` and ``layers.0.*`` tensors are
    re-keyed under ``model.layers.0.*``.
    """
    indexed = super().index_tensors(remote_hf_model_id)

    # Handle Eagle3Speculator nested config
    nested_key = "transformer_layer_config"
    if nested_key in self.hparams:
        self.hparams = {**self.hparams, **self.hparams[nested_key]}

    # eagle3 detection
    is_eagle3_draft = "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1
    if not is_eagle3_draft:
        return indexed

    logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")
    renamed = {}
    for tensor_name, loader in indexed.items():
        if tensor_name.startswith("midlayer."):
            target = "model.layers.0." + tensor_name[len("midlayer."):]
        elif tensor_name.startswith("layers.0."):  # Eagle3Speculator format
            target = "model." + tensor_name
        else:
            target = tensor_name
        renamed[target] = loader
    return renamed
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# eagle3: special tensors that bypass standard llama mapping
if getattr(self, 'is_eagle3', False):
if name == "fc.weight":
yield (name, data_torch)
return
if name == "d2t":
# store for manual int64 handling in prepare_tensors (avoid F32 conversion)
if not hasattr(self, '_eagle3_int_tensors'):
self._eagle3_int_tensors = {}
self._eagle3_int_tensors[name] = data_torch
return
if name == "t2d":
# not used at runtime, skip
return
if name.endswith(".hidden_norm.weight"):
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch)
return
n_head = self.find_hparam(["n_heads", "num_attention_heads"])
n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
@@ -290,7 +185,7 @@ class LlamaModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
@@ -310,33 +205,8 @@ class LlamaModel(TextModel):
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
def prepare_tensors(self):
# eagle3: collect d2t original dtype before parent converts tensors to F32
eagle3_original_dtypes = {}
if getattr(self, 'is_eagle3', False):
for name, data_torch in self.get_tensors():
if name == "d2t":
eagle3_original_dtypes[name] = data_torch.dtype
super().prepare_tensors()
# eagle3: write d2t as absolute target token ids
if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
for name, data_torch in self._eagle3_int_tensors.items():
old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
data = data_torch.to(torch.int64).cpu().numpy()
if name == "d2t":
data = data.reshape(-1)
data = data + np.arange(data.size, dtype=np.int64)
if np.any((data < 0) | (data >= self.target_vocab_size)):
raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}")
if np.unique(data).size != data.size:
raise ValueError("EAGLE-3 d2t contains duplicate target ids")
data_qtype = gguf.GGMLQuantizationType.I64
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
+4 -3
View File
@@ -114,8 +114,7 @@ class Mamba2Model(TextModel):
hparams["text_config"] = hparams["llm_config"]
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
self.expand = self.find_hparam(["mamba_expand", "expand"], optional=True) or 2
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or self.expand * self.d_model
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
def set_vocab(self):
@@ -145,9 +144,11 @@ class Mamba2Model(TextModel):
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
# Fail early for models which don't have a block expansion factor of 2
# TODO: does this really matter?
# skip the assertion for FalconH1 Model
if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
assert self.d_inner == self.expand * self.d_model
assert self.d_inner == 2 * self.d_model
assert self.d_inner % head_dim == 0
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
+1 -1
View File
@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
+10 -6
View File
@@ -32,9 +32,11 @@ class MiniCPMModel(TextModel):
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if long_factors or short_factors:
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
@@ -83,11 +85,13 @@ class MiniCPM3Model(TextModel):
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if long_factors or short_factors:
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
rope_dims = self.hparams["qk_rope_head_dim"]
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+2 -3
View File
@@ -105,9 +105,8 @@ class MistralModel(LlamaModel):
gguf_writer.add_rope_scaling_yarn_log_mul(mscale_all_dim)
gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
llama_4_scaling = hparams.get("llama_4_scaling")
if llama_4_scaling is not None:
gguf_writer.add_attn_temperature_scale(llama_4_scaling["beta"])
if "llama_4_scaling" in hparams:
gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
class MistralMoeModel(DeepseekV2Model):
+3 -4
View File
@@ -125,18 +125,17 @@ class NemotronModel(TextModel):
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
# * Partial RoPE
rot_pct = self.rope_parameters["partial_rotary_factor"]
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
# * RopeScaling for Nemotron
factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
if factor is None:
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
else:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(factor)
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+11 -9
View File
@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
model_arch = gguf.MODEL_ARCH.PHI2
def set_gguf_parameters(self):
rot_pct = self.rope_parameters["partial_rotary_factor"]
rot_pct = self.find_hparam(["partial_rotary_factor"])
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
rms_eps = self.find_hparam(["rms_norm_eps"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
rope_dims = int(rot_pct * n_embd) // n_head
self.gguf_writer.add_context_length(max_pos_embds)
@@ -174,19 +174,18 @@ class Phi3MiniModel(TextModel):
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
rope_dims = int(rot_pct * n_embd) // n_head
# write rope scaling for long context (128k) model
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if not long_factors:
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is None:
return
scale = max_pos_embds / orig_max_pos_embds
rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type')
@@ -199,6 +198,9 @@ class Phi3MiniModel(TextModel):
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+1 -49
View File
@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
@@ -625,51 +625,3 @@ class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReor
@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
class Qwen3_5MoeTextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase):
model_arch = gguf.MODEL_ARCH.QWEN35MOE
@ModelBase.register("DFlashDraftModel")
class DFlashModel(Qwen3Model):
    """Conversion for DFlash draft models, which borrow the target model's tokenizer."""
    model_arch = gguf.MODEL_ARCH.DFLASH

    def set_vocab(self):
        """Write the vocabulary from the target model directory, plus the mask token id.

        Raises:
            ValueError: if no target model directory was supplied.
        """
        if self.target_model_dir is None:
            raise ValueError(
                "DFlash draft model requires --target-model-dir to be specified. "
                "Please provide the path to the target model directory containing the tokenizer."
            )

        logger.info(f"DFlash: Using tokenizer from target model: {self.target_model_dir}")
        # Temporarily point dir_model at the target so the parent reads its
        # tokenizer; always restore it, even if vocab loading fails.
        original_dir = self.dir_model
        self.dir_model = self.target_model_dir
        try:
            super().set_vocab()
        finally:
            self.dir_model = original_dir

        # `or {}` tolerates an explicit `"dflash_config": null` in the config
        dflash_config = self.hparams.get("dflash_config") or {}
        mask_token_id = dflash_config.get("mask_token_id")
        if mask_token_id is not None:
            self.gguf_writer.add_mask_token_id(mask_token_id)

    def set_gguf_parameters(self):
        """Write the base parameters plus block size, target layers and sliding-window settings."""
        super().set_gguf_parameters()

        block_size = self.hparams.get("block_size", 16)
        self.gguf_writer.add_block_size(block_size)

        dflash_config = self.hparams.get("dflash_config") or {}
        target_layer_ids = dflash_config.get("target_layer_ids", [])
        if target_layer_ids:
            # Written layer ids are the configured ids shifted by one
            extract_layer_ids = [i + 1 for i in target_layer_ids]
            self.gguf_writer.add_target_layers(extract_layer_ids)

        use_sliding_window = self.hparams.get("use_sliding_window", False)
        sliding_window = self.hparams.get("sliding_window")
        layer_types = self.hparams.get("layer_types")
        if use_sliding_window and sliding_window and layer_types:
            is_swa = [lt == "sliding_attention" for lt in layer_types]
            self.gguf_writer.add_sliding_window(sliding_window)
            self.gguf_writer.add_sliding_window_pattern(is_swa)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Ensure the tensor name carries a ``model.`` prefix before the parent filter runs."""
        name, gen = item
        if not name.startswith("model."):
            name = "model." + name
        return super().filter_tensors((name, gen))
+1 -1
View File
@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
rotary_factor = self.rope_parameters["partial_rotary_factor"]
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+1 -1
View File
@@ -314,7 +314,7 @@ class Step35Model(TextModel):
factor = float(rope_params.get("factor", 8.0))
low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))
old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
+1 -11
View File
@@ -153,15 +153,6 @@ def parse_args() -> argparse.Namespace:
help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
)
parser.add_argument(
"--target-model-dir", type=str, default=None,
help=(
"path to the target model directory; required when converting a standalone draft model "
"(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
"layer count to populate its GGUF."
),
)
args = parser.parse_args()
if not args.print_supported_models and args.model is None:
parser.error("the following arguments are required: model")
@@ -247,7 +238,7 @@ def main() -> None:
assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
from conversion.pixtral import PixtralModel
model_class = PixtralModel
elif hparams.get("moe") is not None:
elif "moe" in hparams:
from conversion.mistral import MistralMoeModel
model_class = MistralMoeModel
else:
@@ -278,7 +269,6 @@ def main() -> None:
small_first_shard=args.no_tensor_first_split,
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
fuse_gate_up_exps=args.fuse_gate_up_exps,
fp8_as_q8=args.fp8_as_q8,
)
-1
View File
@@ -100,7 +100,6 @@ models = [
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "tiny_aya", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
{"name": "cohere2moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/North-Mini-Code-1.0", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+7 -13
View File
@@ -25,7 +25,7 @@ import gguf
from gguf.constants import GGUFValueType
# reuse model definitions from the conversion/ package
from conversion import LazyTorchTensor, ModelBase, get_model_class, ModelType, get_model_architecture
from conversion import LazyTorchTensor, ModelBase, get_model_class
logger = logging.getLogger("lora-to-gguf")
@@ -311,10 +311,6 @@ def parse_args() -> argparse.Namespace:
"--base-model-id", type=str,
help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')",
)
parser.add_argument(
"--trust-remote-code", default=False, action="store_true",
help="trust remote code in the model",
)
parser.add_argument(
"lora_path", type=Path,
help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)",
@@ -323,11 +319,11 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args()
def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dict[str, Any], Path | None]:
def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
from huggingface_hub import try_to_load_from_cache
# normally, adapter does not come with base model config, we need to load it from AutoConfig
config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code)
config = AutoConfig.from_pretrained(hf_model_id)
cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None
@@ -376,13 +372,13 @@ if __name__ == '__main__':
# load base model
if base_model_id is not None:
logger.info(f"Loading base model from Hugging Face: {base_model_id}")
hparams, dir_base_model = load_hparams_from_hf(base_model_id, args.trust_remote_code)
hparams, dir_base_model = load_hparams_from_hf(base_model_id)
elif dir_base_model is None:
if "base_model_name_or_path" in lparams:
model_id = lparams["base_model_name_or_path"]
logger.info(f"Loading base model from Hugging Face: {model_id}")
try:
hparams, dir_base_model = load_hparams_from_hf(model_id, args.trust_remote_code)
hparams, dir_base_model = load_hparams_from_hf(model_id)
except OSError as e:
logger.error(f"Failed to load base model config: {e}")
logger.error("Please try downloading the base model and add its path to --base")
@@ -396,12 +392,10 @@ if __name__ == '__main__':
hparams = ModelBase.load_hparams(dir_base_model, False)
with torch.inference_mode():
model_arch = get_model_architecture(hparams, ModelType.TEXT)
try:
model_class = get_model_class(model_arch)
logger.info("Using model architecture: %s", model_arch)
model_class = get_model_class(hparams["architectures"][0])
except NotImplementedError:
logger.error(f"Model {model_arch} is not supported")
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
class LoraModel(model_class): # ty: ignore[unsupported-base]

Some files were not shown because too many files have changed in this diff Show More