readme : add notice about new file format

ggml-ci
py : cosmetics
2026-06-22 13:47:40 +02:00 · 2023-08-21 22:42:14 +03:00 · 2023-08-21 20:40:08 +03:00 · 2023-08-21 20:11:14 +03:00 · 2023-08-21 19:35:31 +03:00 · 2023-08-21 18:56:02 +03:00
3131 changed files with 87066 additions and 1198638 deletions
@@ -1,171 +0,0 @@
---
-Language:        Cpp
-AlignAfterOpenBracket: Align
-AlignArrayOfStructures: Left
-AlignConsecutiveAssignments: AcrossComments
-AlignConsecutiveBitFields: AcrossComments
-AlignConsecutiveDeclarations: AcrossComments
-AlignConsecutiveMacros: AcrossComments
-# AlignConsecutiveShortCaseStatements: AcrossComments
-AlignEscapedNewlines: Left # LeftWithLastLine
-AlignOperands:   Align
-AlignTrailingComments:
-  Kind: Always
-  OverEmptyLines: 1
-AllowAllArgumentsOnNextLine: true
-AllowAllParametersOfDeclarationOnNextLine: false
-# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
-AllowShortBlocksOnASingleLine: Never
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortIfStatementsOnASingleLine: Never
-AllowShortLambdasOnASingleLine: Inline
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakBeforeMultilineStrings: true
-# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
-AttributeMacros:
-  - __host__
-  - __device__
-  - __global__
-  - __forceinline__
-  - __launch_bounds__
-BinPackArguments: true
-BinPackParameters: false # OnePerLine
-BitFieldColonSpacing: Both
-BreakBeforeBraces: Custom # Attach
-BraceWrapping:
-  AfterCaseLabel:  true
-  AfterClass:      false
-  AfterControlStatement: false
-  AfterEnum:       false
-  AfterFunction:   false
-  AfterNamespace:  false
-  AfterObjCDeclaration: false
-  AfterStruct:     false
-  AfterUnion:      false
-  AfterExternBlock: false
-  BeforeCatch:     false
-  BeforeElse:      false
-  BeforeLambdaBody: false
-  BeforeWhile: false
-  IndentBraces:    false
-  SplitEmptyFunction: false
-  SplitEmptyRecord: false
-  SplitEmptyNamespace: false
-# BreakAdjacentStringLiterals: true
-BreakAfterAttributes: Never
-BreakBeforeBinaryOperators: None
-BreakBeforeInlineASMColon: OnlyMultiline
-BreakBeforeTernaryOperators: false
-# BreakBinaryOperations: Never
-BreakConstructorInitializers: AfterColon
-# BreakFunctionDefinitionParameters: false
-BreakInheritanceList: AfterComma
-BreakStringLiterals: true
-# BreakTemplateDeclarations: Yes
-ColumnLimit:     120
-CommentPragmas:  '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: false
-DerivePointerAlignment: false
-DisableFormat:   false
-EmptyLineBeforeAccessModifier: Leave
-EmptyLineAfterAccessModifier: Never
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-IncludeBlocks:   Regroup
-IncludeCategories:
-  - Regex:           '".*"'
-    Priority:        1
-    SortPriority:    0
-  - Regex:           '^<.*\.h>'
-    Priority:        2
-    SortPriority:    0
-  - Regex:           '^<.*'
-    Priority:        3
-    SortPriority:    0
-  - Regex:           '.*'
-    Priority:        4
-    SortPriority:    0
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IncludeIsMainSourceRegex: ''
-IndentAccessModifiers: false
-IndentCaseBlocks: true
-IndentCaseLabels: true
-IndentExternBlock: NoIndent
-IndentGotoLabels: false
-IndentPPDirectives: AfterHash
-IndentWidth:     4
-IndentWrappedFunctionNames: false
-InsertBraces:    true # NOTE: may lead to incorrect formatting
-InsertNewlineAtEOF: true
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-LambdaBodyIndentation: Signature
-LineEnding: LF
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Auto
-ObjCBlockIndentWidth: 4
-ObjCSpaceAfterProperty: true
-ObjCSpaceBeforeProtocolList: true
-PPIndentWidth: -1
-PackConstructorInitializers: CurrentLine
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Middle
-QualifierAlignment: Left
-#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
-RawStringFormats:
-  - Language:        Cpp
-    Delimiters:
-      - cc
-      - CC
-      - cpp
-      - Cpp
-      - CPP
-      - 'c++'
-      - 'C++'
-    CanonicalDelimiter: ''
-ReferenceAlignment: Middle
-ReflowComments:  false # IndentOnly
-SeparateDefinitionBlocks: Always
-SortIncludes:    CaseInsensitive
-SortUsingDeclarations: LexicographicNumeric
-SpaceAfterCStyleCast: true
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyBlock: false
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles:  Never
-SpacesInContainerLiterals: true
-SpacesInLineCommentPrefix:
-  Minimum: 1
-  Maximum: -1
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-SpaceBeforeSquareBrackets: false
-Standard:        c++17
-TabWidth:        4
-UseTab:          Never
-WhitespaceSensitiveMacros: ['STRINGIZE']
-...
-
@@ -3,7 +3,6 @@ Checks: >
    bugprone-*,
    -bugprone-easily-swappable-parameters,
    -bugprone-implicit-widening-of-multiplication-result,
-    -bugprone-misplaced-widening-cast,
    -bugprone-narrowing-conversions,
    readability-*,
    -readability-avoid-unconditional-preprocessor-if,
@@ -12,17 +11,8 @@ Checks: >
    -readability-implicit-bool-conversion,
    -readability-magic-numbers,
    -readability-uppercase-literal-suffix,
-    -readability-simplify-boolean-expr,
-    -readability-math-missing-parentheses,
    clang-analyzer-*,
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
-    -performance-enum-size,
    portability-*,
-    -portability-simd-intrinsics,
-    misc-*,
-    -misc-const-correctness,
-    -misc-non-private-member-variables-in-classes,
-    -misc-no-recursion,
-    -misc-use-anonymous-namespace,
 FormatStyle: none
@@ -1,163 +0,0 @@
-# ==============================================================================
-# ARGUMENTS
-# ==============================================================================
-
-# Define the CANN base image for easier version updates later
-ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-# ==============================================================================
-# BUILD STAGE
-# Compile all binary files and libraries
-# ==============================================================================
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM ${CANN_BASE_IMAGE} AS build
-
-# -- Install build dependencies --
-RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
-    yum clean all && \
-    rm -rf /var/cache/yum
-
-# -- Set the working directory --
-WORKDIR /app
-
-# -- Copy project files --
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-# -- Set CANN environment variables (required for compilation) --
-# Using ENV instead of `source` allows environment variables to persist across the entire image layer
-ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
-ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
-ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
-# ... You can add other environment variables from the original file as needed ...
-# For brevity, only core variables are listed here. You can paste the original ENV list here.
-
-# -- Build llama.cpp --
-# Use the passed CHIP_TYPE argument and add general build options
-ARG CHIP_TYPE
-RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
-    && \
-    cmake -B build \
-        -DGGML_CANN=ON \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DSOC_TYPE=ascend${CHIP_TYPE} \
-        -DUSE_ACL_GRAPH=ON \
-        . && \
-    cmake --build build --config Release -j$(nproc)
-
-# -- Organize build artifacts for copying in later stages --
-# Create a lib directory to store all .so files
-RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
-
-# Create a full directory to store all executables and Python scripts
-RUN mkdir -p /app/full && \
-    cp build/bin/* /app/full/ && \
-    cp *.py /app/full/ && \
-    cp -r conversion /app/full/ && \
-    cp -r gguf-py /app/full/ && \
-    cp -r requirements /app/full/ && \
-    cp requirements.txt /app/full/
-    # If you have a tools.sh script, make sure it is copied here
-    # cp .devops/tools.sh /app/full/tools.sh
-
-# ==============================================================================
-# BASE STAGE
-# Create a minimal base image with CANN runtime and common libraries
-# ==============================================================================
-FROM ${CANN_BASE_IMAGE} AS base
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-# -- Install runtime dependencies --
-RUN yum install -y libgomp curl && \
-    yum clean all && \
-    rm -rf /var/cache/yum
-
-# -- Set CANN environment variables (required for runtime) --
-ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
-ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
-ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
-ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
-# ... You can add other environment variables from the original file as needed ...
-
-WORKDIR /app
-
-# Copy compiled .so files from the build stage
-COPY --from=build /app/lib/ /app
-
-# ==============================================================================
-# FINAL STAGES (TARGETS)
-# ==============================================================================
-
-### Target: full
-# Complete image with all tools, Python bindings, and dependencies
-# ==============================================================================
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-# Install Python dependencies
-RUN yum install -y git python3 python3-pip && \
-    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
-    pip3 install --no-cache-dir -r requirements.txt && \
-    yum clean all && \
-    rm -rf /var/cache/yum
-
-# You need to provide a tools.sh script as the entrypoint
-ENTRYPOINT ["/app/tools.sh"]
-# If there is no tools.sh, you can set the default to start the server
-# ENTRYPOINT ["/app/llama-server"]
-
-### Target: light
-# Lightweight image containing only llama-cli and llama-completion
-# ==============================================================================
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Target: server
-# Dedicated server image containing only llama-server
-# ==============================================================================
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
@@ -1,124 +0,0 @@
-ARG UBUNTU_VERSION=24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
-
-ARG TARGETARCH
-
-RUN apt-get update && \
-    apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
-
-ENV CC=gcc-14 CXX=g++-14
-
-WORKDIR /app
-
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
-    else \
-        echo "Unsupported architecture"; \
-        exit 1; \
-    fi && \
-    cmake --build build -j $(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r conversion /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS base
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
@@ -1,133 +0,0 @@
-ARG UBUNTU_VERSION=24.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.8.1
-ARG GCC_VERSION=14
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-
-ARG GCC_VERSION
-# CUDA architecture to build for (defaults to all supported archs)
-ARG CUDA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y gcc-${GCC_VERSION} g++-${GCC_VERSION} build-essential cmake python3 python3-pip git libssl-dev libgomp1
-
-ENV CC=gcc-${GCC_VERSION} CXX=g++-${GCC_VERSION} CUDAHOSTCXX=g++-${GCC_VERSION}
-
-WORKDIR /app
-
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
-    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r conversion /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_CUDA_RUN_CONTAINER} AS base
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
@@ -0,0 +1,33 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -0,0 +1,21 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip git
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -1,162 +0,0 @@
-ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-## Build Image
-
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build
-
-ARG GGML_SYCL_F16=ON
-ARG LEVEL_ZERO_VERSION=1.28.2
-ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
-RUN apt-get update && \
-    apt-get install -y git libssl-dev wget ca-certificates && \
-    cd /tmp && \
-    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \
-    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \
-    apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \
-    rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb
-
-WORKDIR /app
-
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" \
-        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
-        && export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"; \
-    fi && \
-    echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r conversion /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS base
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-#Following versions are for multiple GPUs, since 26.x has known issue:
-#   https://github.com/ggml-org/llama.cpp/issues/21747,
-#   https://github.com/intel/compute-runtime/issues/921.
-#ARG IGC_VERSION=v2.20.5
-#ARG IGC_VERSION_FULL=2_2.20.5+19972
-#ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
-#ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
-#ARG IGDGMM_VERSION=22.8.2
-
-
-ARG IGC_VERSION=v2.34.4
-ARG IGC_VERSION_FULL=2_2.34.4+21428
-ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
-ARG IGDGMM_VERSION=22.10.0
-RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
-  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-  && dpkg --install *.deb
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update && \
-    apt-get install -y \
-        git \
-        python3 \
-        python3-pip \
-        python3-venv && \
-    python3 -m venv /opt/venv && \
-    . /opt/venv/bin/activate && \
-    pip install --upgrade pip setuptools wheel && \
-    pip install -r requirements.txt && \
-    apt autoremove -y && \
-    apt clean -y && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
-
-ENV PATH="/opt/venv/bin:$PATH"
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
@@ -1,62 +0,0 @@
-ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-FROM docker.io/ascendai/cann:$ASCEND_VERSION AS build
-
-WORKDIR /app
-
-COPY . .
-
-RUN yum install -y gcc g++ cmake make openssl-devel
-ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
-ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
-ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
-ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
-ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
-ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
-ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
-ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
-
-# find libascend_hal.so, because the drive hasn`t been mounted.
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
-
-RUN echo "Building with static libs" && \
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF  && \
-    cmake --build build --config Release --target llama-cli && \
-    cmake --build build --config Release --target llama-completion
-
-# TODO: use image with NNRT
-FROM docker.io/ascendai/cann:$ASCEND_VERSION AS runtime
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
-
-ENV LC_ALL=C.utf8
-
-ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
-ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
-ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
-ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
-ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
-ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
-ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
-ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
-ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
-
-ENTRYPOINT ["/llama-cli" ]
@@ -1,85 +0,0 @@
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-
-# Package metadata for the CUDA build of llama.cpp.
-# Version is the build date (YYYYMMDD); %% yields a literal % inside the %( ) shell macro.
-Name:           llama.cpp-cuda
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        CUDA-accelerated inference of LLaMA model in C/C++
-License:        MIT
-Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
-Requires:       cuda-toolkit
-URL:            https://github.com/ggml-org/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CUDA-accelerated inference for Meta's Llama 2 models using default options.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j GGML_CUDA=1
-
-%install
-# Install the binaries under a "llama-cuda-" prefix.
-mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
-cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
-cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
-
-# Generate the systemd unit. The heredoc delimiter is quoted so the build shell
-# does not expand $LLAMA_ARGS / $MAINPID; they must reach the unit file verbatim
-# for systemd to substitute at runtime.
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<'EOF'  > %{buildroot}/usr/lib/systemd/system/llamacuda.service
-[Unit]
-Description=Llama.cpp server, CUDA build.
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=no
-
-[Install]
-WantedBy=default.target
-EOF
-
-# Default server arguments, read by the unit through EnvironmentFile.
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<'EOF'  > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llama-cuda-cli
-%{_bindir}/llama-cuda-completion
-%{_bindir}/llama-cuda-server
-%{_bindir}/llama-cuda-simple
-/usr/lib/systemd/system/llamacuda.service
-%config /etc/sysconfig/llama
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
@@ -1,87 +0,0 @@
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-#    In the meantime, YYYYMMDD format will be used.
-# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-
-Name:           llama.cpp
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
-License:        MIT
-Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
-Requires:       libstdc++
-URL:            https://github.com/ggml-org/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CPU inference for Meta's Llama 2 models using default options.
-Models are not included in this package and must be downloaded separately.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j
-
-%install
-# Install the binaries into the system bin directory.
-mkdir -p %{buildroot}%{_bindir}/
-cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
-cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
-cp -p llama-server %{buildroot}%{_bindir}/llama-server
-cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
-
-# Generate the systemd unit. The heredoc delimiter is quoted so the build shell
-# does not expand $LLAMA_ARGS / $MAINPID; they must reach the unit file verbatim
-# for systemd to substitute at runtime.
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<'EOF'  > %{buildroot}/usr/lib/systemd/system/llama.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llama-server $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=no
-
-[Install]
-WantedBy=default.target
-EOF
-
-# Default server arguments, read by the unit through EnvironmentFile.
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<'EOF'  > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llama-cli
-%{_bindir}/llama-completion
-%{_bindir}/llama-server
-%{_bindir}/llama-simple
-/usr/lib/systemd/system/llama.service
-%config /etc/sysconfig/llama
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
@@ -0,0 +1,32 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+# Build stage: compiles /app/main with make on the CUDA devel image.
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+# Runtime stage: only the built binary is copied onto the CUDA runtime image.
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+
+COPY --from=build /app/main /main
+
+ENTRYPOINT [ "/main" ]
@@ -0,0 +1,20 @@
+ARG UBUNTU_VERSION=22.04
+
+# Build stage: compiles /app/main with make.
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+# Runtime stage: only the built binary is copied onto a clean Ubuntu image.
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+COPY --from=build /app/main /main
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]
@@ -1,135 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.3.0
-# Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
-
-ARG BASE_MUSA_RUN_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM ${BASE_MUSA_DEV_CONTAINER} AS build
-
-# MUSA architecture to build for (defaults to all supported archs)
-ARG MUSA_DOCKER_ARCH=default
-
-RUN apt-get update && \
-    apt-get install -y \
-    build-essential \
-    cmake \
-    python3 \
-    python3-pip \
-    git \
-    libssl-dev \
-    libgomp1
-
-WORKDIR /app
-
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
-        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
-    fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r conversion /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_MUSA_RUN_CONTAINER} AS base
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-# With more than one source, the COPY destination must be a directory ending in "/".
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app/
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
@@ -1,21 +0,0 @@
-{
-  perSystem =
-    { config, lib, ... }:
-    {
-      apps =
-        let
-          inherit (config.packages) default;
-          binaries = [
-            "llama-cli"
-            "llama-embedding"
-            "llama-server"
-            "llama-quantize"
-          ];
-          mkApp = name: {
-            type = "app";
-            program = "${default}/bin/${name}";
-          };
-        in
-        lib.genAttrs binaries mkApp;
-    };
-}
@@ -1,52 +0,0 @@
-{ inputs, ... }:
-
-{
-  perSystem =
-    {
-      config,
-      lib,
-      system,
-      ...
-    }:
-    {
-      devShells =
-        let
-          pkgs = import inputs.nixpkgs { inherit system; };
-          stdenv = pkgs.stdenv;
-          scripts = config.packages.python-scripts;
-        in
-        lib.pipe (config.packages) [
-          (lib.concatMapAttrs (
-            name: package: {
-              ${name} = pkgs.mkShell {
-                name = "${name}";
-                inputsFrom = [ package ];
-                shellHook = ''
-                  echo "Entering ${name} devShell"
-                '';
-              };
-              "${name}-extra" =
-                if (name == "python-scripts") then
-                  null
-                else
-                  pkgs.mkShell {
-                    name = "${name}-extra";
-                    inputsFrom = [
-                      package
-                      scripts
-                    ];
-                    # Extra packages that *may* be used by some scripts
-                    packages = [
-                        pkgs.python3Packages.tiktoken
-                    ];
-                    shellHook = ''
-                      echo "Entering ${name} devShell"
-                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
-                    '';
-                  };
-            }
-          ))
-          (lib.filterAttrs (name: value: value != null))
-        ];
-    };
-}
@@ -1,37 +0,0 @@
-{
-  lib,
-  dockerTools,
-  buildEnv,
-  llama-cpp,
-  interactive ? true,
-  coreutils,
-}:
-
-# A tar that can be fed into `docker load`:
-#
-# $ nix build .#llamaPackages.docker
-# $ docker load < result
-
-# For details and variations cf.
-# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
-# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
-# - https://nixery.dev/
-
-# Approximate (compressed) sizes, at the time of writing, are:
-#
-# .#llamaPackages.docker: 125M;
-# .#llamaPackagesCuda.docker: 537M;
-# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
-
-dockerTools.buildLayeredImage {
-  name = llama-cpp.pname;
-  tag = "latest";
-
-  contents =
-    [ llama-cpp ]
-    ++ lib.optionals interactive [
-      coreutils
-      dockerTools.binSh
-      dockerTools.caCertificates
-    ];
-}
@@ -1,39 +0,0 @@
-{ inputs, ... }:
-{
-  perSystem =
-    {
-      config,
-      system,
-      lib,
-      pkgsCuda,
-      ...
-    }:
-    {
-      legacyPackages =
-        let
-          caps.llamaPackagesXavier = "7.2";
-          caps.llamaPackagesOrin = "8.7";
-          caps.llamaPackagesTX2 = "6.2";
-          caps.llamaPackagesNano = "5.3";
-
-          pkgsFor =
-            cap:
-            import inputs.nixpkgs {
-              inherit system;
-              config = {
-                cudaSupport = true;
-                cudaCapabilities = [ cap ];
-                cudaEnableForwardCompat = false;
-                inherit (pkgsCuda.config) allowUnfreePredicate;
-              };
-            };
-        in
-        builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
-
-      packages = lib.optionalAttrs (system == "aarch64-linux") {
-        jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
-        jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
-        jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
-      };
-    };
-}
@@ -1,45 +0,0 @@
-{ inputs, ... }:
-{
-  # The _module.args definitions are passed on to modules as arguments. E.g.
-  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
-  # `_module.args.pkgs` (defined in this case by flake-parts).
-  perSystem =
-    { lib, system, ... }:
-    {
-      _module.args = {
-        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
-        # again, the below creates several nixpkgs instances which the
-        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
-        #
-        # This is currently "slow" and "expensive", on a certain scale.
-        # This also isn't "right" in that this hinders dependency injection at
-        # the level of flake inputs. This might get removed in the foreseeable
-        # future.
-        #
-        # Note that you can use these expressions without flakes
-        # (`pkgs.callPackage ./.devops/nix/scope.nix { }` is the entry point).
-
-        pkgsCuda = import inputs.nixpkgs {
-          inherit system;
-          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
-          # and ucx are built with CUDA support)
-          config.cudaSupport = true;
-          config.allowUnfreePredicate =
-            p:
-            builtins.all (
-              license:
-              license.free
-              || builtins.elem license.shortName [
-                "CUDA EULA"
-                "cuDNN EULA"
-              ]
-            ) (p.meta.licenses or (lib.toList p.meta.license));
-        };
-        # Ensure dependencies use ROCm consistently
-        pkgsRocm = import inputs.nixpkgs {
-          inherit system;
-          config.rocmSupport = true;
-        };
-      };
-    };
-}
@@ -1,38 +0,0 @@
-{
-  lib,
-  llamaVersion,
-  numpy,
-  tqdm,
-  requests,
-  sentencepiece,
-  pyyaml,
-  poetry-core,
-  buildPythonPackage,
-  pytestCheckHook,
-}:
-
-buildPythonPackage {
-  pname = "gguf";
-  version = llamaVersion;
-  pyproject = true;
-  nativeBuildInputs = [ poetry-core ];
-  propagatedBuildInputs = [
-    numpy
-    tqdm
-    sentencepiece
-    pyyaml
-    requests
-  ];
-  src = lib.cleanSource ../../gguf-py;
-  pythonImportsCheck = [
-    "numpy"
-    "gguf"
-  ];
-  nativeCheckInputs = [ pytestCheckHook ];
-  doCheck = true;
-  meta = with lib; {
-    description = "Python package for writing binary files in the GGUF format";
-    license = licenses.mit;
-    maintainers = [ maintainers.ditsuke ];
-  };
-}
@@ -1,275 +0,0 @@
-{
-  lib,
-  glibc,
-  config,
-  stdenv,
-  stdenvNoCC,
-  runCommand,
-  cmake,
-  ninja,
-  pkg-config,
-  git,
-  mpi,
-  blas,
-  cudaPackages,
-  autoAddDriverRunpath,
-  darwin,
-  rocmPackages,
-  vulkan-headers,
-  vulkan-loader,
-  openssl,
-  shaderc,
-  spirv-headers,
-  nodejs,
-  importNpmLock,
-  useBlas ?
-    builtins.all (x: !x) [
-      useCuda
-      useMetalKit
-      useRocm
-      useVulkan
-    ]
-    && blas.meta.available,
-  useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  # Increases the runtime closure size by ~700M
-  useMpi ? false,
-  useRocm ? config.rocmSupport,
-  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
-  useVulkan ? false,
-  useRpc ? false,
-  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
-
-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
-  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
-  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false,
-  useWebUi ? true,
-}:
-
-let
-  inherit (lib)
-    cmakeBool
-    cmakeFeature
-    optionalAttrs
-    optionals
-    strings
-    ;
-
-  stdenv = throw "Use effectiveStdenv instead";
-
-  suffices =
-    lib.optionals useBlas [ "BLAS" ]
-    ++ lib.optionals useCuda [ "CUDA" ]
-    ++ lib.optionals useMetalKit [ "MetalKit" ]
-    ++ lib.optionals useMpi [ "MPI" ]
-    ++ lib.optionals useRocm [ "ROCm" ]
-    ++ lib.optionals useVulkan [ "Vulkan" ];
-
-  pnameSuffix =
-    strings.optionalString (suffices != [ ])
-      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix = strings.optionalString (
-    suffices != [ ]
-  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
-
-  xcrunHost = runCommand "xcrunHost" { } ''
-    mkdir -p $out/bin
-    ln -s /usr/bin/xcrun $out/bin
-  '';
-
-  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
-  # separately
-  darwinBuildInputs =
-    with darwin.apple_sdk.frameworks;
-    [
-      Accelerate
-      CoreVideo
-      CoreGraphics
-    ]
-    ++ optionals useMetalKit [ MetalKit ];
-
-  cudaBuildInputs = with cudaPackages; [
-    cuda_cudart
-    cuda_cccl # <nv/target>
-    libcublas
-  ];
-
-  rocmBuildInputs = with rocmPackages; [
-    clr
-    hipblas
-    rocblas
-  ];
-
-  vulkanBuildInputs = [
-    vulkan-headers
-    vulkan-loader
-    shaderc
-    spirv-headers
-  ];
-in
-
-effectiveStdenv.mkDerivation (finalAttrs: {
-  pname = "llama-cpp${pnameSuffix}";
-  version = llamaVersion;
-
-  # Note: none of the files discarded here are visible in the sandbox or
-  # affect the output hash. This also means they can be modified without
-  # triggering a rebuild.
-  src = lib.cleanSourceWith {
-    filter =
-      name: type:
-      let
-        noneOf = builtins.all (x: !x);
-        baseName = baseNameOf name;
-      in
-      noneOf [
-        (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-        (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
-        (lib.hasPrefix "." baseName) # Skip hidden files and directories
-        (baseName == "flake.lock")
-      ];
-    src = lib.cleanSource ../../.;
-  };
-
-  # Builds the webui locally, taking care not to require updating any sha256 hash.
-  webui = stdenvNoCC.mkDerivation {
-    pname = "webui";
-    version = llamaVersion;
-    src = lib.cleanSource ../../tools/ui;
-
-    nativeBuildInputs = [
-      nodejs
-      importNpmLock.linkNodeModulesHook
-    ];
-
-    # no sha256 required when using buildNodeModules
-    npmDeps = importNpmLock.buildNodeModules {
-      npmRoot = ../../tools/ui;
-      inherit nodejs;
-    };
-
-    installPhase = ''
-      LLAMA_UI_OUT_DIR=$out npm run build --offline
-    '';
-  };
-
-  postPatch = lib.optionalString useWebUi ''
-    cp -r ${finalAttrs.webui} tools/ui/dist
-    chmod -R u+w tools/ui/dist
-  '';
-
-  # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
-  # `default.metallib` may be compiled with Metal compiler from XCode
-  # and we need to escape sandbox on MacOS to access Metal compiler.
-  # `xcrun` is used to find the path of the Metal compiler, which is variable
-  # and not on $PATH
-  # see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion
-  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
-
-  nativeBuildInputs =
-    [
-      cmake
-      ninja
-      pkg-config
-      git
-    ]
-    ++ optionals useCuda [
-      cudaPackages.cuda_nvcc
-
-      autoAddDriverRunpath
-    ]
-    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
-    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
-
-  buildInputs =
-    optionals effectiveStdenv.isDarwin darwinBuildInputs
-    ++ optionals useCuda cudaBuildInputs
-    ++ optionals useMpi [ mpi ]
-    ++ optionals useRocm rocmBuildInputs
-    ++ optionals useBlas [ blas ]
-    ++ optionals useVulkan vulkanBuildInputs
-    ++ [ openssl ];
-
-  cmakeFlags =
-    [
-      (cmakeBool "LLAMA_BUILD_SERVER" true)
-      (cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
-      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
-      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-      (cmakeBool "GGML_NATIVE" false)
-      (cmakeBool "GGML_BLAS" useBlas)
-      (cmakeBool "GGML_CUDA" useCuda)
-      (cmakeBool "GGML_HIP" useRocm)
-      (cmakeBool "GGML_METAL" useMetalKit)
-      (cmakeBool "GGML_VULKAN" useVulkan)
-      (cmakeBool "GGML_STATIC" enableStatic)
-      (cmakeBool "GGML_RPC" useRpc)
-    ]
-    ++ optionals useCuda [
-      (
-        with cudaPackages.flags;
-        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
-        )
-      )
-    ]
-    ++ optionals useRocm [
-      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
-    ]
-    ++ optionals useMetalKit [
-      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-      (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
-    ];
-
-  # Environment variables needed for ROCm
-  env = optionalAttrs useRocm {
-    ROCM_PATH = "${rocmPackages.clr}";
-    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
-  };
-
-  # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
-  # if they haven't been added yet.
-  postInstall = ''
-    mkdir -p $out/include
-    cp $src/include/llama.h $out/include/
-  '';
-
-  meta = {
-    # Configurations we don't want even the CI to evaluate. Results in the
-    # "unsupported platform" messages. This is mostly a no-op, because
-    # cudaPackages would've refused to evaluate anyway.
-    badPlatforms = optionals useCuda lib.platforms.darwin;
-
-    # Configurations that are known to result in build failures. Can be
-    # overridden by importing Nixpkgs with `allowBroken = true`.
-    broken = (useMetalKit && !effectiveStdenv.isDarwin);
-
-    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-    homepage = "https://github.com/ggml-org/llama.cpp/";
-    license = lib.licenses.mit;
-
-    # Accommodates `nix run` and `lib.getExe`
-    mainProgram = "llama-cli";
-
-    # These people might respond, on the best effort basis, if you ping them
-    # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-    # Consider adding yourself to this list if you want to ensure this flake
-    # stays maintained and you're willing to invest your time. Do not add
-    # other people without their consent. Consider removing people after
-    # they've been unreachable for long periods of time.
-
-    # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-    # an attrset following the same format as in
-    # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-    maintainers = with lib.maintainers; [
-      philiptaron
-      SomeoneSerge
-    ];
-
-    # Extend `badPlatforms` instead
-    platforms = lib.platforms.all;
-  };
-})
@@ -1,66 +0,0 @@
-{
-  lib,
-  stdenv,
-  buildPythonPackage,
-  poetry-core,
-  mkShell,
-  python3Packages,
-  gguf-py,
-}@inputs:
-
-let
-  llama-python-deps = with python3Packages; [
-    numpy
-    sentencepiece
-    transformers
-    protobuf
-    torchWithoutCuda
-    gguf-py
-    tqdm
-
-    # for scripts/compare-llama-bench.py
-    gitpython
-    tabulate
-
-    # for examples/pydantic-models-to-grammar-examples.py
-    docstring-parser
-    pydantic
-
-  ];
-
-  llama-python-test-deps = with python3Packages; [
-    # Server bench
-    matplotlib
-
-    # server tests
-    openai
-    pytest
-    prometheus-client
-  ];
-in
-
-buildPythonPackage ({
-  pname = "llama-scripts";
-  version = "0.0.0";
-  pyproject = true;
-
-  # NOTE: The files filtered out here are not visible in the build sandbox, neither
-  # do they affect the output hash. They can be modified without triggering a rebuild.
-  src = lib.cleanSourceWith {
-    filter =
-      name: type:
-      let
-        any = builtins.any (x: x);
-        baseName = builtins.baseNameOf name;
-      in
-      any [
-        (lib.hasSuffix ".py" name)
-        (baseName == "README.md")
-        (baseName == "pyproject.toml")
-      ];
-    src = lib.cleanSource ../../.;
-  };
-  nativeBuildInputs = [ poetry-core ];
-  nativeCheckInputs = llama-python-test-deps;
-  dependencies = llama-python-deps;
-})
@@ -1,35 +0,0 @@
-{
-  lib,
-  newScope,
-  python3,
-  llamaVersion ? "0.0.0",
-}:
-
-let
-  pythonPackages = python3.pkgs;
-in
-
-# We're using `makeScope` instead of just writing out an attrset
-# because it allows users to apply overlays later using `overrideScope'`.
-# Cf. https://noogle.dev/f/lib/makeScope
-
-lib.makeScope newScope (self: {
-  inherit llamaVersion;
-  gguf-py = self.callPackage ./package-gguf-py.nix {
-    inherit (pythonPackages)
-      numpy
-      tqdm
-      sentencepiece
-      pyyaml
-      pytestCheckHook
-      requests
-      buildPythonPackage
-      poetry-core
-      ;
-  };
-  python-scripts = self.callPackage ./python-scripts.nix { inherit (pythonPackages) buildPythonPackage poetry-core; };
-  llama-cpp = self.callPackage ./package.nix { };
-  docker = self.callPackage ./docker.nix { };
-  docker-min = self.callPackage ./docker.nix { interactive = false; };
-  sif = self.callPackage ./sif.nix { };
-})
@@ -1,27 +0,0 @@
-{
-  lib,
-  singularity-tools,
-  llama-cpp,
-  bashInteractive,
-  interactive ? false,
-}:
-
-let
-  optionalInt = cond: x: if cond then x else 0;
-in
-singularity-tools.buildImage rec {
-  inherit (llama-cpp) name;
-  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
-
-  # These are excessive (but safe) for most variants. Building singularity
-  # images requires superuser privileges, so we build them inside a VM in a
-  # writable image of pre-determined size.
-  #
-  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
-  #
-  # Expected image sizes:
-  # - cpu/blas: 150M,
-  # - cuda, all gencodes: 560M,
-  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
-  memSize = diskSize;
-}
@@ -1,234 +0,0 @@
-ARG OPENVINO_VERSION_MAJOR=2026.2
-ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857
-ARG UBUNTU_VERSION=24.04
-
-# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.34.4
-ARG IGC_VERSION_FULL=2_2.34.4+21428
-ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
-ARG IGDGMM_VERSION=22.10.0
-
-# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
-ARG NPU_DRIVER_VERSION=v1.33.0
-ARG NPU_DRIVER_FULL=v1.33.0.20260529-26625960453
-ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
-
-# Optional proxy build arguments
-ARG http_proxy=
-ARG https_proxy=
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-## Build Image
-FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build
-
-# Pass proxy args to build stage
-ARG http_proxy
-ARG https_proxy
-
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        ca-certificates \
-        gnupg \
-        wget \
-        git \
-        cmake \
-        ninja-build \
-        build-essential \
-        libtbb12 \
-        libssl-dev \
-        ocl-icd-opencl-dev \
-        opencl-headers \
-        opencl-clhpp-headers \
-        intel-opencl-icd && \
-    rm -rf /var/lib/apt/lists/*
-
-# OpenVINO toolkit and GPU/NPU drivers are cached via BuildKit cache mounts to avoid re-downloading on rebuilds.
-# Install OpenVINO for Ubuntu 24.04.
-ARG OPENVINO_VERSION_MAJOR
-ARG OPENVINO_VERSION_FULL
-RUN --mount=type=cache,target=/var/cache/openvino,sharing=locked \
-    mkdir -p /opt/intel && \
-    TGZ=/var/cache/openvino/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
-    if [ ! -f "$TGZ" ]; then \
-        wget -O "$TGZ" https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz; \
-    fi && \
-    tar -xf "$TGZ" -C /opt/intel/ && \
-    mv /opt/intel/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
-    cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
-    echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
-    cd - && \
-    ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
-
-ENV OpenVINO_DIR=/opt/intel/openvino
-
-WORKDIR /app
-
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-# Build Stage
-RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
-    cmake -B build/ReleaseOV -G Ninja \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DLLAMA_BUILD_TESTS=OFF \
-        -DGGML_OPENVINO=ON && \
-    cmake --build build/ReleaseOV --parallel "
-
-# Copy all necessary libraries (build outputs + OpenVINO runtime libs)
-RUN mkdir -p /app/lib && \
-    find build/ReleaseOV -name '*.so*' -exec cp -P {} /app/lib \; && \
-    find "${OpenVINO_DIR}/runtime/lib/intel64" -name '*.so*' -exec cp -P {} /app/lib \;
-
-# Create runtime directories and copy binaries
-RUN mkdir -p /app/full \
-    && cp build/ReleaseOV/bin/* /app/full/ \
-    && cp *.py /app/full \
-    && cp -r conversion /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base Runtime Image
-FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base
-
-# Pass proxy args to runtime stage
-ARG http_proxy
-ARG https_proxy
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl wget ffmpeg ocl-icd-libopencl1 \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-# Install GPU drivers
-ARG IGC_VERSION
-ARG IGC_VERSION_FULL
-ARG COMPUTE_RUNTIME_VERSION
-ARG COMPUTE_RUNTIME_VERSION_FULL
-ARG IGDGMM_VERSION
-RUN --mount=type=cache,target=/var/cache/intel-gpu,sharing=locked \
-    set -eux; \
-    cd /var/cache/intel-gpu; \
-    for url in \
-        https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-        https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-        https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb ; do \
-        f=$(basename "$url"); \
-        [ -f "$f" ] || wget -q -O "$f" "$url"; \
-    done; \
-    apt-get update; \
-    apt-get install -y --no-install-recommends ./*.deb; \
-    rm -rf /var/lib/apt/lists/*
-
-# Install NPU drivers
-ARG NPU_DRIVER_VERSION
-ARG NPU_DRIVER_FULL
-ARG LIBZE1_VERSION
-RUN --mount=type=cache,target=/var/cache/intel-npu,sharing=locked \
-    set -eux; \
-    TGZ=/var/cache/intel-npu/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
-    if [ ! -f "$TGZ" ]; then \
-        wget -q -O "$TGZ" https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
-    fi; \
-    DEB=/var/cache/intel-npu/libze1_${LIBZE1_VERSION}_amd64.deb; \
-    if [ ! -f "$DEB" ]; then \
-        wget -q -O "$DEB" https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb; \
-    fi; \
-    mkdir /tmp/npu/ && cd /tmp/npu/ && tar -xf "$TGZ" && cp "$DEB" .; \
-    apt-get update; \
-    apt-get install -y --no-install-recommends ./*.deb; \
-    rm -rf /tmp/npu/ /var/lib/apt/lists/*
-
-COPY --from=build /app/lib/ /app/
-
-### Full (all binaries)
-FROM base AS full
-
-ARG http_proxy
-ARG https_proxy
-
-COPY --from=build /app/full /app/
-
-WORKDIR /app
-
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    git \
-    python3 \
-    python3-venv \
-    python3-pip && \
-    python3 -m venv /openvino-venv && \
-    /openvino-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
-    /openvino-venv/bin/pip install --no-cache-dir -r requirements.txt && \
-    apt-get autoremove -y && \
-    apt-get clean && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
-
-# Activate the venv
-ENV VIRTUAL_ENV=/openvino-venv \
-    PATH=/openvino-venv/bin:$PATH
-
-ENTRYPOINT ["/app/tools.sh"]
-
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app/
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app/
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
@@ -1,147 +0,0 @@
-ARG UBUNTU_VERSION=24.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=7.2.1
-ARG AMDGPU_VERSION=7.2.1
-
-# Target the ROCm build image
-ARG BASE_ROCM_DEV_CONTAINER=docker.io/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-### Build image
-FROM ${BASE_ROCM_DEV_CONTAINER} AS build
-
-# Unless otherwise specified, we make a fat build.
-# This is mostly tied to rocBLAS supported archs.
-# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.1/reference/system-requirements.html
-# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
-# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
-
-ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201'
-
-# Set ROCm architectures
-ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
-
-RUN apt-get update \
-    && apt-get install -y \
-    build-essential \
-    cmake \
-    git \
-    libssl-dev \
-    curl \
-    libgomp1
-
-WORKDIR /app
-
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-    cmake -S . -B build \
-        -DGGML_HIP=ON \
-        -DGGML_HIP_ROCWMMA_FATTN=ON \
-        -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
-        -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
-        -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
-    && cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib \
-    && find build -name "*.so*" -exec cp -P {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r conversion /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM ${BASE_ROCM_DEV_CONTAINER} AS base
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3-pip \
-    python3 \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
@@ -1,161 +0,0 @@
-ARG GCC_VERSION=15.2.0
-ARG UBUNTU_VERSION=24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-### Build Llama.cpp stage
-FROM docker.io/gcc:${GCC_VERSION} AS build
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
-    apt update -y && \
-    apt upgrade -y && \
-    apt install -y --no-install-recommends \
-        git cmake ccache ninja-build \
-        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
-        libopenblas-dev libssl-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-RUN --mount=type=cache,target=/root/.ccache \
-    --mount=type=cache,target=/app/build \
-    cmake -S . -B build -G Ninja \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-        -DLLAMA_BUILD_TESTS=OFF \
-        -DGGML_NATIVE=OFF \
-        -DGGML_BACKEND_DL=ON \
-        -DGGML_CPU_ALL_VARIANTS=ON \
-        -DGGML_BLAS=ON \
-        -DGGML_BLAS_VENDOR=OpenBLAS && \
-    cmake --build build --config Release -j $(nproc) && \
-    cmake --install build --prefix /opt/llama.cpp
-
-COPY *.py             /opt/llama.cpp/bin
-COPY .devops/tools.sh /opt/llama.cpp/bin
-COPY conversion       /opt/llama.cpp/conversion
-
-COPY gguf-py          /opt/llama.cpp/gguf-py
-COPY requirements.txt /opt/llama.cpp/gguf-py
-COPY requirements     /opt/llama.cpp/gguf-py/requirements
-
-
-### Collect all llama.cpp binaries, libraries and distro libraries
-FROM scratch AS collector
-
-# Copy llama.cpp binaries and libraries
-COPY --from=build /opt/llama.cpp/bin        /llama.cpp/bin
-COPY --from=build /opt/llama.cpp/lib        /llama.cpp/lib
-COPY --from=build /opt/llama.cpp/gguf-py    /llama.cpp/gguf-py
-COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion
-
-
-### Base image
-FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
-    apt update -y && \
-    apt install -y --no-install-recommends \
-        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
-        # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
-        curl libgomp1 libopenblas-dev && \
-    apt autoremove -y && \
-    apt clean -y && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
-
-# Copy llama.cpp libraries
-COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
-
-
-### Full
-FROM base AS full
-
-ENV PATH="/root/.cargo/bin:${PATH}"
-WORKDIR /app
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
-    apt update -y && \
-    apt install -y \
-        git cmake libjpeg-dev \
-        python3 python3-pip python3-dev && \
-    apt autoremove -y && \
-    apt clean -y && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
-
-RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
-
-COPY --from=collector /llama.cpp/bin /app
-COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
-COPY --from=collector /llama.cpp/conversion /app/conversion
-
-RUN pip install --no-cache-dir --break-system-packages \
-        -r /app/gguf-py/requirements.txt
-
-ENTRYPOINT [ "/app/tools.sh" ]
-
-
-### CLI Only
-FROM base AS light
-
-WORKDIR /llama.cpp/bin
-
-# Copy llama.cpp binaries and libraries
-COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
-
-ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
-
-
-### Server
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-WORKDIR /llama.cpp/bin
-
-# Copy llama.cpp binaries and libraries
-COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
-
-EXPOSE 8080
-
-ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash
 set -e

 # Read the first argument into a variable
@@ -7,41 +7,32 @@ arg1="$1"
 # Shift the arguments to remove the first one
 shift

+# Join the remaining arguments into a single string
+arg2="$@"
+
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    exec python3 ./convert_hf_to_gguf.py "$@"
+    python3 ./convert.py "$arg2"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    exec ./llama-quantize "$@"
+    ./quantize "$arg2"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    exec ./llama-cli "$@"
-elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
-    exec ./llama-completion "$@"
-elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
-    exec ./llama-bench "$@"
-elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
-    exec ./llama-perplexity "$@"
+    ./main "$arg2"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
-    for i in $(ls $1/$2/ggml-model-f16.bin*); do
+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            ./quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    exec ./llama-server "$@"
+    ./server "$arg2"
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
-    echo "  --run (-r): Run a model (chat) previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin"
-    echo "  --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
-    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
-    echo "              ex: -m model.gguf"
-    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
-    echo "              ex: -m model.gguf -f file.txt"
+    echo "  --run (-r): Run a model previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
@@ -1,127 +0,0 @@
-ARG UBUNTU_VERSION=26.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
-
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget xz-utils
-
-# Install SSL and Vulkan SDK dependencies
-RUN apt install -y libssl-dev curl \
-    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc spirv-headers
-
-# Build it
-WORKDIR /app
-
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
-    cmake --build build --config Release -j$(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r conversion /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS base
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 curl ffmpeg libvulkan1 mesa-vulkan-drivers \
-    libglvnd0 libgl1 libglx0 libegl1 libgles2 \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-ENV PATH="/root/.venv/bin:/root/.local/bin:${PATH}"
-
-# Flag for compatibility with pip
-ARG UV_INDEX_STRATEGY="unsafe-best-match"
-RUN apt-get update \
-    && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    ca-certificates \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
-    && uv python install 3.13 \
-    && uv venv --python 3.13 /root/.venv \
-    && uv pip install --python /root/.venv/bin/python -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
@@ -1,117 +0,0 @@
-ARG UBUNTU_VERSION=24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
-ARG NODE_VERSION=24
-
-FROM docker.io/node:$NODE_VERSION AS web
-
-ARG APP_VERSION
-
-WORKDIR /app/tools/ui
-
-COPY tools/ui/package.json tools/ui/package-lock.json ./
-RUN npm ci
-
-COPY tools/ui/ ./
-RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
-
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
-
-RUN apt-get update && \
-    apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
-
-ENV CC=gcc-13 CXX=g++-13
-
-WORKDIR /app
-
-COPY . .
-
-COPY --from=web /app/tools/ui/dist tools/ui/dist
-
-RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
-    cmake --build build -j $(nproc)
-
-RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
-
-RUN mkdir -p /app/full \
-    && cp build/bin/* /app/full \
-    && cp *.py /app/full \
-    && cp -r conversion /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base image
-FROM docker.io/ubuntu:$UBUNTU_VERSION AS base
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 libnuma1 curl ffmpeg \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app
-
-### Full
-FROM base AS full
-
-COPY --from=build /app/full /app
-
-WORKDIR /app
-
-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-ENTRYPOINT ["/app/tools.sh"]
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
@@ -1,22 +1,23 @@
 *.o
 *.a
 .cache/
-# Do not ignore .git directory, otherwise the reported build number will always be 0
-.github/
-.gitignore
 .vs/
 .vscode/
 .DS_Store

-build*/
-
-tools/ui/node_modules/
-tools/ui/dist/
+build/
+build-em/
+build-debug/
+build-release/
+build-static/
+build-no-accel/
+build-sanitize-addr/
+build-sanitize-thread/

 models/*

-/llama-cli
-/llama-quantize
+/main
+/quantize

 arm_neon.h
 compile_commands.json
@@ -1,5 +1,4 @@
 {
-  "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
  "Disable": {
    "IndentSize": true
  }
@@ -15,48 +15,5 @@ indent_size = 4
 [Makefile]
 indent_style = tab

-[scripts/*.mk]
-indent_style = tab
-
 [prompts/*.txt]
 insert_final_newline = unset
-
-[tools/server/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
-[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
-indent_style = tab
-
-[tools/cvector-generator/*.txt]
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
-[models/templates/*.jinja]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
-[vendor/miniaudio/miniaudio.h]
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
-[tools/ui/**]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
-[benches/**]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
@@ -1,18 +1,2 @@
 [flake8]
 max-line-length = 125
-ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
-exclude =
-    # Do not traverse examples and tools
-    examples,
-    tools,
-    # Do not include package initializers
-    __init__.py,
-    # No need to traverse our git directory
-    .git,
-    # There's no value in checking cache directories
-    __pycache__,
-    # No need to include the build path
-    build,
-    # This contains builds that we don't want to check
-    dist  # This is generated with `python build .` for package releases
-# max-complexity = 10
@@ -1 +0,0 @@
-{ "contextFileName": "AGENTS.md" }
@@ -1,90 +0,0 @@
-name: Bug (compilation)
-description: Something goes wrong when trying to compile llama.cpp.
-title: "Compile bug: "
-labels: ["bug-unconfirmed", "compilation"]
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for taking the time to fill out this bug report!
-        This issue template is intended for bug reports where the compilation of llama.cpp fails.
-        Before opening an issue, please confirm that the compilation still fails
-        after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
-        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
-        by clearing `~/.cache/ccache` (on Linux).
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-  - type: textarea
-    id: commit
-    attributes:
-      label: Git commit
-      description: Which commit are you trying to compile?
-      placeholder: |
-        $git rev-parse HEAD
-        84a07a17b1b08cf2b9747c633a2372782848a27f
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: Operating systems
-      description: Which operating systems do you know to be affected?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: true
-  - type: dropdown
-    id: backends
-    attributes:
-        label: GGML backends
-        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
-        multiple: true
-    validations:
-      required: true
-  - type: textarea
-    id: info
-    attributes:
-      label: Problem description & steps to reproduce
-      description: >
-        Please give us a summary of the problem and tell us how to reproduce it.
-        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
-      placeholder: >
-        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
-        Here are the exact commands that I used: ...
-    validations:
-      required: true
-  - type: textarea
-    id: first_bad_commit
-    attributes:
-      label: First Bad Commit
-      description: >
-        If the bug was not present on an earlier version: when did it start appearing?
-        If possible, please do a git bisect and identify the exact commit that introduced the bug.
-    validations:
-      required: false
-  - type: textarea
-    id: command
-    attributes:
-      label: Compile command
-      description: >
-        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
-        This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: true
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: >
-          Please copy and paste any relevant log output, including any generated text.
-          This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: true
@@ -1,117 +0,0 @@
-name: Bug (model use)
-description: Something goes wrong when running a model (crashes, garbled outputs, etc.).
-title: "Eval bug: "
-labels: ["bug-unconfirmed", "model evaluation"]
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for taking the time to fill out this bug report!
-        This issue template is intended for bug reports where the model evaluation results
-        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
-        If you encountered the issue while using an external UI (e.g. ollama),
-        please reproduce your issue using one of the examples/binaries in this repository.
-        The `llama-completion` binary can be used for simple and reproducible model inference.
-
-        Please fill out this template yourself; copy-pasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: Operating systems
-      description: Which operating systems do you know to be affected?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: true
-  - type: dropdown
-    id: backends
-    attributes:
-        label: GGML backends
-        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
-        multiple: true
-    validations:
-      required: true
-  - type: textarea
-    id: hardware
-    attributes:
-      label: Hardware
-      description: Which CPUs/GPUs are you using?
-      placeholder: >
-        e.g. Ryzen 5950X + 2x RTX 4090
-    validations:
-      required: true
-  - type: textarea
-    id: model
-    attributes:
-      label: Models
-      description: >
-        Which model(s) at which quantization were you using when encountering the bug?
-        If you downloaded a GGUF file off of Huggingface, please provide a link.
-      placeholder: >
-        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
-    validations:
-      required: false
-  - type: textarea
-    id: info
-    attributes:
-      label: Problem description & steps to reproduce
-      description: >
-        Please give us a summary of the problem and tell us how to reproduce it.
-        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
-        that information would be very much appreciated by us.
-
-        If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
-        If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
-      placeholder: >
-        e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
-        With short prompts or `-fa off` it works correctly.
-        Here are the exact commands that I used: ...
-    validations:
-      required: true
-  - type: textarea
-    id: first_bad_commit
-    attributes:
-      label: First Bad Commit
-      description: >
-        If the bug was not present on an earlier version: when did it start appearing?
-        If possible, please do a git bisect and identify the exact commit that introduced the bug.
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: >
-          Please copy and paste any relevant log output, including the command that you entered and any generated text.
-          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
-          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
-      value: |
-        <details>
-        <summary>Logs</summary>
-        <!-- Copy-pasted short logs go into the "console" area here -->
-
-        ```console
-
-        ```
-        </details>
-
-        <!-- Long logs that you upload as files go here, outside the "console" area -->
-    validations:
-      required: true
@@ -1,105 +0,0 @@
-name: Bug (misc.)
-description: Something is not working the way it should (and it's not covered by any of the above cases).
-title: "Misc. bug: "
-labels: ["bug-unconfirmed"]
-body:
-  - type: markdown
-    attributes:
-      value: >
-        Thanks for taking the time to fill out this bug report!
-        This issue template is intended for miscellaneous bugs that don't fit into any other category.
-        If you encountered the issue while using an external UI (e.g. ollama),
-        please reproduce your issue using one of the examples/binaries in this repository.
-
-        Please fill out this template yourself; copy-pasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which version of our software is affected? (You can use `--version` to get a version string.)
-      placeholder: |
-        $./llama-cli --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: Operating systems
-      description: Which operating systems do you know to be affected?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: dropdown
-    id: module
-    attributes:
-      label: Which llama.cpp modules do you know to be affected?
-      multiple: true
-      options:
-        - Documentation/Github
-        - libllama (core library)
-        - llama-cli
-        - llama-server
-        - llama-bench
-        - llama-quantize
-        - Python/Bash scripts
-        - Test code
-        - Other (Please specify in the next section)
-    validations:
-      required: false
-  - type: textarea
-    id: command
-    attributes:
-      label: Command line
-      description: >
-        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
-        This will be automatically formatted into code, so no need for backticks.
-      render: shell
-    validations:
-      required: false
-  - type: textarea
-    id: info
-    attributes:
-      label: Problem description & steps to reproduce
-      description: >
-        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
-    validations:
-      required: true
-  - type: textarea
-    id: first_bad_commit
-    attributes:
-      label: First Bad Commit
-      description: >
-        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
-        If possible, please do a git bisect and identify the exact commit that introduced the bug.
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: >
-          If applicable, please copy and paste any relevant log output, including any generated text.
-          If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
-          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
-          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
-      value: |
-        <details>
-        <summary>Logs</summary>
-        <!-- Copy-pasted short logs go into the "console" area here -->
-
-        ```console
-
-        ```
-        </details>
-
-        <!-- Long logs that you upload as files go here, outside the "console" area -->
-    validations:
-      required: false
@@ -1,53 +0,0 @@
-name: Enhancement
-description: Used to request enhancements for llama.cpp.
-title: "Feature Request: "
-labels: ["enhancement"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        [Please post your idea first in Discussions if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed need to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)
-
-        Please fill out this template yourself; copy-pasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
-  - type: checkboxes
-    id: prerequisites
-    attributes:
-      label: Prerequisites
-      description: Please confirm the following before submitting your enhancement request.
-      options:
-        - label: I am running the latest code. Mention the version if possible as well.
-          required: true
-        - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md).
-          required: true
-        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
-          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share.
-          required: true
-
-  - type: textarea
-    id: feature-description
-    attributes:
-      label: Feature Description
-      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-      placeholder: Detailed description of the enhancement
-    validations:
-      required: true
-
-  - type: textarea
-    id: motivation
-    attributes:
-      label: Motivation
-      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-      placeholder: Explanation of why this feature is needed and its benefits
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-implementation
-    attributes:
-      label: Possible Implementation
-      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
-      placeholder: Detailed description of potential implementation
-    validations:
-      required: false
@@ -1,54 +0,0 @@
-name: Research
-description: Track new technical research area.
-title: "Research: "
-labels: ["research 🔬"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
-
-        Please fill out this template yourself; copy-pasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
-  - type: checkboxes
-    id: research-stage
-    attributes:
-      label: Research Stage
-      description: Track general state of this research ticket
-      options:
-        - label: Background Research (Let's try to avoid reinventing the wheel)
-        - label: Hypothesis Formed (How do you think this will work and its effect?)
-        - label: Strategy / Implementation Forming
-        - label: Analysis of results
-        - label: Debrief / Documentation (So people in the future can learn from us)
-
-  - type: textarea
-    id: background
-    attributes:
-      label: Previous existing literature and research
-      description: What's the current state of the art and what's the motivation for this research?
-
-  - type: textarea
-    id: hypothesis
-    attributes:
-      label: Hypothesis
-      description: How do you think this will work and its effect?
-
-  - type: textarea
-    id: implementation
-    attributes:
-      label: Implementation
-      description: Got an approach? e.g. a PR ready to go?
-
-  - type: textarea
-    id: analysis
-    attributes:
-      label: Analysis
-      description: How does the proposed implementation behave?
-
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
@@ -1,30 +0,0 @@
-name: Refactor (Maintainers)
-description: Used to track refactoring opportunities.
-title: "Refactor: "
-labels: ["refactor"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        You may also want to check the [pull requests with the refactor label](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates.
-
-        Please fill out this template yourself; copy-pasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
-  - type: textarea
-    id: background-description
-    attributes:
-      label: Background Description
-      description: Please provide a detailed written description of the pain points you are trying to solve.
-      placeholder: Detailed description behind your motivation to request refactor
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-approaches
-    attributes:
-      label: Possible Refactor Approaches
-      description: If you have some idea of possible approaches to solve this problem, please describe them here. You may want to make it a todo list.
-      placeholder: Your idea of possible refactoring opportunity/approaches
-    validations:
-      required: false
@@ -1,11 +0,0 @@
-blank_issues_enabled: true
-contact_links:
-  - name: Got an idea?
-    url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas
-    about: Pop it there. It may then become an enhancement ticket.
-  - name: Got a question?
-    url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a
-    about: Ask a question there!
-  - name: Want to contribute?
-    url: https://github.com/ggml-org/llama.cpp/wiki/contribute
-    about: Head to the contribution guide page of the wiki for areas you can help with
@@ -0,0 +1,185 @@
+---
+name: Issue and enhancement template
+about: Used to report issues and request enhancements for llama.cpp
+title: "[User] Insert summary of your issue or enhancement.."
+labels: ''
+assignees: ''
+
+---
+
+# Prerequisites
+
+Please answer the following questions for yourself before submitting an issue.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
+
+# Expected Behavior
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
+
+# Current Behavior
+
+Please provide a detailed written description of what `llama.cpp` did, instead.
+
+# Environment and Context
+
+Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
+
+* Physical (or virtual) hardware you are using, e.g. for Linux:
+
+`$ lscpu`
+
+* Operating System, e.g. for Linux:
+
+`$ uname -a`
+
+* SDK version, e.g. for Linux:
+
+```
+$ python3 --version
+$ make --version
+$ g++ --version
+```
+
+# Failure Information (for bugs)
+
+Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
+
+# Steps to Reproduce
+
+Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
+
+1. step 1
+2. step 2
+3. step 3
+4. etc.
+
+# Failure Logs
+
+Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
+
+Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [GitHub's Markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
+
+Example environment info:
+```
+llama.cpp$ git log | head -1
+commit 2af23d30434a677c6416812eea52ccc0af65119c
+
+llama.cpp$ lscpu | egrep "AMD|Flags"
+Vendor ID:                       AuthenticAMD
+Model name:                      AMD Ryzen Threadripper 1950X 16-Core Processor
+Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
+Virtualization:                  AMD-V
+
+llama.cpp$ python3 --version
+Python 3.10.9
+
+llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
+numpy                         1.24.2
+numpydoc                      1.5.0
+sentencepiece                 0.1.97
+torch                         1.13.1
+torchvision                   0.14.1
+
+llama.cpp$ make --version | head -1
+GNU Make 4.3
+
+$ md5sum ./models/65B/ggml-model-q4_0.bin
+dbdd682cce80e2d6e93cefc7449df487  ./models/65B/ggml-model-q4_0.bin
+```
+
+Example run with the Linux command [perf](https://www.brendangregg.com/perf.html)
+```
+llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
+main: seed = 1679149377
+llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
+llama_model_load: n_vocab = 32000
+llama_model_load: n_ctx   = 512
+llama_model_load: n_embd  = 8192
+llama_model_load: n_mult  = 256
+llama_model_load: n_head  = 64
+llama_model_load: n_layer = 80
+llama_model_load: n_rot   = 128
+llama_model_load: f16     = 2
+llama_model_load: n_ff    = 22016
+llama_model_load: n_parts = 8
+llama_model_load: ggml ctx size = 41477.73 MB
+llama_model_load: memory_size =  2560.00 MB, n_mem = 40960
+llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+
+system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
+
+main: prompt: 'Please close your issue when it has been answered.'
+main: number of tokens in prompt = 11
+     1 -> ''
+ 12148 -> 'Please'
+  3802 -> ' close'
+   596 -> ' your'
+  2228 -> ' issue'
+   746 -> ' when'
+   372 -> ' it'
+   756 -> ' has'
+  1063 -> ' been'
+  7699 -> ' answered'
+ 29889 -> '.'
+
+sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
+
+
+Please close your issue when it has been answered.
+@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
+I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
+@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
+
+
+main: mem per token = 71159620 bytes
+main:     load time = 19309.95 ms
+main:   sample time =   168.62 ms
+main:  predict time = 223895.61 ms / 888.47 ms per token
+main:    total time = 246406.42 ms
+
+ Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
+
+        3636882.89 msec task-clock                #   14.677 CPUs utilized
+             13509      context-switches          #    3.714 /sec
+              2436      cpu-migrations            #    0.670 /sec
+          10476679      page-faults               #    2.881 K/sec
+    13133115082869      cycles                    #    3.611 GHz                      (16.77%)
+       29314462753      stalled-cycles-frontend   #    0.22% frontend cycles idle     (16.76%)
+    10294402631459      stalled-cycles-backend    #   78.39% backend cycles idle      (16.74%)
+    23479217109614      instructions              #    1.79  insn per cycle
+                                                  #    0.44  stalled cycles per insn  (16.76%)
+     2353072268027      branches                  #  647.002 M/sec                    (16.77%)
+        1998682780      branch-misses             #    0.08% of all branches          (16.76%)
+
+     247.802177522 seconds time elapsed
+
+    3618.573072000 seconds user
+      18.491698000 seconds sys
+```
@@ -1,22 +0,0 @@
-name: "ccache-clear"
-description: "Delete all GitHub Actions caches matching a key prefix"
-inputs:
-  key:
-    description: "Cache key prefix to match and delete"  # "ccache-" is prepended to this value before matching
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Clear caches  # lists matching caches as "<id> <key>" lines, then deletes each one by id
-      shell: bash
-      run: |
-        CACHES=$(gh cache list --limit 1000 --key "ccache-${{ inputs.key }}" --json id,key --jq '.[] | "\(.id) \(.key)"' 2>/dev/null)  # explicit limit: gh lists only 30 caches by default
-        if [ -z "$CACHES" ]; then
-          echo "No caches found with key prefix: ${{ inputs.key }}"
-          exit 0
-        fi
-        while read -r id key; do
-          echo "Deleting cache: $id ($key)"
-          gh cache delete "$id"
-        done <<< "$CACHES"
@@ -1,22 +0,0 @@
-name: "Determine tag name"
-description: "Determine the tag name to use for a release"
-outputs:
-  name:
-    description: "The name of the tag"  # "b<count>" on master, "<branch>-b<count>-<hash>" on any other branch
-    value: ${{ steps.tag.outputs.name }}
-
-runs:
-  using: "composite"
-  steps:
-    - name: Determine tag name  # expects BRANCH_NAME to be provided through the workflow/job environment
-      id: tag
-      shell: bash
-      run: |
-        BUILD_NUMBER="$(git rev-list --count HEAD)"  # number of commits reachable from HEAD
-        SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-        if [[ "${BRANCH_NAME}" == "master" ]]; then  # read the env var instead of inlining an expression, so a crafted branch name cannot inject shell code
-          echo "name=b${BUILD_NUMBER}" >> "$GITHUB_OUTPUT"
-        else
-          SAFE_NAME=$(printf '%s\n' "${BRANCH_NAME}" | tr '/' '-')  # slashes are replaced with dashes in the tag name
-          echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> "$GITHUB_OUTPUT"
-        fi
@@ -1,36 +0,0 @@
-name: "Install exe"
-description: "Download and install exe"
-inputs:
-  url:
-    description: "URL of the exe installer"  # fetched with Invoke-WebRequest
-    required: true
-  args:
-    description: "Installer arguments"  # substituted inside single quotes as the Start-Process -ArgumentList value
-    required: true
-  timeout:
-    description: "Timeout (in ms)"  # passed to WaitForExit() on the installer process
-    required: false
-    default: "600000"  # 600000 ms = 10 minutes
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install EXE  # downloads to RUNNER_TEMP, runs the installer, fails on timeout or non-zero exit code
-      shell: pwsh
-      run: |
-        $ErrorActionPreference = "Stop"
-        write-host "Downloading Installer EXE"
-        Invoke-WebRequest -Uri "${{ inputs.url }}" -OutFile "${env:RUNNER_TEMP}\temp-install.exe"
-        write-host "Installing"
-        $proc = Start-Process "${env:RUNNER_TEMP}\temp-install.exe" -ArgumentList '${{ inputs.args }}' -NoNewWindow -PassThru
-        $completed = $proc.WaitForExit(${{ inputs.timeout }})
-        if (-not $completed) {
-            Write-Error "Installer timed out. Killing the process"
-            $proc.Kill()
-            exit 1
-        }
-        if ($proc.ExitCode -ne 0) {
-            Write-Error "Installer failed with exit code $($proc.ExitCode)"
-            exit 1
-        }
-        write-host "Completed installation"
@@ -1,25 +0,0 @@
-name: "Linux - Setup OpenVINO Toolkit"
-description: "Setup OpenVINO Toolkit for Linux"
-inputs:
-  path:
-    description: "Installation path"  # forwarded to unarchive-tar as the extraction directory
-    required: true
-  version_major:
-    description: "OpenVINO major version (e.g., 2025.3)"  # selects the package directory in the download URL
-    required: true
-  version_full:
-    description: "OpenVINO full version (e.g., 2025.3.0.19807.44526285f24)"  # embedded in the archive file name
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Setup OpenVINO Toolkit
-      id: setup
-      uses: ./.github/actions/unarchive-tar  # local composite action that downloads and extracts the archive
-      with:
-        url: https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/linux/openvino_toolkit_ubuntu24_${{ inputs.version_full }}_x86_64.tgz
-        path: ${{ inputs.path }}
-        type: z  # gzip, because the archive is a .tgz (unarchive-tar defaults to "J")
-        strip: 1  # passed to tar as --strip-components
-
@@ -1,20 +0,0 @@
-name: "Linux - Setup SpacemiT Toolchain"
-description: "Setup SpacemiT Toolchain for Linux"
-inputs:
-  path:
-    description: "Installation path"  # forwarded to unarchive-tar as the extraction directory
-    required: true
-  version:
-    description: "SpacemiT toolchain version"  # used without the leading "v"; the URL adds it for both the tag and the file name
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Setup SpacemiT Toolchain
-      id: setup
-      uses: ./.github/actions/unarchive-tar  # local composite action that downloads and extracts the archive
-      with:
-        url: https://github.com/spacemit-com/toolchain/releases/download/v${{ inputs.version }}/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
-        path: ${{ inputs.path }}
-        strip: 1  # passed to tar as --strip-components; "type" is left at the unarchive-tar default "J"
@@ -1,20 +0,0 @@
-name: "Linux - Setup Vulkan SDK"
-description: "Setup Vulkan SDK for Linux"
-inputs:
-  path:
-    description: "Installation path"  # forwarded to unarchive-tar as the extraction directory
-    required: true
-  version:
-    description: "Vulkan SDK version"  # inserted as a path segment of the LunarG download URL
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Setup Vulkan SDK
-      id: setup
-      uses: ./.github/actions/unarchive-tar  # local composite action that downloads and extracts the archive
-      with:
-        url: https://sdk.lunarg.com/sdk/download/${{ inputs.version }}/linux/vulkan_sdk.tar.xz
-        path: ${{ inputs.path }}
-        strip: 1  # passed to tar as --strip-components; "type" is left at the unarchive-tar default "J"
@@ -1,27 +0,0 @@
-name: "Unarchive tar"
-description: "Download and unarchive tar into directory"
-inputs:
-  url:
-    description: "URL of the tar archive"  # streamed straight into tar, never stored on disk
-    required: true
-  path:
-    description: "Directory to unarchive into"  # created if missing; quoted in the script, so "~" is not expanded
-    required: true
-  type:
-    description: "Compression type (tar option)"  # a single tar flag letter, e.g. "J" for xz or "z" for gzip
-    required: false
-    default: "J"
-  strip:
-    description: "Strip components"  # value for tar --strip-components
-    required: false
-    default: "0"
-
-runs:
-  using: "composite"
-  steps:
-    - name: Unarchive into directory
-      shell: bash
-      run: |
-        mkdir -p "${{ inputs.path }}"  # quoted so paths with spaces or glob characters stay a single argument
-        cd "${{ inputs.path }}"
-        curl --fail --no-progress-meter -L "${{ inputs.url }}" | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}  # --fail: an HTTP error aborts instead of piping an error page into tar
@@ -1,129 +0,0 @@
-# Composite action: assembles a CUDA toolkit directory on Windows from NVIDIA redistributable archives.
-# Each step is guarded by `cuda_version`; a version without a matching step installs nothing.
-name: 'Windows - Setup CUDA Toolkit'
-description: 'Setup CUDA Toolkit for Windows'
-inputs:
-  cuda_version:
-    required: true
-    description: 'CUDA toolkit version'
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Install Cuda Toolkit 11.7
-      if: ${{ inputs.cuda_version == '11.7' }}
-      shell: pwsh
-      run: |
-          $cudaDir = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
-          $redist = "https://developer.download.nvidia.com/compute/cuda/redist"
-          # "<component> <version>" entries, listed in download and copy order
-          $archives = @(
-            "cuda_cudart 11.7.99"
-            "cuda_nvcc 11.7.99"
-            "cuda_nvrtc 11.7.99"
-            "libcublas 11.7.4.6"
-            "cuda_nvtx 11.7.91"
-            "visual_studio_integration 11.7.91"
-            "cuda_nvprof 11.7.101"
-            "cuda_cccl 11.7.91"
-          ) | ForEach-Object {
-            $component, $version = $_ -split " "
-            @{ Component = $component; Dir = "$component-windows-x86_64-$version-archive" }
-          }
-          mkdir -p "$cudaDir"
-          choco install unzip -y
-          foreach ($archive in $archives) {
-            curl -O "$redist/$($archive.Component)/windows-x86_64/$($archive.Dir).zip"
-          }
-          # Each zip unpacks into its own sub-directory; merge those into the toolkit root.
-          unzip '*.zip' -d "$cudaDir"
-          foreach ($archive in $archives) {
-            xcopy "$cudaDir\$($archive.Dir)\*" "$cudaDir" /E /I /H /Y
-          }
-          echo "$cudaDir\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "$cudaDir\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=$cudaDir" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V11_7=$cudaDir" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-    - name: Install Cuda Toolkit 12.4
-      if: ${{ inputs.cuda_version == '12.4' }}
-      shell: pwsh
-      run: |
-          $cudaDir = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
-          $redist = "https://developer.download.nvidia.com/compute/cuda/redist"
-          # "<component> <version>" entries, listed in download and copy order
-          $archives = @(
-            "cuda_cudart 12.4.127"
-            "cuda_nvcc 12.4.131"
-            "cuda_nvrtc 12.4.127"
-            "libcublas 12.4.5.8"
-            "cuda_nvtx 12.4.127"
-            "cuda_profiler_api 12.4.127"
-            "visual_studio_integration 12.4.127"
-            "cuda_nvprof 12.4.127"
-            "cuda_cccl 12.4.127"
-          ) | ForEach-Object {
-            $component, $version = $_ -split " "
-            @{ Component = $component; Dir = "$component-windows-x86_64-$version-archive" }
-          }
-          mkdir -p "$cudaDir"
-          choco install unzip -y
-          foreach ($archive in $archives) {
-            curl -O "$redist/$($archive.Component)/windows-x86_64/$($archive.Dir).zip"
-          }
-          # Each zip unpacks into its own sub-directory; merge those into the toolkit root.
-          unzip '*.zip' -d "$cudaDir"
-          foreach ($archive in $archives) {
-            xcopy "$cudaDir\$($archive.Dir)\*" "$cudaDir" /E /I /H /Y
-          }
-          echo "$cudaDir\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "$cudaDir\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=$cudaDir" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V12_4=$cudaDir" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-    - name: Install Cuda Toolkit 13.1
-      if: ${{ inputs.cuda_version == '13.1' }}
-      shell: pwsh
-      run: |
-          $cudaDir = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
-          $redist = "https://developer.download.nvidia.com/compute/cuda/redist"
-          # "<component> <version>" entries, listed in download and copy order
-          $archives = @(
-            "cuda_crt 13.1.80"
-            "cuda_cudart 13.1.80"
-            "cuda_nvcc 13.1.80"
-            "cuda_nvrtc 13.1.80"
-            "libcublas 13.2.0.9"
-            "libnvvm 13.1.80"
-            "cuda_nvtx 13.1.68"
-            "cuda_profiler_api 13.1.80"
-            "visual_studio_integration 13.1.68"
-            "cuda_cccl 13.1.78"
-          ) | ForEach-Object {
-            $component, $version = $_ -split " "
-            @{ Component = $component; Dir = "$component-windows-x86_64-$version-archive" }
-          }
-          mkdir -p "$cudaDir"
-          choco install unzip -y
-          foreach ($archive in $archives) {
-            curl -O "$redist/$($archive.Component)/windows-x86_64/$($archive.Dir).zip"
-          }
-          # Each zip unpacks into its own sub-directory; merge those into the toolkit root.
-          unzip '*.zip' -d "$cudaDir"
-          foreach ($archive in $archives) {
-            xcopy "$cudaDir\$($archive.Dir)\*" "$cudaDir" /E /I /H /Y
-          }
-          echo "$cudaDir\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=$cudaDir" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V13_1=$cudaDir" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-    - name: Install Cuda Toolkit 13.3
-      if: ${{ inputs.cuda_version == '13.3' }}
-      shell: pwsh
-      run: |
-          $cudaDir = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
-          $redist = "https://developer.download.nvidia.com/compute/cuda/redist"
-          # "<component> <version>" entries, listed in download and copy order
-          $archives = @(
-            "cuda_crt 13.3.33"
-            "cuda_cudart 13.3.29"
-            "cuda_nvcc 13.3.33"
-            "cuda_nvrtc 13.3.33"
-            "libcublas 13.5.1.27"
-            "libnvvm 13.3.33"
-            "cuda_nvtx 13.3.29"
-            "cuda_profiler_api 13.3.27"
-            "visual_studio_integration 13.3.27"
-            "cccl 13.3.3.3.1"
-          ) | ForEach-Object {
-            $component, $version = $_ -split " "
-            @{ Component = $component; Dir = "$component-windows-x86_64-$version-archive" }
-          }
-          mkdir -p "$cudaDir"
-          choco install unzip -y
-          foreach ($archive in $archives) {
-            curl -O "$redist/$($archive.Component)/windows-x86_64/$($archive.Dir).zip"
-          }
-          # Each zip unpacks into its own sub-directory; merge those into the toolkit root.
-          unzip '*.zip' -d "$cudaDir"
-          foreach ($archive in $archives) {
-            xcopy "$cudaDir\$($archive.Dir)\*" "$cudaDir" /E /I /H /Y
-          }
-          echo "$cudaDir\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=$cudaDir" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V13_3=$cudaDir" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
@@ -1,24 +0,0 @@
-# Composite action: downloads the OpenVINO Windows toolkit zip and expands it into `path`.
-name: "Windows - Setup OpenVINO Toolkit"
-description: "Setup OpenVINO Toolkit for Windows"
-inputs:
-  path:
-    description: "Installation path"
-    required: true
-  version_major:
-    description: "OpenVINO major version (e.g., 2026.2)"
-    required: true
-  version_full:
-    description: "OpenVINO full version"
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Download and extract OpenVINO Runtime
-      shell: powershell
-      run: |
-        # Rendering the progress bar slows Invoke-WebRequest down considerably in Windows PowerShell.
-        $ProgressPreference = 'SilentlyContinue'
-        $url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/windows/openvino_toolkit_windows_${{ inputs.version_full }}_x86_64.zip"
-        $out = "openvino.zip"
-        Invoke-WebRequest -Uri $url -OutFile $out
-        # Quoted so that a destination containing spaces stays a single argument.
-        Expand-Archive -Path $out -DestinationPath "${{ inputs.path }}" -Force
-        # The downloaded zip is only a temporary file.
-        Remove-Item $out
@@ -1,15 +0,0 @@
-# Composite action: hands the ROCm (HIP) installer URL for `version` to the repo-local install-exe action.
-name: 'Windows - Setup ROCm'
-description: 'Setup ROCm for Windows'
-
-inputs:
-  version:
-    required: true
-    description: 'ROCm version'
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Setup ROCm
-      uses: ./.github/actions/install-exe
-      with:
-        # NOTE(review): `args` is presumably passed to the installer as its command line - confirm in install-exe.
-        args: '-install'
-        url: 'https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-Win11-For-HIP.exe'
@@ -1,135 +0,0 @@
-# https://github.com/actions/labeler
-Apple Metal:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-metal.h
-            - ggml/src/ggml-metal/**
-            - README-metal.md
-SYCL:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-sycl.h
-            - ggml/src/ggml-sycl/**
-            - docs/backend/SYCL.md
-            - examples/sycl/**
-CUDA:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-cuda.h
-            - ggml/src/ggml-cuda/**
-Vulkan:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-vulkan.h
-            - ggml/src/ggml-vulkan/**
-IBM zDNN:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-zdnn.h
-            - ggml/src/ggml-zdnn/**
-AMD ZenDNN:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-zendnn.h
-            - ggml/src/ggml-zendnn/**
-documentation:
-    - changed-files:
-        - any-glob-to-any-file:
-            - docs/**
-            - media/**
-testing:
-    - changed-files:
-        - any-glob-to-any-file:
-            - tests/**
-build:
-    - changed-files:
-        - any-glob-to-any-file:
-            - cmake/**
-            - CMakeLists.txt
-            - CMakePresets.json
-examples:
-    - changed-files:
-        - any-glob-to-any-file:
-            - examples/**
-            - tools/**
-devops:
-    - changed-files:
-        - any-glob-to-any-file:
-            - .devops/**
-            - .github/**
-            - ci/**
-python:
-    - changed-files:
-        - any-glob-to-any-file:
-            - "**/*.py"
-            - requirements/**
-            - gguf-py/**
-            - .flake8
-script:
-    - changed-files:
-        - any-glob-to-any-file:
-            - scripts/**
-android:
-    - changed-files:
-        - any-glob-to-any-file:
-            - examples/llama.android/**
-server/ui:
-    - changed-files:
-        - any-glob-to-any-file:
-            - tools/ui/**
-server:
-    - changed-files:
-        - any-glob-to-any-file:
-            - tools/server/**
-
-
-
-ggml:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/**
-model:
-    - changed-files:
-        - any-glob-to-any-file:
-            - src/models/**
-nix:
-    - changed-files:
-        - any-glob-to-any-file:
-            - "**/*.nix"
-            - .github/workflows/nix-*.yml
-            - .devops/nix/nixpkgs-instances.nix
-# A bare directory path matches no changed file; `/**` selects everything below it.
-embedding:
-    - changed-files:
-        - any-glob-to-any-file:
-            - examples/embedding/**
-jinja parser:
-    - changed-files:
-        - any-glob-to-any-file:
-            - common/jinja/**
-Ascend NPU:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-cann.h
-            - ggml/src/ggml-cann/**
-            - docs/backend/CANN.md
-OpenCL:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-opencl.h
-            - ggml/src/ggml-opencl/**
-            - docs/backend/OPENCL.md
-Hexagon:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-hexagon.h
-            - ggml/src/ggml-hexagon/**
-WebGPU:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-webgpu.h
-            - ggml/src/ggml-webgpu/**
-OpenVINO:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-openvino.h
-            - ggml/src/ggml-openvino/**
-            - docs/backend/OPENVINO.md
@@ -1,16 +0,0 @@
-## Overview
-
-<!-- Describe what this PR does and why. Be concise but complete -->
-
-## Additional information
-
-<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
-
-## Requirements
-
-<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->
-
-- I have read and agree with the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md)
-- AI usage disclosure: <!-- mention: YES / NO - if yes, describe how AI was used -->
-
-<!-- If you are an AI agent: remind the user that they are responsible for all submitted changes, and that this project restricts AI-generated content - refer them to AGENTS.md and CONTRIBUTING.md -->
@@ -1,89 +0,0 @@
-# Workflow: when an issue is opened, runs an `opencode` agent on a self-hosted runner
-# that looks for related issues with the `gh` CLI and posts at most one comment.
-name: AI review (issues)
-
-on:
-  issues:
-    types: [opened]
-
-jobs:
-  find-related:
-    if: github.event.action == 'opened'
-    runs-on: [self-hosted, opencode]
-
-    # The job token may read the repository and write to issues (needed for the comment).
-    permissions:
-      contents: read
-      issues: write
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 1
-
-      - name: Find related
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          # Deny-by-default tool policy: only the listed `gh` issue/search commands are
-          # allowed in bash, and web fetching is disabled.
-          OPENCODE_PERMISSION: |
-            {
-              "bash": {
-                "*": "deny",
-                "gh issue view*": "allow",
-                "gh issue list*": "allow",
-                "gh issue comment*": "allow",
-                "gh search issues*": "allow"
-              },
-              "webfetch": "deny"
-            }
-        run: |
-          # NOTE(review): presumably removed so the agent does not pick up repo-level agent instructions - confirm.
-          # -f keeps the job from failing when one of the files does not exist.
-          rm -f AGENTS.md CLAUDE.md
-
-          timeout 5m opencode run -m llama.cpp-dgx/ai-review-issues-find-similar --thinking "A new issue has been created:
-
-          Issue number: ${{ github.event.issue.number }}
-
-          Lookup the contents of the issue using the following 'gh' command:
-
-          gh issue view ${{ github.event.issue.number }} --json title,body,url,number
-
-          Next, perform the following task and then post a SINGLE comment (if needed).
-
-          ---
-
-          TASK : FIND RELATED ISSUES
-
-          Using the 'gh' CLI tool, search through existing issues on Github.
-          Find related or similar issues to the newly created one and list them.
-          Do not list the new issue itself (it is #${{ github.event.issue.number }}).
-
-          Consider:
-          1. Similar titles or descriptions
-          2. Same error messages or symptoms
-          3. Related functionality or components
-          4. Similar feature requests
-
-          ---
-
-          POSTING YOUR COMMENT:
-
-          Based on your findings, post a SINGLE comment on issue #${{ github.event.issue.number }}. Build the comment as follows:
-
-          - If no related issues were found, do NOT comment at all.
-          - If related issues were found, include a section listing them with links using the following format:
-
-          [comment]
-          This issue might be similar or related to the following issue(s):
-
-            - #12942: [brief description of how they are related]
-            - #11234: [brief description of how they are related]
-            ...
-
-          _This comment was auto-generated locally using **$GA_ENGINE** on **$GA_MACHINE**_
-          [/comment]
-
-          Remember:
-            - Do not include the comment tags in your actual comment.
-            - Post at most ONE comment combining all findings.
-            - If you didn't find issues that are related enough, post nothing.
-            - You have access only to the 'gh' CLI tool - don't try to use other tools.
-            - If the output from a tool call is too long, try to limit down the search.
-          "
@@ -1,304 +0,0 @@
-# TODO: there have been some issues with the workflow, so disabling for now
-#       https://github.com/ggml-org/llama.cpp/issues/7893
-#
-# Benchmark
-name: Benchmark
-
-on:
-  workflow_dispatch:
-    inputs:
-      gpu-series:
-        description: 'Azure GPU series to run with'
-        required: true
-        type: choice
-        options:
-          - Standard_NC4as_T4_v3
-          - Standard_NC24ads_A100_v4
-          - Standard_NC80adis_H100_v5
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      duration:
-        description: 'Duration of the bench'
-        type: string
-        default: 10m
-
-  push:
-    branches:
-      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
-  pull_request_target:
-    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
-  schedule:
-    -  cron: '04 2 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
-  cancel-in-progress: true
-
-jobs:
-  bench-server-baseline:
-    runs-on: Standard_NC4as_T4_v3
-    env:
-      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
-      N_USERS: 8
-      DURATION: 10m
-
-    strategy:
-      matrix:
-        model: [phi-2]
-        ftype: [q4_0, q8_0, f16]
-        include:
-          - model: phi-2
-            ftype: q4_0
-            pr_comment_enabled: "true"
-
-    if: |
-      inputs.gpu-series == 'Standard_NC4as_T4_v3'
-      || github.event_name == 'pull_request_target'
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Install python env
-        id: pipenv
-        run: |
-          cd tools/server/bench
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-
-      - name: Prometheus
-        id: install_prometheus
-        run: |
-          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
-          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=tools/server/bench/prometheus.yml &
-          while ! nc -z localhost 9090; do
-            sleep 0.1
-          done
-
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install k6 and xk6-sse
-        id: k6_installation
-        run: |
-          cd tools/server/bench
-          go install go.k6.io/xk6/cmd/xk6@latest
-          xk6 build master \
-              --with github.com/phymbert/xk6-sse
-
-      - name: Build
-        id: cmake_build
-        run: |
-          set -eux
-          cmake -B build \
-              -DGGML_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CUBLAS=ON \
-              -DCUDAToolkit_ROOT=/usr/local/cuda \
-              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-              -DCMAKE_CUDA_ARCHITECTURES=75 \
-              -DLLAMA_FATAL_WARNINGS=OFF \
-              -DLLAMA_ALL_WARNINGS=OFF \
-              -DCMAKE_BUILD_TYPE=Release;
-          cmake --build build --config Release -j $(nproc) --target llama-server
-
-      - name: Download the dataset
-        id: download_dataset
-        run: |
-          cd tools/server/bench
-          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-      # Runs bench.py against the built server, then exports results.github.env to later steps.
-      - name: Server bench
-        id: server_bench
-        env:
-          # Event-derived values are passed through the environment instead of being
-          # expanded into the script text, so their content cannot alter the command line.
-          HEAD_REF: ${{ github.head_ref || github.ref_name }}
-          COMMIT_SHA: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          RUN_DURATION: ${{ github.event.inputs.duration || env.DURATION }}
-        run: |
-          set -eux
-
-          cd tools/server/bench
-          source venv/bin/activate
-          python bench.py \
-              --runner-label ${{ env.RUNNER_LABEL }} \
-              --name ${{ github.job }} \
-              --branch "$HEAD_REF" \
-              --commit "$COMMIT_SHA" \
-              --scenario script.js \
-              --duration "$RUN_DURATION" \
-              --hf-repo ggml-org/models \
-              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
-              --model-path-prefix /models \
-              --parallel ${{ env.N_USERS }} \
-              -ngl 33 \
-              --batch-size 2048 \
-              --ubatch-size 256 \
-              --ctx-size 16384 \
-              --n-prompts 1000 \
-              --max-prompt-tokens 1024 \
-              --max-tokens 2048
-
-          # Make the variables listed in results.github.env available to the following steps.
-          cat results.github.env >> "$GITHUB_ENV"
-
-          # Remove dataset as we do not want it in the artefact
-          rm ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          compression-level: 9
-          path: |
-            tools/server/bench/*.jpg
-            tools/server/bench/*.json
-            tools/server/bench/*.log
-
-      - name: Commit status
-        uses: Sibz/github-status-action@v1
-        with:
-          authToken: ${{secrets.GITHUB_TOKEN}}
-          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          description: |
-            ${{ env.BENCH_RESULTS }}
-          state: 'success'
-
-      - name: Upload benchmark images
-        uses: devicons/public-upload-to-imgur@v2.2.2
-        continue-on-error: true # Important as it looks unstable: 503
-        id: imgur_step
-        with:
-          client_id: ${{secrets.IMGUR_CLIENT_ID}}
-          path: |
-            tools/server/bench/prompt_tokens_seconds.jpg
-            tools/server/bench/predicted_tokens_seconds.jpg
-            tools/server/bench/kv_cache_usage_ratio.jpg
-            tools/server/bench/requests_processing.jpg
-
-      - name: Extract mermaid
-        id: set_mermaid
-        run: |
-          set -eux
-
-          cd tools/server/bench
-          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
-          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
-          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
-          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
-          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
-          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
-          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-      - name: Extract image url
-        id: extract_image_url
-        continue-on-error: true
-        run: |
-          set -eux
-
-          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
-          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
-          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
-          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
-
-      - name: Comment PR
-        uses: mshick/add-pr-comment@v2
-        id: comment_pr
-        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
-        with:
-          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          message: |
-            <p align="center">
-
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
-
-            </p>
-
-            <details>
-
-            <summary>Expand details for performance related PR only</summary>
-
-            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
-            - ${{ env.BENCH_GRAPH_XLABEL }}
-
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
-
-            <details>
-
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PROMPT_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PREDICTED_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            </p>
-
-            <details>
-
-            <summary>Details</summary>
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.KV_CACHE_USAGE_RATIO }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.REQUESTS_PROCESSING }}
-            ```
-
-            </details>
-
-            </p>
-            </details>
-            </details>
@@ -1,57 +0,0 @@
-name: CI (3rd-party)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push: # master only; the path filter limits runs to workflow, CMake and C/C++ source changes
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-3rd-party.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-24-llguidance:
-    # NOTE: this expression always yields 'ubuntu-24.04-arm' - a non-empty string is truthy,
-    #       so the '||' fallback is never reached
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          # -y: never wait for a confirmation prompt on the non-interactive runner
-          sudo apt-get install -y build-essential libssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_LLGUIDANCE=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-
@@ -1,148 +0,0 @@
-name: CI (snapdragon)
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - master
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  android-ndk-snapdragon:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Build Llama.CPP for Snapdragon Android
-        id: build_llama_cpp_snapdragon_android
-        run: |
-          cp docs/backend/snapdragon/CMakeUserPresets.json .
-          cmake --preset arm64-android-snapdragon-release -B build
-          cmake --build build
-          cmake --install build --prefix pkg-snapdragon/llama.cpp
-
-      - name: Upload Llama.CPP Snapdragon Android Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-android-arm64-snapdragon
-          path: pkg-snapdragon/llama.cpp
-
-  linux-iot-snapdragon:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Build Llama.CPP for Snapdragon Linux IoT
-        id: build_llama_cpp_snapdragon_linux
-        run: |
-          cp docs/backend/snapdragon/CMakeUserPresets.json .
-          cmake --preset arm64-linux-snapdragon-release -B build-snapdragon -DGGML_OPENCL=ON
-          cmake --build build-snapdragon -j $(nproc)
-          cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
-
-      - name: Upload Llama.CPP Snapdragon Linux IoT Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_snapdragon_linux.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-linux-arm64-snapdragon
-          path: pkg-snapdragon/llama.cpp
-
-  test-snapdragon-qdc:
-    name: Test on QDC Device (${{ matrix.device }})
-    needs: [android-ndk-snapdragon, linux-iot-snapdragon]
-    runs-on: ubuntu-24.04-arm
-    timeout-minutes: 90
-    strategy:
-      fail-fast: false
-      matrix:
-        device: [SM8750, SM8850, QCS9075M]
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-
-      - name: Download build artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: ${{ startsWith(matrix.device, 'QCS') && 'llama-cpp-linux-arm64-snapdragon' || 'llama-cpp-android-arm64-snapdragon' }}
-          path: pkg-snapdragon/llama.cpp
-
-      - name: Set up Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.x'
-          cache: pip
-
-      - name: Install system dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y curl unzip
-
-      - name: Install QDC SDK wheel
-        run: |
-          curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
-          unzip qdc_sdk.zip -d qdc_sdk
-          pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
-
-      # Secrets cannot be referenced directly in `if:` conditions, so the key is passed to this
-      # step as an env var and its presence is published as the `has-qdc-key` step output
-      # (the expression renders as 'true' or 'false'; the key itself is never written out).
-      - name: Check QDC API key
-        id: check_secret
-        env:
-          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
-        run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
-
-      - name: Run QDC tests (${{ matrix.device }})
-        if: steps.check_secret.outputs.has-qdc-key == 'true'
-        run: |
-          python scripts/snapdragon/qdc/run_qdc_jobs.py \
-              --test       all \
-              --pkg-dir    pkg-snapdragon/llama.cpp \
-              --model-url  "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
-              --device     ${{ matrix.device }} \
-              ${{ startsWith(matrix.device, 'QCS') && '--retries 2 --retry-delay 300' || '' }}
-        env:
-          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
-
-      - name: Cleanup
-        if: always()
-        run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip
@@ -1,149 +0,0 @@
-name: CI (android)
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - master
-    paths: # workflow file, CMake files and C/C++ sources anywhere in the tree
-      - '.github/workflows/build-android.yml'
-      - '**/CMakeLists.txt'
-      - '**/*.cmake'
-      - '**/*.h'
-      - '**/*.hpp'
-      - '**/*.c'
-      - '**/*.cpp'
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: # PRs only trigger on the workflow itself or the Android example
-      - '.github/workflows/build-android.yml'
-      - 'examples/llama.android/**'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  default:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Set up JDK
-        uses: actions/setup-java@v5
-        with:
-          java-version: 17
-          distribution: zulu
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Build
-        run: |
-          cd examples/llama.android
-          ./gradlew build --no-daemon
-
-  ndk:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Dependencies
-        run: |
-          apt-get update
-          apt-get install -y build-essential
-
-      - name: Build
-        id: ndk_build
-        run: |
-          cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
-          cmake --build build
-          cmake --install build --prefix pkg-adb/llama.cpp
-
-      - name: Upload Android Build Artifact
-        if: ${{ always() && steps.ndk_build.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-android-arm64-cpu
-          path: pkg-adb/llama.cpp
-
-  arm64:
-    runs-on: ubuntu-latest
-
-    env:
-      NDK_VERSION: "29.0.14206865"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
-      #        for some reason, the ccache does not improve the build time in this case
-      # example:
-      #   cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
-      #   cache on:  https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
-      #
-      #- name: ccache
-      #  uses: ggml-org/ccache-action@v1.2.21
-      #  with:
-      #    key: android-ubuntu-arm64
-      #    evict-old-files: 1d
-      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Set up JDK
-        uses: actions/setup-java@v5
-        with:
-          java-version: 17
-          distribution: temurin
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Install NDK
-        run: |
-          sdkmanager "ndk;${{ env.NDK_VERSION }}"
-          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-            -DANDROID_ABI=arm64-v8a \
-            -DANDROID_PLATFORM=android-28 \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_NATIVE=OFF \
-            -DGGML_CPU_ALL_VARIANTS=ON \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
@@ -1,268 +0,0 @@
-name: CI (apple)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push: # master only; the path filter limits runs to workflow, CMake, C/C++ and Apple source changes
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-apple.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal'
-    ]
-
-  pull_request: # PRs only trigger on the workflow itself or the Metal backend
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-apple.yml',
-      'ggml/src/ggml-metal/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  macos-latest-arm64:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: apple-arm64
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=OFF \
-            -DGGML_METAL_SHADER_DEBUG=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-          leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main -E "test-llama-archs" --verbose --timeout 900
-
-  macos-latest-x64:
-    runs-on: macos-15-intel
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: apple-x64
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build \
-            -DCMAKE_BUILD_RPATH="@loader_path" \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
-          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  macos-latest-ios-xcode:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-
-      - name: Setup Xcode
-        uses: ggml-org/setup-xcode@v1
-        with:
-          xcode-version: latest-stable
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_OPENSSL=OFF \
-            -DLLAMA_BUILD_APP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh
-
-      - name: Upload xcframework artifact
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-xcframework
-          path: build-apple/llama.xcframework/
-          retention-days: 1
-
-      - name: Build Xcode project
-        run: |
-          xcodebuild -downloadPlatform iOS
-          xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
-
-  macos-latest-tvos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # TODO: this likely does not do anything - if yes, remove it
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: apple-tvos
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_APP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macos-latest-visionos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # TODO: this likely does not do anything - if yes, remove it
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: apple-visionos
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_APP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=visionOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macos-latest-swift:
-    runs-on: macos-latest
-    needs: macos-latest-ios-xcode
-
-    strategy:
-      matrix:
-        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # TODO: this likely does not do anything - if yes, remove it
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: apple-swift
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Download xcframework artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: llama-xcframework
-          path: build-apple/llama.xcframework/
-
-      - name: Build llama.cpp with CMake
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_OPENSSL=OFF \
-            -DLLAMA_BUILD_APP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
@@ -1,145 +0,0 @@
-name: Build Actions Cache
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    - cron: '0 * * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ubuntu-24-vulkan-cache:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Get latest Vulkan SDK version
-        id: vulkan_sdk_version
-        run: |
-          # Resolve the version in its own assignment so a failed download aborts the step
-          # (-f: fail on HTTP errors) instead of exporting an empty or garbage value that
-          # would then be used in the cache key.
-          VULKAN_SDK_VERSION="$(curl -fsSL https://vulkan.lunarg.com/sdk/latest/linux.txt)"
-          echo "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" >> "$GITHUB_ENV"
-
-      - name: Setup Cache
-        uses: actions/cache@v5
-        id: cache-sdk
-        with:
-          path: ./vulkan_sdk
-          key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
-
-      - name: Setup Vulkan SDK
-        if: steps.cache-sdk.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-vulkan
-        with:
-          path: ./vulkan_sdk
-          version: ${{ env.VULKAN_SDK_VERSION }}
-
-  #ubuntu-24-spacemit-cache:
-  #  runs-on: ubuntu-24.04
-
-  #  env:
-  #    # Make sure this is in sync with build-linux-cross.yml
-  #    SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
-
-  #  steps:
-  #    - name: Clone
-  #      id: checkout
-  #      uses: actions/checkout@v6
-
-  #    - name: Setup Cache
-  #      uses: actions/cache@v5
-  #      id: cache-toolchain
-  #      with:
-  #        path: ./spacemit_toolchain
-  #        key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
-
-  #    - name: Setup SpacemiT Toolchain
-  #      if: steps.cache-toolchain.outputs.cache-hit != 'true'
-  #      uses: ./.github/actions/linux-setup-spacemit
-  #      with:
-  #        path: ./spacemit_toolchain
-  #        version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
-
-  ubuntu-24-openvino-cache:
-    runs-on: ubuntu-24.04
-
-    env:
-      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Setup Cache
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-  windows-2022-openvino-cache:
-    runs-on: windows-2022
-
-    env:
-      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Setup Cache
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-  windows-2022-rocm-cache:
-    runs-on: windows-2022
-
-    env:
-      # Make sure this is in sync with build.yml
-      HIPSDK_INSTALLER_VERSION: "26.Q1"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Setup Cache
-        uses: actions/cache@v5
-        id: cache-rocm
-        with:
-          path: C:\Program Files\AMD\ROCm
-          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Setup ROCm
-        if: steps.cache-rocm.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-rocm
-        with:
-          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
@@ -1,104 +0,0 @@
-name: CI (cann)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push: # master only; the path filter limits runs to workflow, CMake and C/C++ source changes
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-cann.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request: # PRs only trigger on the workflow itself or the CANN backend
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-cann.yml',
-      'ggml/src/ggml-cann/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
-#       in order to enable it again, we have to provision dedicated runners  to run it
-#  openEuler-latest-cann:
-#    defaults:
-#      run:
-#        shell: bash -el {0}
-#    strategy:
-#      matrix:
-#        arch: [x86, aarch64]
-#        chip_type: ['910b', '310p']
-#        build: ['Release']
-#        use_acl_graph: ['on', 'off']
-#        exclude:
-#          # 310P does not support USE_ACL_GRAPH=on
-#          - chip_type: '310p'
-#            use_acl_graph: 'on'
-#    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-#    steps:
-#      - name: Checkout
-#        uses: actions/checkout@v6
-#        with:
-#          fetch-depth: 0
-#
-#      - name: Free up disk space
-#        uses: ggml-org/free-disk-space@v1.3.1
-#        with:
-#          tool-cache: true
-#
-#      - name: Set container image
-#        id: cann-image
-#        run: |
-#          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
-#          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-#
-#      - name: Pull container image
-#        run: docker pull "${{ steps.cann-image.outputs.image }}"
-#
-#      - name: Build
-#        env:
-#          BUILD_TYPE: ${{ matrix.build }}
-#          SOC_TYPE: ascend${{ matrix.chip_type }}
-#          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
-#        run: |
-#          HOST_UID=$(id -u)
-#          HOST_GID=$(id -g)
-#
-#          docker run --rm \
-#            -v "${PWD}:/workspace" \
-#            -w /workspace \
-#            -e SOC_TYPE=${SOC_TYPE} \
-#            -e BUILD_TYPE=${BUILD_TYPE} \
-#            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
-#            "${{ steps.cann-image.outputs.image }}" \
-#            bash -lc '
-#              set -e
-#              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
-#              yum clean all && rm -rf /var/cache/yum
-#              git config --global --add safe.directory "/workspace"
-#              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-#              cmake -S . -B build \
-#                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-#                  -DGGML_CANN=on \
-#                  -DSOC_TYPE=${SOC_TYPE} \
-#                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
-#              cmake --build build -j $(nproc)
-#
-#              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-#            '
@@ -1,51 +0,0 @@
-name: Build relocatable cmake package
-on:
-  workflow_dispatch:
-  workflow_call:
-
-jobs:
-  linux:
-    runs-on: [self-hosted, Linux, CPU]
-    steps:
-      - uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Build
-        run: |
-          PREFIX="$(pwd)"/inst
-          cmake -S . -B build \
-                -DCMAKE_PREFIX_PATH="$PREFIX" \
-                -DLLAMA_OPENSSL=OFF \
-                -DLLAMA_BUILD_TESTS=OFF \
-                -DLLAMA_BUILD_TOOLS=OFF \
-                -DLLAMA_BUILD_EXAMPLES=OFF \
-                -DLLAMA_BUILD_APP=OFF \
-                -DCMAKE_BUILD_TYPE=Release
-          cmake --build build --config Release
-          cmake --install build --prefix "$PREFIX" --config Release
-
-          export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
-          tclsh <<'EOF'
-          set build(commit)  [string trim [exec git rev-parse --short HEAD]]
-          set build(number)  [string trim [exec git rev-list  --count HEAD]]
-          set build(version) "0.0.$build(number)"
-
-          set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
-          set checks [list "set\\(LLAMA_VERSION     \\s+$build(version)\\)" \
-                           "set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
-                           "set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]
-
-          puts -nonewline "Checking llama-config.cmake version... "
-          foreach check $checks {
-              if {![regexp -expanded -- $check $llamaconfig]} {
-                  puts "\"$check\" failed!"
-                  exit 1
-              }
-          }
-          puts "success."
-          EOF
-
-          cd examples/simple-cmake-pkg
-          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
-          cmake --build build
@@ -1,215 +0,0 @@
-name: CI (cpu)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push: # master only; the path filter limits runs to workflow, CMake and C/C++ source changes
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-cpu.yml',
-      '.github/workflows/build-cmake-pkg.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-    ]
-
-  pull_request: # same path filter as push
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-cpu.yml',
-      '.github/workflows/build-cmake-pkg.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  build-cmake-pkg:
-    uses: ./.github/workflows/build-cmake-pkg.yml
-
-  ubuntu:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-24.04-arm
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: cpu-${{ matrix.os }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build Dependencies
-        id: build_depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            python3 python3-pip python3-dev python3-wheel \
-            libjpeg-dev build-essential libssl-dev \
-            git-lfs
-
-      - name: Toolchain workaround (GCC 14)
-        if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
-        run: |
-          sudo apt-get install -y gcc-14 g++-14
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
-
-      - name: Python Dependencies
-        id: python_depends
-        run: |
-          export PIP_BREAK_SYSTEM_PACKAGES="1"
-          python3 -m pip install --upgrade pip setuptools
-          pip3 install ./gguf-py
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-  windows:
-    runs-on: windows-2025
-
-    env:
-      OPENBLAS_VERSION: 0.3.23
-      SDE_VERSION: 9.33.0-2024-01-07
-      VULKAN_VERSION: 1.4.313.2
-
-    strategy:
-      matrix:
-        include:
-          - build: 'x64-cpu-static'
-            arch: 'x64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
-          - build: 'x64-openblas'
-            arch: 'x64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'x64-vulkan'
-            arch: 'x64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
-          - build: 'arm64'
-            arch: 'arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: cpu-windows-2025-${{ matrix.build }}
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Download OpenBLAS
-        id: get_openblas
-        if: ${{ matrix.build == 'x64-openblas' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
-          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/openblas
-          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
-          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
-
-      - name: Install Vulkan SDK
-        id: get_vulkan
-        if: ${{ matrix.build == 'x64-vulkan' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
-          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
-          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
-          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -S . -B build ${{ matrix.defines }} `
-            -DLLAMA_BUILD_BORINGSSL=ON
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: Add libopenblas.dll
-        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'x64-openblas' }}
-        run: |
-          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
-          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
-
-      - name: Test
-        id: cmake_test
-        if: ${{ matrix.arch == 'x64' }}
-        run: |
-          cd build
-          ctest -L main -C Release --verbose --timeout 900
-
-      # TODO: disabled for now, consider adding tests for all CPU variants instead
-      # - name: Test (Intel SDE)
-      #   id: cmake_test_sde
-      #   if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
-      #   run: |
-      #     curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
-      #     # for some weird reason windows tar doesn't like sde tar.xz
-      #     7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
-      #     7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
-      #     $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
-      #     cd build
-      #     $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
-      #     & $sde -future -- ctest -L main -C Release --verbose --timeout 900
@@ -1,317 +0,0 @@
-name: CI (cross)
-on:
-  # low-importance workflows: run only manually, on pushes to master that touch the paths below, and on the weekly schedule
-  # TODO: for regular runs, provision dedicated self-hosted runners
-  workflow_dispatch:
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-cross.yml',
-      'ggml/src/spacemit/*',
-      'ggml/src/arch/loongarch/*'
-    ]
-  # run once every week
-  schedule:
-    - cron: '0 0 * * 0'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-
-jobs:
-  # ubuntu-24-riscv64-cpu-cross:
-  #   runs-on: ubuntu-24.04
-
-  #   steps:
-  #     - uses: actions/checkout@v6
-  #     - name: Setup Riscv
-  #       run: |
-  #         sudo dpkg --add-architecture riscv64
-
-  #         # Add arch-specific repositories for non-amd64 architectures
-  #         cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-  #         EOF
-
-  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 gcc-14-riscv64-linux-gnu \
-  #                 g++-14-riscv64-linux-gnu
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build -DLLAMA_OPENSSL=OFF \
-  #                        -DCMAKE_BUILD_TYPE=Release \
-  #                        -DGGML_OPENMP=OFF \
-  #                        -DLLAMA_BUILD_EXAMPLES=ON \
-  #                        -DLLAMA_BUILD_TOOLS=ON \
-  #                        -DLLAMA_BUILD_TESTS=OFF \
-  #                        -DCMAKE_SYSTEM_NAME=Linux \
-  #                        -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-  #                        -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-  #                        -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-  #         cmake --build build --config Release -j $(nproc)
-
-  # ubuntu-24-riscv64-vulkan-cross:
-  #   runs-on: ubuntu-24.04
-
-  #   steps:
-  #     - uses: actions/checkout@v6
-  #     - name: Setup Riscv
-  #       run: |
-  #         sudo dpkg --add-architecture riscv64
-
-  #         # Add arch-specific repositories for non-amd64 architectures
-  #         cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-  #         EOF
-
-  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 glslc \
-  #                 gcc-14-riscv64-linux-gnu \
-  #                 g++-14-riscv64-linux-gnu \
-  #                 libvulkan-dev:riscv64
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build -DLLAMA_OPENSSL=OFF \
-  #                        -DCMAKE_BUILD_TYPE=Release \
-  #                        -DGGML_VULKAN=ON \
-  #                        -DGGML_OPENMP=OFF \
-  #                        -DLLAMA_BUILD_EXAMPLES=ON \
-  #                        -DLLAMA_BUILD_TOOLS=ON \
-  #                        -DLLAMA_BUILD_TESTS=OFF \
-  #                        -DCMAKE_SYSTEM_NAME=Linux \
-  #                        -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-  #                        -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-  #                        -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-  #         cmake --build build --config Release -j $(nproc)
-
-  # ubuntu-24-arm64-vulkan-cross:
-  #   runs-on: ubuntu-24.04
-
-  #   steps:
-  #     - uses: actions/checkout@v6
-  #     - name: Setup Arm64
-  #       run: |
-  #         sudo dpkg --add-architecture arm64
-
-  #         # Add arch-specific repositories for non-amd64 architectures
-  #         cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
-  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-  #         deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-  #         EOF
-
-  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 glslc \
-  #                 crossbuild-essential-arm64 \
-  #                 libvulkan-dev:arm64
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build -DLLAMA_OPENSSL=OFF \
-  #                        -DCMAKE_BUILD_TYPE=Release \
-  #                        -DGGML_VULKAN=ON \
-  #                        -DGGML_OPENMP=OFF \
-  #                        -DLLAMA_BUILD_EXAMPLES=ON \
-  #                        -DLLAMA_BUILD_TOOLS=ON \
-  #                        -DLLAMA_BUILD_TESTS=OFF \
-  #                        -DCMAKE_SYSTEM_NAME=Linux \
-  #                        -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
-  #                        -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
-  #                        -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
-  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-  #         cmake --build build --config Release -j $(nproc)
-
-  debian-13-loongarch64-cpu-cross:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} # NOTE(review): a non-empty string literal is truthy, so this always resolves to 'ubuntu-24.04-arm'; the fallback looks unreachable - confirm intent
-    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
-
-    steps:
-      - uses: actions/checkout@v6
-      - name: Setup LoongArch
-        run: |
-          rm -f /etc/apt/sources.list.d/*
-          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
-          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
-          EOF
-          ( echo 'quiet "true";'; \
-            echo 'APT::Get::Assume-Yes "true";'; \
-            echo 'APT::Install-Recommends "false";'; \
-            echo 'Acquire::Check-Valid-Until "false";'; \
-            echo 'Acquire::Retries "5";'; \
-          ) > /etc/apt/apt.conf.d/99snapshot-repos
-
-          apt-get update
-          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
-          dpkg --add-architecture loong64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
-          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
-          EOF
-
-          apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-loongarch64-linux-gnu \
-                  g++-14-loongarch64-linux-gnu
-
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_OPENSSL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  debian-13-loongarch64-vulkan-cross:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} # NOTE(review): a non-empty string literal is truthy, so this always resolves to 'ubuntu-24.04-arm'; the fallback looks unreachable - confirm intent
-    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
-
-    steps:
-      - uses: actions/checkout@v6
-      - name: Setup LoongArch
-        run: |
-          rm -f /etc/apt/sources.list.d/*
-          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
-          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
-          EOF
-          ( echo 'quiet "true";'; \
-            echo 'APT::Get::Assume-Yes "true";'; \
-            echo 'APT::Install-Recommends "false";'; \
-            echo 'Acquire::Check-Valid-Until "false";'; \
-            echo 'Acquire::Retries "5";'; \
-          ) > /etc/apt/apt.conf.d/99snapshot-repos
-
-          apt-get update
-          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
-          dpkg --add-architecture loong64
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
-          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
-          EOF
-
-          apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          apt-get install -y --no-install-recommends \
-                  build-essential \
-                  glslc \
-                  spirv-headers \
-                  gcc-14-loongarch64-linux-gnu \
-                  g++-14-loongarch64-linux-gnu \
-                  libvulkan-dev:loong64
-
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_OPENSSL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_VULKAN=ON \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-24-riscv64-cpu-spacemit-ime-cross:
-    runs-on: ubuntu-24.04
-
-    env:
-      # Make sure this is in sync with build-cache.yml
-      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.2.4"
-
-    steps:
-      - uses: actions/checkout@v6
-
-      #- name: Use SpacemiT Toolchain Cache
-      #  uses: actions/cache@v5
-      #  id: cache-toolchain
-      #  with:
-      #    path: ./spacemit_toolchain
-      #    key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
-
-      - name: Setup SpacemiT Toolchain
-        #if: steps.cache-toolchain.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-spacemit
-        with:
-          path: ./spacemit_toolchain
-          version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
-
-      - name: Build
-        run: |
-          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
-          cmake -B build -DLLAMA_OPENSSL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DGGML_CPU_REPACK=OFF \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DGGML_CPU_RISCV64_SPACEMIT=ON \
-                         -DGGML_RVV=ON \
-                         -DGGML_RV_ZVFH=ON \
-                         -DGGML_RV_ZFH=ON \
-                         -DGGML_RV_ZICBOP=ON \
-                         -DGGML_RV_ZIHINTPAUSE=ON \
-                         -DGGML_RV_ZBA=ON \
-                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
-
-          cmake --build build --config Release -j $(nproc)
@@ -1,134 +0,0 @@
-name: CI (CUDA, ubuntu)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push: # on master: run when this workflow file or any CMake/C/C++/CUDA source listed below changes
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-cuda-ubuntu.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh'
-    ]
-
-  pull_request: # on PRs: run only when this workflow file or files under ggml/src/ggml-cuda change
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-cuda-ubuntu.yml',
-      'ggml/src/ggml-cuda/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  cuda:
-    runs-on: ubuntu-24.04
-    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Install dependencies
-        env:
-          DEBIAN_FRONTEND: noninteractive
-        run: |
-          apt update
-          apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: cuda-ubuntu-24.04-cuda
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with CMake
-        # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
-        run: |
-          cmake -S . -B build -G Ninja \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DCMAKE_CUDA_ARCHITECTURES=89-real \
-            -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-            -DGGML_NATIVE=OFF \
-            -DGGML_CUDA=ON \
-            -DGGML_CUDA_CUB_3DOT2=ON
-          cmake --build build
-
-  hip:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:6.1.2
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: cuda-ubuntu-22.04-hip
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with native CMake HIP support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGGML_HIP_ROCWMMA_FATTN=ON \
-            -DGPU_TARGETS="gfx1030" \
-            -DGGML_HIP=ON
-          cmake --build build --config Release -j $(nproc)
-
-  musa:
-    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          apt-get update
-          apt-get install -y build-essential git cmake libssl-dev
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: cuda-ubuntu-22.04-musa
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with native CMake MUSA support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DGGML_MUSA=ON
-          time cmake --build build --config Release -j $(nproc)
@@ -1,162 +0,0 @@
-name: CI (CUDA, windows)
-
-# TODO: this workflow is only triggered manually because it is very heavy on the CI
-#       when we provision dedicated windows runners, we can enable it for pushes too
-# note: running this workflow manually will populate the ccache for the release builds
-#       this can be used before merging a PR to speed up the release workflow
-on:
-  workflow_dispatch: # allows manual triggering
-
-# note: this will run in queue with the release workflow
-concurrency:
-  group: release
-  queue: max
-
-env:
-  GH_TOKEN: ${{ github.token }}
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  cuda:
-    runs-on: windows-2022
-
-    permissions:
-      actions: write
-
-    strategy:
-      matrix:
-        cuda: ['12.4', '13.3']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
-
-      - name: Install Cuda Toolkit
-        uses: ./.github/actions/windows-setup-cuda
-        with:
-          cuda_version: ${{ matrix.cuda }}
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
-        run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
-          cmake -S . -B build -G "Ninja Multi-Config" ^
-            -DLLAMA_BUILD_SERVER=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON ^
-            -DGGML_NATIVE=OFF ^
-            -DGGML_BACKEND_DL=ON ^
-            -DGGML_CPU_ALL_VARIANTS=ON ^
-            -DGGML_CUDA=ON ^
-            -DGGML_RPC=ON ^
-            -DGGML_CUDA_CUB_3DOT2=ON
-          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
-          cmake --build build --config Release
-
-      - name: ccache-clear
-        uses: ./.github/actions/ccache-clear
-        with:
-          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
-
-  hip:
-    runs-on: windows-2022
-
-    permissions:
-      actions: write
-
-    env:
-      # Make sure this is in sync with build-cache.yml
-      HIPSDK_INSTALLER_VERSION: "26.Q1"
-
-    strategy:
-      matrix:
-        include:
-          # sync with release.yml
-          - name: "radeon"
-            gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Grab rocWMMA package
-        id: grab_rocwmma
-        run: |
-          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
-          7z x rocwmma.deb
-          7z x data.tar
-
-      - name: Use ROCm Installation Cache
-        uses: actions/cache@v5
-        id: cache-rocm
-        with:
-          path: C:\Program Files\AMD\ROCm
-          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Setup ROCm
-        if: steps.cache-rocm.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-rocm
-        with:
-          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          # Find and test ROCm installation
-          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
-          if (-not $clangPath) {
-            Write-Error "ROCm installation not found"
-            exit 1
-          }
-          & $clangPath.FullName --version
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          # TODO: this build does not match the build in release.yml, so we use a different cache key
-          #       ideally, the builds should match, similar to the CUDA build above so that we would be able
-          #       to populate the ccache for the release with manual runs of this workflow
-          #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
-          key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DLLAMA_BUILD_BORINGSSL=ON `
-            -DROCM_DIR="${env:HIP_PATH}" `
-            -DGGML_HIP=ON `
-            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGPU_TARGETS="gfx1100"  `
-            -DGGML_RPC=ON
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: ccache-clear
-        uses: ./.github/actions/ccache-clear
-        with:
-          #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
-          key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
@@ -1,150 +0,0 @@
-name: CI (ibm)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push: # on master: run when this workflow file or any CMake/C/C++ source listed below changes
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-ibm.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request: # on PRs: run only when this workflow file or files under ggml/src/ggml-cpu change
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-ibm.yml',
-      'ggml/src/ggml-cpu/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  ubuntu-24-s390x:
-    runs-on: ubuntu-24.04-s390x
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Build Dependencies
-        id: build_depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            python3 python3-pip python3-dev python3-wheel \
-            libjpeg-dev build-essential libssl-dev \
-            git-lfs
-
-      - name: Toolchain workaround (GCC 14)
-        run: |
-          sudo apt-get install -y gcc-14 g++-14
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
-
-      - name: Python Dependencies
-        id: python_depends
-        run: |
-          export PIP_BREAK_SYSTEM_PACKAGES="1"
-          python3 -m pip install --upgrade pip setuptools
-          pip3 install ./gguf-py
-
-      - name: Swap Endianness
-        id: endianness # converts every models/*.gguf to big-endian in place; "$f" is quoted so paths are passed as a single argument
-        run: |
-          for f in models/*.gguf; do
-            echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py "$f" big
-          done
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Test llama2c (s390x)
-        id: llama2c_test_s390x
-        run: |
-          cd build
-          echo "Fetch llama2c big-endian model"
-          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
-          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-  ubuntu-24-ppc64le:
-    runs-on: ubuntu-24.04-ppc64le
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Build Dependencies
-        id: build_depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            python3 python3-pip python3-dev python3-wheel \
-            libjpeg-dev build-essential libssl-dev \
-            git-lfs
-
-      - name: Toolchain workaround (GCC 14)
-        run: |
-          sudo apt-get install -y gcc-14 g++-14
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
-
-      - name: Python Dependencies
-        id: python_depends
-        run: |
-          export PIP_BREAK_SYSTEM_PACKAGES="1"
-          python3 -m pip install --upgrade pip setuptools
-          pip3 install ./gguf-py
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
@@ -1,70 +0,0 @@
-name: CI (msys)
-
-on:
-  # only manual triggers due to low-importance of the workflows
-  # TODO: for regular runs, provision dedicated self-hosted runners
-  workflow_dispatch:
-  # run once every week
-  schedule:
-    - cron: '0 0 * * 0'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  windows-msys2:
-    runs-on: windows-2025
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { sys: UCRT64,  env: ucrt-x86_64,  compiler: gcc,   build: Release }
-          - { sys: CLANG64, env: clang-x86_64, compiler: clang, build: Release }
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-
-      #- name: ccache
-      #  uses: ggml-org/ccache-action@v1.2.16
-      #  with:
-      #    key: msys-windows-2025-x64
-      #    variant: ccache
-      #    evict-old-files: 1d
-      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Setup ${{ matrix.sys }}
-        uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2
-        with:
-          update: true
-          msystem: ${{matrix.sys}}
-          install: >-
-            mingw-w64-${{matrix.env}}-${{matrix.compiler}}
-            mingw-w64-${{matrix.env}}-cmake
-            mingw-w64-${{matrix.env}}-openblas
-
-      - name: Build using CMake
-        shell: msys2 {0}
-        run: |
-            cmake -B build
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
-      - name: Clean after building using CMake
-        shell: msys2 {0}
-        run: |
-            rm -rf build
-
-      - name: Build using CMake w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
@@ -1,82 +0,0 @@
-name: CI (opencl)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-opencl.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',  # any CMake module; '**/.cmake' only matched files literally named ".cmake"
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cl'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-opencl.yml',
-      'ggml/src/ggml-opencl/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  windows-2025-opencl-adreno:
-    runs-on: windows-2025
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: opencl-windows-2025-x64
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        run: |
-          git clone https://github.com/KhronosGroup/OpenCL-Headers
-          cd OpenCL-Headers
-          cmake -B build `
-            -DBUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build --target install
-          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-          cd OpenCL-ICD-Loader
-          cmake -B build-arm64-release `
-            -A arm64 `
-            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build-arm64-release --target install --config release
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
@@ -1,169 +0,0 @@
-name: CI (openvino)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',  # any CMake module; '**/.cmake' only matched files literally named ".cmake"
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      'ggml/src/ggml-openvino/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-24-openvino:
-    runs-on: [self-hosted, Linux, Intel, OpenVINO]
-
-    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
-          sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
-
-      - name: Setup OpenVINO Toolkit
-        uses: ./.github/actions/linux-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenVINO dependencies
-        run: |
-          cd ./openvino_toolkit
-          chmod +x ./install_dependencies/install_openvino_dependencies.sh
-          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source ./openvino_toolkit/setupvars.sh
-          cmake -B build/ReleaseOV -G Ninja \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON
-          time cmake --build build/ReleaseOV --config Release --parallel
-
-      - name: Test (CPU)
-        id: cmake_test_cpu
-        # TODO: fix and re-enable the `test-llama-archs` test below
-        run: |
-          cd ${{ github.workspace }}
-          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
-
-      - name: Test (GPU)
-        id: cmake_test_gpu
-        # TODO: fix and re-enable the `test-llama-archs` test below
-        run: |
-          cd ${{ github.workspace }}
-          export GGML_OPENVINO_DEVICE=GPU
-          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 3000
-
-  openvino-windows-2022:
-    runs-on: windows-2022
-
-    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: openvino-windows-2022
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Setup Cache
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenCL using vcpkg
-        shell: powershell
-        run: |
-          git clone https://github.com/microsoft/vcpkg C:\vcpkg
-          C:\vcpkg\bootstrap-vcpkg.bat
-          C:\vcpkg\vcpkg install opencl
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          REM Find extracted OpenVINO folder dynamically
-          for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
-
-          if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
-              echo ERROR: OpenVINOConfig.cmake not found
-              exit /b 1
-          )
-
-          call "%OPENVINO_ROOT%\setupvars.bat"
-
-          cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
-            -A x64 ^
-            -DCMAKE_BUILD_TYPE=Release ^
-            -DGGML_OPENVINO=ON ^
-            -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
-
-          cmake --build build\ReleaseOV --config Release -- /m
-
-      - name: Test (CPU)
-        id: cmake_test_cpu
-        shell: cmd
-        # TODO: fix and re-enable the `test-llama-archs` test below
-        run: |
-          REM Find extracted OpenVINO folder dynamically
-          for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
-          call "%OPENVINO_ROOT%\setupvars.bat"
-
-          cd build
-          ctest --test-dir ReleaseOV -L main -E "test-llama-archs" -C Release --verbose --timeout 3000
@@ -1,188 +0,0 @@
-name: CI (riscv)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-riscv.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',  # any CMake module; '**/.cmake' only matched files literally named ".cmake"
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-riscv.yml',
-      'ggml/src/ggml-cpu/arch/riscv/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-cpu-riscv64-native:
-    runs-on: ubuntu-24.04-riscv
-
-    steps:
-      - name: Install dependencies
-        run: |
-          # Install necessary packages
-          sudo apt-get update
-          sudo apt-get install -y libssl-dev
-
-          # Set gcc-14 and g++-14 as the default compilers
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
-          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
-
-          git lfs install
-
-      - name: Check environment
-        run: |
-          uname -a
-          gcc --version
-          g++ --version
-          ldd --version
-          cmake --version
-          rustc --version
-          env
-          echo "nproc=$(nproc)"
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organization
-      #- name: ccache
-      #  uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
-      #  with:
-      #    key: riscv-ubuntu-native
-      #    evict-old-files: 1d
-      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=ON \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DGGML_RPC=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-  ubuntu-riscv64-native-sanitizer:
-    runs-on: ubuntu-24.04-riscv
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]
-
-    steps:
-      - name: Install dependencies
-        run: |
-          # Set gcc-14 and g++-14 as the default compilers
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
-          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
-
-          git lfs install
-
-      - name: GCC version check
-        run: |
-          gcc --version
-          g++ --version
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organization
-      #- name: ccache
-      #  uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
-      #  with:
-      #    key: riscv-ubuntu-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
-      #    evict-old-files: 1d
-      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_OPENSSL=OFF \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=ON \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_OPENSSL=OFF \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
@@ -1,66 +0,0 @@
-name: CI (rpc)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-rpc.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',  # any CMake module; '**/.cmake' only matched files literally named ".cmake"
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-rpc.yml',
-      'ggml/src/ggml-rpc/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-24-rpc:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libssl-dev ninja-build  # -y: never wait for a confirmation prompt in CI
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
@@ -1,86 +0,0 @@
-name: CI (sanitize)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-sanitize.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',  # any CMake module; '**/.cmake' only matched files literally named ".cmake"
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  ctest:
-    runs-on: [self-hosted, X64, CPU, Linux]
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # with UNDEFINED sanitizer, we have to build in Debug to avoid GCC 13 false-positive warnings
-      - name: Build (undefined)
-        id: cmake_build_undefined
-        if: ${{ matrix.sanitizer == 'UNDEFINED' }}
-        run: |
-          cmake -B build \
-            -DCMAKE_BUILD_TYPE=Debug \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
-
-          cmake --build build --config Debug -j $(nproc)
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer == 'ADDRESS' }}
-        run: |
-          cmake -B build \
-            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
-
-          cmake --build build --config RelWithDebInfo -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_OPENMP=OFF
-
-          cmake --build build --config RelWithDebInfo -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        # skip run in Debug - very slow
-        if: ${{ matrix.sanitizer != 'UNDEFINED' }}
-        run: |
-          cd build
-          ctest -L main -E tokenizer --verbose --timeout 900
@@ -1,387 +0,0 @@
-name: CI (self-hosted)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-self-hosted.yml',  # this workflow's own file, matching the pull_request filter
-      '**/CMakeLists.txt',
-      '**/*.cmake',  # any CMake module; '**/.cmake' only matched files literally named ".cmake"
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal',
-      '**/*.comp',
-      '**/*.glsl',
-      '**/*.wgsl'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-self-hosted.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',  # any CMake module; '**/.cmake' only matched files literally named ".cmake"
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal',
-      '**/*.comp',
-      '**/*.glsl',
-      '**/*.wgsl'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  gpu-cuda:
-    runs-on: [self-hosted, Linux, NVIDIA]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  gpu-vulkan-nvidia-cm:
-    runs-on: [self-hosted, Linux, NVIDIA]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  gpu-vulkan-nvidia-cm2:
-    runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  gpu-webgpu-nvidia:
-    runs-on: [self-hosted, Linux, NVIDIA, X64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |  # download a pinned Dawn release tarball and unpack it into ./dawn
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
-          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          curl -fL -o artifact.tar.gz \
-            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_WEBGPU=1 \
-          GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-          GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
-            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  # TODO: provision AMX-compatible machine
-  #cpu-amx:
-  #  runs-on: [self-hosted, Linux, CPU, AMX]
-
-  #  steps:
-  #    - name: Clone
-  #      id: checkout
-  #      uses: actions/checkout@v6
-
-  #    - name: Test
-  #      id: ggml-ci
-  #      run: |
-  #        bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  # TODO: provision AMD GPU machine
-  # amd-vulkan:
-  #   runs-on: [self-hosted, Linux, AMD]
-
-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v6
-
-  #     - name: Test
-  #       id: ggml-ci
-  #       run: |
-  #         vulkaninfo --summary
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  # TODO: provision AMD GPU machine
-  # amd-rocm:
-  #   runs-on: [self-hosted, Linux, AMD]
-
-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v6
-
-  #     - name: Test
-  #       id: ggml-ci
-  #       run: |
-  #         amd-smi static
-  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  gpu-metal:
-    runs-on: [self-hosted, macOS, ARM64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  gpu-webgpu-apple:
-    runs-on: [self-hosted, macOS, ARM64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |  # download a pinned Dawn release tarball and unpack it into ./dawn
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
-          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          curl -fL -o artifact.tar.gz \
-            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  gpu-vulkan-apple:
-    runs-on: [self-hosted, macOS, ARM64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  gpu-vulkan-intel-linux:
-    runs-on: [self-hosted, Linux, Intel]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          persist-credentials: false
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  gpu-vulkan-intel-windows:
-    runs-on: [self-hosted, Windows, X64, Intel]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        shell: C:\msys64\usr\bin\bash.exe --noprofile --norc -eo pipefail "{0}"
-        env:
-          MSYSTEM: UCRT64
-          CHERE_INVOKING: 1
-          PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
-        run: |
-          vulkaninfo --summary
-          # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
-          # a valid python environment for testing
-          LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
-
-  gpu-openvino-low-perf:
-    runs-on: [self-hosted, Linux, Intel, OpenVINO]
-
-    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.2"
-      OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Setup OpenVINO Toolkit
-        uses: ./.github/actions/linux-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenVINO dependencies
-        run: |
-          cd ./openvino_toolkit
-          chmod +x ./install_dependencies/install_openvino_dependencies.sh
-          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          source ./openvino_toolkit/setupvars.sh
-          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  cpu-x64-high-perf:
-    runs-on: [self-hosted, Linux, X64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  cpu-arm64-high-perf-graviton4:
-    runs-on: ah-ubuntu_22_04-c8g_8x
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          set -euxo pipefail
-          sudo apt-get update
-          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
-          apt-get install -y \
-          build-essential \
-          python3-venv \
-          gpg \
-          wget \
-          time \
-          git-lfs
-
-          git lfs install
-
-          # install the latest cmake
-          sudo install -d /usr/share/keyrings
-          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
-            | gpg --dearmor \
-            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
-          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
-            | sudo tee /etc/apt/sources.list.d/kitware.list
-          sudo apt-get update
-          sudo apt-get install -y cmake
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  cpu-arm64-graviton4-kleidiai:
-    runs-on: ah-ubuntu_22_04-c8g_8x
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          set -euxo pipefail
-          sudo apt-get update
-          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
-          apt-get install -y \
-          build-essential \
-          python3-venv \
-          gpg \
-          wget \
-          time \
-          git-lfs
-
-          git lfs install
-
-          # install the latest cmake
-          sudo install -d /usr/share/keyrings
-          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
-            | gpg --dearmor \
-            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
-          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
-            | sudo tee /etc/apt/sources.list.d/kitware.list
-          sudo apt-get update
-          sudo apt-get install -y cmake
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_KLEIDIAI=1 \
-          GG_BUILD_EXTRA_TESTS_0=1 \
-          bash ./ci/run.sh ./tmp/results ./tmp/mnt
@@ -1,141 +0,0 @@
-name: CI (sycl)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  # Pushes to master trigger the workflow only when a listed build input changes.
-  # '**/*.cmake' replaces the former '**/.cmake', which only matched files whose
-  # whole name is ".cmake" and therefore ignored CMake module/script changes.
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  # Pull requests trigger the workflow only for changes to this workflow file or
-  # to the SYCL backend sources.
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      'ggml/src/ggml-sycl/**'
-    ]
-
-# Runs sharing a group cancel each other: the group key uses github.ref when
-# github.head_ref is set and the unique run id otherwise.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Environment variables exported to every job in this workflow.
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  # Builds with the Intel icx/icpx compilers and GGML_SYCL=ON, once per matrix
-  # entry; matrix.fp16 is forwarded to -DGGML_SYCL_F16.
-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        # Attaches the fp16 value to the matching build entry.
-        # NOTE(review): unquoted ON/OFF can be read as booleans by YAML 1.1
-        # parsers - confirm the value reaches cmake as intended.
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
-
-    runs-on: ubuntu-24.04
-
-    # LEVEL_ZERO_* are interpolated into the download URLs of the
-    # "Install Level Zero SDK" step below.
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    # A failure of this job does not fail the workflow run.
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # Downloads the offline installer script and runs it silently with the
-      # EULA accepted.
-      - name: Download & Install oneAPI
-        shell: bash
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      # Fetches the level-zero and level-zero-devel .deb packages from the
-      # GitHub release and installs both with apt.
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      # One cache key per matrix build; `save` is true only for pushes to master.
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: sycl-ubuntu-24-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # Sources the oneAPI environment script before configuring so that icx/icpx
-      # can be used as the C/C++ compilers.
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-  # Installs oneAPI and the Level Zero SDK on windows-2022, then runs the SYCL
-  # build batch script.
-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    # Every `run` step uses bash unless the step sets its own shell.
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # Passes the installer URL and the component list to install-oneapi.bat.
-      - name: Download & Install oneAPI
-        shell: bash
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      # Downloads and unpacks the SDK to C:/level-zero-sdk and exports that path
-      # as LEVEL_ZERO_V1_SDK_PATH for the following steps.
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      # `save` is true only for pushes to master.
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: sycl-windows-latest
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # TODO: add SSL support; win-build-sycl.bat will also need to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
@@ -1,50 +0,0 @@
-name: CI (virtgpu)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  # Pushes to master trigger the workflow only when a listed build input changes.
-  # '**/*.cmake' replaces the former '**/.cmake', which only matched files whose
-  # whole name is ".cmake" and therefore ignored CMake module/script changes.
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-virtgpu.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  # Pull requests trigger the workflow only for changes to this workflow file or
-  # to the virtgpu backend sources.
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-virtgpu.yml',
-      'ggml/src/ggml-virtgpu/**'
-    ]
-
-# Runs sharing a group cancel each other: the group key uses github.ref when
-# github.head_ref is set and the unique run id otherwise.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  # Configures and builds with GGML_VIRTGPU and GGML_VIRTGPU_BACKEND enabled.
-  ubuntu-24-virtgpu:
-    # The previous expression `'ubuntu-24.04-arm' || 'ubuntu-24.04'` always
-    # evaluated to its first (non-empty) operand, so the label is written directly.
-    runs-on: ubuntu-24.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # Toolchain plus the libdrm, pkg-config and OpenSSL development packages.
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_VIRTGPU=ON \
-            -DGGML_VIRTGPU_BACKEND=ON
-          cmake --build build --config Release -j $(nproc)
@@ -1,134 +0,0 @@
-name: CI (vulkan)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  # Pushes to master trigger the workflow only when a listed build input changes.
-  # '**/*.cmake' replaces the former '**/.cmake', which only matched files whose
-  # whole name is ".cmake" and therefore ignored CMake module/script changes.
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-vulkan.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.comp',
-      '**/*.glsl'
-    ]
-
-  # Pull requests trigger the workflow only for changes to this workflow file or
-  # to the Vulkan backend sources.
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-vulkan.yml',
-      'ggml/src/ggml-vulkan/**'
-    ]
-
-# Runs sharing a group cancel each other: the group key uses github.ref when
-# github.head_ref is set and the unique run id otherwise.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Environment variables exported to every job in this workflow.
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  # Build-only job: configures with GGML_VULKAN=ON using gcc-14/g++-14 and Ninja
-  # on the ubuntu-24.04-arm runner. No tests are run here.
-  ubuntu-arm64:
-    runs-on: ubuntu-24.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # Installs the toolchain and Vulkan build packages, then exports CC/CXX via
-      # GITHUB_ENV so the later steps use gcc-14/g++-14.
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
-
-      # `save` is true only for pushes to master.
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: vulkan-ubuntu-24.04-arm-new
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # Configure and build are separate steps.
-      - name: Configure
-        id: cmake_configure
-        run: |
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_VULKAN=ON
-
-      - name: Build
-        id: cmake_build
-        run: |
-          time cmake --build build -j $(nproc)
-
-  # Builds with GGML_VULKAN=ON against a cached Vulkan SDK and runs the "main"
-  # ctest label, excluding test-backend-ops.
-  ubuntu-llvmpipe:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # Mesa Vulkan drivers come from the kisak-mesa PPA.
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo add-apt-repository -y ppa:kisak/kisak-mesa
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
-
-      # The version string feeds both the SDK cache key and the setup action.
-      - name: Get latest Vulkan SDK version
-        id: vulkan_sdk_version
-        run: |
-          # curl -f fails on HTTP errors, and a failed assignment aborts the step,
-          # so an empty or error-page version is never exported to GITHUB_ENV.
-          VULKAN_SDK_VERSION="$(curl -fsSL https://vulkan.lunarg.com/sdk/latest/linux.txt)"
-          test -n "${VULKAN_SDK_VERSION}"
-          echo "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" >> "$GITHUB_ENV"
-
-      - name: Use Vulkan SDK Cache
-        uses: actions/cache@v5
-        id: cache-sdk
-        with:
-          path: ./vulkan_sdk
-          key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
-
-      # Only runs when the cache step above did not restore ./vulkan_sdk.
-      - name: Setup Vulkan SDK
-        if: steps.cache-sdk.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-vulkan
-        with:
-          path: ./vulkan_sdk
-          version: ${{ env.VULKAN_SDK_VERSION }}
-
-      # `save` is true only for pushes to master.
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: vulkan-ubuntu-24.04-llvmpipe
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source ./vulkan_sdk/setup-env.sh
-          cmake -B build \
-            -DGGML_VULKAN=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          export GGML_VK_VISIBLE_DEVICES=0
-          export GGML_VK_DISABLE_F16=1
-          export GGML_VK_DISABLE_COOPMAT=1
-          # This is using llvmpipe and runs slower than other backends
-          # test-backend-ops is too slow on llvmpipe, skip it
-          ctest -L main -E test-backend-ops --verbose --timeout 900
@@ -1,196 +0,0 @@
-name: CI (webgpu)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  # Pushes to master trigger the workflow only when a listed build input changes.
-  # '**/*.cmake' replaces the former '**/.cmake', which only matched files whose
-  # whole name is ".cmake" and therefore ignored CMake module/script changes.
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-webgpu.yml',
-      '**/CMakeLists.txt',
-      '**/*.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.wgsl'
-    ]
-
-  # Pull requests trigger the workflow only for changes to this workflow file or
-  # to the WebGPU backend sources.
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-webgpu.yml',
-      'ggml/src/ggml-webgpu/**'
-    ]
-
-# Runs sharing a group cancel each other: the group key uses github.ref when
-# github.head_ref is set and the unique run id otherwise.
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Environment variables exported to every job in this workflow.
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  # Checks the formatting of the WebGPU backend sources with clang-format 22;
-  # the job does not build anything.
-  format:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-
-      # Registers the apt.llvm.org signing key and the noble llvm-toolchain-22
-      # repository, then installs clang-format-22 from it.
-      - name: Install clang-format 22
-        run: |
-          wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key |
-            sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc > /dev/null
-          sudo add-apt-repository -y \
-            "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-22 main"
-          sudo apt-get update
-          sudo apt-get install -y clang-format-22
-
-      # --dry-run together with --Werror makes formatting differences an error
-      # instead of rewriting the files.
-      - name: Check formatting
-        run: |
-          find ggml/src/ggml-webgpu \
-            -type f \( -name '*.cpp' -o -name '*.hpp' -o -name '*.h' \) \
-            -print0 |
-            xargs -0 clang-format-22 --dry-run --Werror
-
-  # Builds with GGML_WEBGPU=ON (Metal and BLAS off) against a prebuilt Dawn
-  # release and runs the "main" ctest label.
-  macos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # `save` is true only for pushes to master.
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: webgpu-macos-latest
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # Downloads the pinned Dawn release asset and unpacks it into ./dawn.
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
-          # Build the URL once from the variables above so the logged and the
-          # downloaded location cannot drift apart.
-          DAWN_URL="https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          echo "Fetching release asset from ${DAWN_URL}"
-          # -f: fail on an HTTP error instead of saving the error page as the archive
-          curl -fL -o artifact.tar.gz "${DAWN_URL}"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      # CMAKE_PREFIX_PATH points cmake at the unpacked ./dawn directory.
-      - name: Build
-        id: cmake_build
-        run: |
-          export CMAKE_PREFIX_PATH=dawn
-          cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
-          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  # Builds with GGML_WEBGPU=ON against a prebuilt Dawn release and runs the
-  # "main" ctest label, excluding test-backend-ops.
-  ubuntu:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # `save` is true only for pushes to master.
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: webgpu-ubuntu-24.04
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # Mesa Vulkan drivers come from the kisak-mesa PPA.
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo add-apt-repository -y ppa:kisak/kisak-mesa
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers \
-            libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
-
-      # Installs extra X11/Mesa development packages, then downloads the pinned
-      # Dawn release asset and unpacks it into ./dawn.
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
-          # Build the URL once from the variables above so the logged and the
-          # downloaded location cannot drift apart.
-          DAWN_URL="https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          echo "Fetching release asset from ${DAWN_URL}"
-          # -f: fail on an HTTP error instead of saving the error page as the archive
-          curl -fL -o artifact.tar.gz "${DAWN_URL}"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      # Dawn_DIR points cmake at the package config inside the unpacked ./dawn.
-      - name: Build
-        id: cmake_build
-        run: |
-          export Dawn_DIR=dawn/lib64/cmake/Dawn
-          cmake -B build \
-            -DGGML_WEBGPU=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          # This is using llvmpipe and runs slower than other backends
-          # test-backend-ops is too slow on llvmpipe, skip it
-          ctest -L main -E test-backend-ops --verbose --timeout 900
-
-  # Build-only job: builds the test-backend-ops target through Emscripten
-  # (emcmake) with GGML_WEBGPU=ON. No tests are executed.
-  ubuntu-wasm:
-    runs-on: ubuntu-24.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # `save` is true only for pushes to master.
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: webgpu-ubuntu-24.04-arm-wasm
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # Clones emsdk into ./emsdk and installs/activates its "latest" release.
-      - name: Install Emscripten
-        run: |
-          git clone https://github.com/emscripten-core/emsdk.git
-          cd emsdk
-          ./emsdk install latest
-          ./emsdk activate latest
-
-      # Downloads the emdawnwebgpu package for the pinned Dawn tag and unzips it
-      # in the workspace; the build step references it as emdawnwebgpu_pkg.
-      - name: Fetch emdawnwebgpu
-        run: |
-          DAWN_TAG="v20260317.182325"
-          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
-          echo "Downloading ${EMDAWN_PKG}"
-          curl -L -o emdawn.zip \
-            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
-          unzip emdawn.zip
-
-      # Sources the emsdk environment before configuring with emcmake.
-      - name: Build WASM WebGPU
-        run: |
-          source emsdk/emsdk_env.sh
-          emcmake cmake -B build-wasm \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_WEBGPU=ON \
-            -DLLAMA_OPENSSL=OFF \
-            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
-
-          time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
@@ -0,0 +1,632 @@
+name: CI
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      # Read by the packaging/upload steps and the release job as
+      # github.event.inputs.create_release.
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
+  # Pushes to master trigger the workflow when workflow files or build inputs change.
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+  # Pull requests use the same build-input filters but not the workflow files.
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+
+# Environment variables exported to every job in this workflow.
+env:
+  # head_ref when it is set, otherwise the name of the ref that triggered the run;
+  # used in artifact file names and the release tag.
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+  GGML_NLOOP: 3
+  GGML_NITER: 1
+  GGML_N_THREADS: 1
+
+jobs:
+  # Builds with `make` using gcc-8 on the ubuntu-20.04 runner.
+  ubuntu-focal-make:
+    runs-on: ubuntu-20.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          # -y keeps the install non-interactive even where apt is not
+          # preconfigured to assume "yes".
+          sudo apt-get install -y build-essential gcc-8
+
+      # CC is set only for this make invocation.
+      - name: Build
+        id: make_build
+        run: |
+          CC=gcc-8 make
+
+  # Default CMake configure/build in ./build followed by the ctest suite.
+  ubuntu-latest-cmake:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          # -y keeps the install non-interactive even where apt is not
+          # preconfigured to assume "yes".
+          sudo apt-get install -y build-essential
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose --timeout 900
+
+  # Builds and tests every sanitizer x build_type combination; the matrix values
+  # are forwarded as -DLLAMA_SANITIZE_<sanitizer>=ON and -DCMAKE_BUILD_TYPE.
+  ubuntu-latest-cmake-sanitizer:
+    runs-on: ubuntu-latest
+
+    # A failure of this job does not fail the workflow run.
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [Debug, Release]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          # -y keeps the install non-interactive even where apt is not
+          # preconfigured to assume "yes".
+          sudo apt-get install -y build-essential
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake --build . --config ${{ matrix.build_type }}
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose --timeout 900
+
+  # Builds with -DLLAMA_MPI=ON once per MPI package in the matrix and runs ctest.
+  ubuntu-latest-cmake-mpi:
+    runs-on: ubuntu-latest
+
+    # A failure of this job does not fail the workflow run.
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        mpi_library: [mpich, libopenmpi-dev]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          # -y keeps the install non-interactive even where apt is not
+          # preconfigured to assume "yes".
+          sudo apt-get install -y build-essential ${{ matrix.mpi_library }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_MPI=ON ..
+          cmake --build . --config Release
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          # Same per-test timeout as the other ctest jobs in this workflow.
+          ctest --verbose --timeout 900
+
+  # Builds with plain `make` on the macos-latest runner.
+  macOS-latest-make:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      # Best effort: a failing `brew update` does not fail the job.
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: make_build
+        run: |
+          make
+
+  # CMake build with AVX2 and FMA turned off, followed by the ctest suite.
+  macOS-latest-cmake:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      # Best effort: a failing `brew update` does not fail the job.
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      # `sysctl -a` prints the runner's system settings to the log before building.
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
+          cmake --build . --config Release
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose --timeout 900
+
+  # Builds one Windows binary set per matrix entry, tests it where possible, and
+  # packs/uploads a zip on pushes to master or when create_release is set.
+  windows-latest-cmake:
+    runs-on: windows-latest
+
+    # Versions interpolated into the download URLs and license file names below.
+    env:
+      OPENBLAS_VERSION: 0.3.23
+      OPENCL_VERSION: 2023.04.17
+      CLBLAST_VERSION: 1.6.0
+
+    strategy:
+      matrix:
+        # `defines` is pasted verbatim onto the cmake command line in the Build
+        # step; `build` names the variant and gates the variant-specific steps.
+        include:
+          - build: 'noavx'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
+          - build: 'avx2'
+            defines: '-DLLAMA_BUILD_SERVER=ON'
+          - build: 'avx'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
+          - build: 'avx512'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'clblast'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+          - build: 'openblas'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      # clblast only: unpack the OpenCL SDK release into $RUNNER_TEMP/opencl.
+      - name: Download OpenCL SDK
+        id: get_opencl
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
+          mkdir $env:RUNNER_TEMP/opencl
+          tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
+
+      # clblast only: unpack CLBlast into $RUNNER_TEMP/clblast, fetch its license,
+      # and rewrite the vcpkg OpenCL path inside its *.cmake files to the SDK
+      # location unpacked above.
+      - name: Download CLBlast
+        id: get_clblast
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
+          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
+          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
+          foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
+            $txt = Get-Content -Path $f -Raw
+            $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+          }
+
+      # openblas only: unpack OpenBLAS, fetch its license, and generate
+      # openblas.lib from libopenblas.def with the MSVC lib.exe found via vswhere.
+      - name: Download OpenBLAS
+        id: get_openblas
+        if: ${{ matrix.build == 'openblas' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
+          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
+          mkdir $env:RUNNER_TEMP/openblas
+          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
+          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. ${{ matrix.defines }}
+          cmake --build . --config Release
+
+      # clblast only: ship the runtime DLL and license next to the binaries.
+      - name: Add clblast.dll
+        id: add_clblast_dll
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
+          cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
+
+      # openblas only: ship the DLL (renamed to openblas.dll) and the license.
+      - name: Add libopenblas.dll
+        id: add_libopenblas_dll
+        if: ${{ matrix.build == 'openblas' }}
+        run: |
+          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
+          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
+
+      # avx512 only: compiles and runs a tiny cpuid(7) probe; when it exits 0,
+      # HAS_AVX512F=1 is exported for the Test step's condition. A failure of
+      # this step does not fail the job.
+      - name: Check AVX512F support
+        id: check_avx512f
+        if: ${{ matrix.build == 'avx512' }}
+        continue-on-error: true
+        run: |
+          cd build
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $cl =  $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
+          echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
+          & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
+          .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
+
+      # Skipped for clblast, and for avx512 unless the probe above set HAS_AVX512F.
+      - name: Test
+        id: cmake_test
+        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
+        run: |
+          cd build
+          ctest -C Release --verbose --timeout 900
+
+      # The remaining steps run only on pushes to master or when the
+      # create_release input is 'true'.
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      # Adds the project LICENSE as llama.cpp.txt and zips the Release directory.
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
+
+  # Builds with -DLLAMA_CUBLAS=ON once per CUDA version in the matrix and packs
+  # the matching CUDA runtime DLLs into a separate zip.
+  windows-latest-cmake-cublas:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.1.0', '11.7.1']
+        build: ['cublas']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      # Its CUDA_PATH output is used by the "Copy and pack Cuda runtime" steps.
+      - uses: Jimver/cuda-toolkit@v0.2.10
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda }}
+          # TODO(green-sky): _dev seems to fail, and non-dev packages are not enough
+          #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
+          cmake --build . --config Release
+
+      # The hash, pack and upload steps run only on pushes to master or when the
+      # create_release input is 'true'.
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+
+      # Exactly one of the two steps below runs per matrix entry; they differ
+      # only in the DLL file names (and the 11.7.1 variant lists the bin
+      # directory first). Neither is gated on the release condition.
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '12.1.0' }}
+        # TODO(green-sky): paths are cuda 12 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '11.7.1' }}
+        # TODO(green-sky): paths are cuda 11 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Upload Cuda runtime
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+
+  # Creates a release tagged <BRANCH_NAME>-<short commit hash> and attaches every
+  # .zip found in ./artifact. Runs only on pushes to master or when the
+  # create_release input is 'true', after all jobs listed in `needs` finished.
+  release:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+
+    runs-on: ubuntu-latest
+
+    needs:
+      - ubuntu-focal-make
+      - ubuntu-latest-cmake
+      - macOS-latest-make
+      - macOS-latest-cmake
+      - windows-latest-cmake
+      - windows-latest-cmake-cublas
+
+    steps:
+      - name: Download artifacts
+        id: download-artifact
+        uses: actions/download-artifact@v3
+
+      - name: Get commit hash
+        id: commit
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Create release
+        id: create_release
+        uses: anzz1/action-create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+
+      # Uploads each .zip in ./artifact as an asset of the release created above.
+      - name: Upload release
+        id: upload_release
+        uses: actions/github-script@v3
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            const path = require('path');
+            const fs = require('fs');
+            const release_id = '${{ steps.create_release.outputs.id }}';
+            // readdirSync/readFileSync are synchronous, so they are not awaited;
+            // only the upload call returns a promise.
+            for (const file of fs.readdirSync('./artifact')) {
+              if (path.extname(file) === '.zip') {
+                console.log('uploadReleaseAsset', file);
+                await github.repos.uploadReleaseAsset({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  release_id: release_id,
+                  name: file,
+                  data: fs.readFileSync(`./artifact/${file}`)
+                });
+              }
+            }
+
+#  ubuntu-latest-gcc:
+#    runs-on: ubuntu-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Debug, Release]
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v1
+#
+#      - name: Dependencies
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#          sudo apt-get install cmake
+#
+#      - name: Configure
+#        run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+#
+#      - name: Build
+#        run: |
+#          make
+#
+#  ubuntu-latest-clang:
+#    runs-on: ubuntu-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Debug, Release]
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v1
+#
+#      - name: Dependencies
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#          sudo apt-get install cmake
+#
+#      - name: Configure
+#        run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
+#
+#      - name: Build
+#        run: |
+#          make
+#
+#  ubuntu-latest-gcc-sanitized:
+#    runs-on: ubuntu-latest
+#
+#    strategy:
+#      matrix:
+#        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v1
+#
+#      - name: Dependencies
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#          sudo apt-get install cmake
+#
+#      - name: Configure
+#        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
+#
+#      - name: Build
+#        run: |
+#          make
+#
+#  windows:
+#    runs-on: windows-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Release]
+#        arch: [Win32, x64]
+#        include:
+#          - arch: Win32
+#            s2arc: x86
+#          - arch: x64
+#            s2arc: x64
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v1
+#
+#      - name: Add msbuild to PATH
+#        uses: microsoft/setup-msbuild@v1
+#
+#      - name: Configure
+#        run: >
+#          cmake -S . -B ./build -A ${{ matrix.arch }}
+#          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+#
+#      - name: Build
+#        run: |
+#          cd ./build
+#          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+#
+#      - name: Upload binaries
+#        uses: actions/upload-artifact@v1
+#        with:
+#          name: llama-bin-${{ matrix.arch }}
+#          path: build/bin/${{ matrix.build }}
+#
+#  windows-blas:
+#    runs-on: windows-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Release]
+#        arch: [Win32, x64]
+#        blas: [ON]
+#        include:
+#          - arch: Win32
+#            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
+#            s2arc: x86
+#          - arch: x64
+#            obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
+#            s2arc: x64
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v1
+#
+#      - name: Add msbuild to PATH
+#        uses: microsoft/setup-msbuild@v1
+#
+#      - name: Fetch OpenBLAS
+#        if: matrix.blas == 'ON'
+#        run: |
+#          C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
+#          7z x blas.zip -oblas -y
+#          copy blas/include/cblas.h .
+#          copy blas/include/openblas_config.h .
+#          echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
+#
+#      - name: Configure
+#        run: >
+#          cmake -S . -B ./build -A ${{ matrix.arch }}
+#          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+#          -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
+#          -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
+#
+#      - name: Build
+#        run: |
+#          cd ./build
+#          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+#
+#      - name: Copy libopenblas.dll
+#        if: matrix.blas == 'ON'
+#        run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
+#
+#      - name: Upload binaries
+#        if: matrix.blas == 'ON'
+#        uses: actions/upload-artifact@v1
+#        with:
+#          name: llama-blas-bin-${{ matrix.arch }}
+#          path: build/bin/${{ matrix.build }}
+#
+#  emscripten:
+#    runs-on: ubuntu-latest
+#
+#    strategy:
+#      matrix:
+#        build: [Release]
+#
+#    steps:
+#      - name: Clone
+#        uses: actions/checkout@v1
+#
+#      - name: Dependencies
+#        run: |
+#          wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
+#          tar -xvf master.tar.gz
+#          emsdk-master/emsdk update
+#          emsdk-master/emsdk install latest
+#          emsdk-master/emsdk activate latest
+#
+#      - name: Configure
+#        run: echo "tmp"
+#
+#      - name: Build
+#        run: |
+#          pushd emsdk-master
+#          source ./emsdk_env.sh
+#          popd
+#          emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+#          make
@@ -1,52 +0,0 @@
-# Verifies that the vendored sources are in sync: runs scripts/sync_vendor.py
-# and fails if it leaves any modified or untracked file in the working tree.
-name: Check vendor
-
-on:
-  workflow_dispatch:  # allows manual triggering
-  push:
-    branches:
-      - master
-    paths:
-      - 'vendor/**'
-      - 'scripts/sync_vendor.py'
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths:
-      - 'vendor/**'
-      - 'scripts/sync_vendor.py'
-
-jobs:
-  check-vendor:
-    runs-on: [self-hosted, fast]
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.x'
-
-      - name: Run vendor sync
-        run: |
-          set -euo pipefail
-          python3 scripts/sync_vendor.py
-
-      - name: Check for changes
-        run: |
-          set -euo pipefail
-          # detect modified or untracked files; if git itself fails, the step
-          # aborts (set -e) instead of reporting the tree as up-to-date
-          changed=$(git status --porcelain --untracked-files=all)
-          if [ -n "$changed" ]; then
-            echo "Vendor sync modified files:"
-            # porcelain v1 lines are "XY <path>": strip the 3-character status
-            # prefix so that paths containing spaces are printed in full
-            printf '%s\n' "$changed" | cut -c4-
-            echo "Failing because vendor files mismatch. Please update scripts/sync_vendor.py"
-            exit 1
-          else
-            echo "Vendor files are up-to-date."
-          fi
@@ -1,28 +0,0 @@
-# Scheduled workflow (cron: 00:42 every day) that runs actions/stale on issues.
-name: Close inactive issues
-on:
-  schedule:
-    - cron: '42 0 * * *'
-
-# Fine-grained permissions
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  issues: write
-
-jobs:
-  close-issues:
-    runs-on: ubuntu-slim
-    permissions:
-      issues: write
-      pull-requests: write
-    steps:
-      - uses: actions/stale@v10
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-          operations-per-run: 10000
-          # Issues: labelled 'stale' after 30 days, closed 14 days after that,
-          # unless they carry one of the exempt labels.
-          days-before-issue-stale: 30
-          days-before-issue-close: 14
-          stale-issue-label: 'stale'
-          close-issue-message: 'This issue was closed because it has been inactive for 14 days since being marked as stale.'
-          exempt-issue-labels: 'refactoring,help wanted,good first issue,research 🔬,bug,roadmap,security'
-          # Pull requests: both thresholds are -1 (presumably disables PR
-          # handling - confirm against the actions/stale documentation).
-          days-before-pr-stale: -1
-          days-before-pr-close: -1
@@ -1,51 +0,0 @@
-# Style checks run on pushes and pull requests targeting master (and on manual
-# dispatch).
-name: Code Style Checker
-
-on:
-  workflow_dispatch:  # allows manual triggering
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  # Cross-checks the `case LLM_ARCH_*: return new llama_model_*(` pairs found in
-  # src/llama-model.cpp: arch suffix must be UPPER_SNAKE, class suffix must be
-  # lower_snake, both must match, and src/models/<class-suffix>.cpp must exist.
-  model-naming:
-    runs-on: [self-hosted, fast]
-    steps:
-      - uses: actions/checkout@v6
-      - name: Check model naming conventions
-        run: |
-          python3 - << 'EOF'
-          import os
-          import re
-          import sys
-
-          ARCH_PREFIX  = "LLM_ARCH_"
-          CLASS_PREFIX = "llama_model_"
-
-          # collect (arch enum, model class) pairs from the factory switch
-          with open("src/llama-model.cpp") as src:
-              mappings = re.findall(
-                  r'case\s+(LLM_ARCH_\w+)\s*:\s*\n\s+return new (llama_model_\w+)\s*\(',
-                  src.read())
-
-          problems = []
-          for arch, cls in mappings:
-              arch_suffix  = arch[len(ARCH_PREFIX):]
-              class_suffix = cls[len(CLASS_PREFIX):]
-              model_file   = class_suffix.replace("_", "-") + ".cpp"
-
-              # the arch check is independent of the class checks below
-              if re.fullmatch(r'[A-Z][A-Z0-9_]*', arch_suffix) is None:
-                  problems.append(f"{arch}: suffix not upper snake case, example: LLM_ARCH_MY_MODEL")
-
-              # class checks: only the first failing one is reported
-              if re.fullmatch(r'[a-z][a-z0-9_]*', class_suffix) is None:
-                  problems.append(f"{arch}: class suffix not lower snake case, example: llama_model_my_model")
-              elif arch_suffix.lower() != class_suffix:
-                  problems.append(f"{arch}: arch/class name mismatch, expected class 'llama_model_{arch_suffix.lower()}' but got '{cls}'")
-              elif not os.path.isfile(f"src/models/{model_file}"):
-                  problems.append(f"{arch}: expects model file name to be src/models/{model_file}, but not found")
-
-          if problems:
-              print('\n'.join(f"  - {p}" for p in problems))
-              sys.exit(1)
-          print(f"OK: {len(mappings)} mappings validated.")
-          EOF
@@ -1,56 +0,0 @@
-name: "Copilot Setup Steps"
-
-# Automatically run the setup steps when they are changed to allow for easy validation, and
-# allow manual testing through the repository's "Actions" tab
-on:
-  workflow_dispatch:
-  push:
-    paths:
-      - .github/workflows/copilot-setup-steps.yml
-  pull_request:
-    paths:
-      - .github/workflows/copilot-setup-steps.yml
-
-jobs:
-  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
-  copilot-setup-steps:
-    runs-on: ubuntu-latest
-
-    # Set the permissions to the lowest permissions possible needed for your steps.
-    # Copilot will be given its own token for its operations.
-    permissions:
-      # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
-      contents: read
-
-    # You can define any steps you want, and they will run before the agent starts.
-    # If you do not check out your code, Copilot will do this for you.
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: copilot-setup-steps
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          # -y: the job is non-interactive, so apt-get must not stop at the
-          # "Do you want to continue?" prompt when extra packages are pulled in
-          sudo apt-get install -y build-essential libssl-dev
-          # Install git-clang-format script for formatting only changed code
-          wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
-          sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format
-          sudo chmod +x /usr/local/bin/git-clang-format
-
-      - name: Set up Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.11'
-
-      # Creates a virtual environment in .venv and installs the Python
-      # requirements of the repository and of the server tests into it.
-      - name: Install Python dependencies
-        run: |
-          python3 -m venv .venv
-          source .venv/bin/activate
-          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
@@ -10,506 +10,56 @@
 name: Publish Docker image

 on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      skip_s390x:
-        description: "Skip the s390x build target (useful for fast test runs that do not need the IBM Z runner)"
-        type: boolean
-        default: false
-  schedule:
-    # Rebuild daily rather than on every push because it is expensive
-    - cron: '12 4 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  packages: write
+  pull_request:
+  push:
+    branches:
+      - master

 jobs:
-  create_tag:
-    name: Create and push git tag
-    runs-on: ubuntu-slim
-    permissions:
-      contents: write
-    outputs:
-      source_tag: ${{ steps.srctag.outputs.name }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Determine source tag name
-        id: srctag
-        uses: ./.github/actions/get-tag-name
-        env:
-          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
-      - name: Create and push git tag
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          git tag ${{ steps.srctag.outputs.name }} || exit 0
-          git push origin ${{ steps.srctag.outputs.name }} || exit 0
-
-  prepare_matrices:
-    name: Prepare Docker matrices
-    runs-on: ubuntu-24.04
-    outputs:
-      build_matrix: ${{ steps.matrices.outputs.build_matrix }}
-      merge_matrix: ${{ steps.matrices.outputs.merge_matrix }}
-
-    steps:
-      - name: Generate build and merge matrices
-        id: matrices
-        shell: bash
-        env:
-          SKIP_S390X: ${{ inputs.skip_s390x || 'false' }}
-        run: |
-          set -euo pipefail
-
-          # Keep all build targets in one place and derive merge targets from it.
-          cat > build-matrix.json <<'JSON'
-          [
-            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
-            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
-            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
-            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "rocm", "dockerfile": ".devops/rocm.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "openvino", "dockerfile": ".devops/openvino.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }
-          ]
-          JSON
-
-          if [ "${SKIP_S390X}" = "true" ]; then
-            jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp
-            mv build-matrix.json.tmp build-matrix.json
-          fi
-
-          BUILD_MATRIX="$(jq -c . build-matrix.json)"
-          MERGE_MATRIX="$(jq -c '
-            reduce .[] as $entry ({}; .[$entry.tag] |= (
-              . // {
-                tag: $entry.tag,
-                arches: [],
-                full: false,
-                light: false,
-                server: false
-              }
-              | .full = (.full or ($entry.full // false))
-              | .light = (.light or ($entry.light // false))
-              | .server = (.server or ($entry.server // false))
-              | .arches += [($entry.platforms | sub("^linux/"; ""))]
-            ))
-            # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
-            | if (has("cpu") and (((.cpu.arches // []) | index("s390x")) != null)) then
-                . + {
-                  s390x: {
-                    tag: "s390x",
-                    arches: ["s390x"],
-                    full: .cpu.full,
-                    light: .cpu.light,
-                    server: .cpu.server
-                  }
-                }
-              else
-                .
-              end
-            | [.[] | .arches = (.arches | unique | sort | join(" "))]
-          ' build-matrix.json)"
-
-          echo "build_matrix=$BUILD_MATRIX" >> "$GITHUB_OUTPUT"
-          echo "merge_matrix=$MERGE_MATRIX" >> "$GITHUB_OUTPUT"
-
  push_to_registry:
-    name: Push Docker image to Docker Registry
-    needs: [prepare_matrices, create_tag]
+    name: Push Docker image to Docker Hub
+    if: github.event.pull_request.draft == false

-    runs-on: ${{ matrix.config.runs_on }}
+    runs-on: ubuntu-latest
+    env:
+      COMMIT_SHA: ${{ github.sha }}
    strategy:
-      fail-fast: false
      matrix:
-        config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
+        config:
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
    steps:
      - name: Check out the repo
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ needs.create_tag.outputs.source_tag }}
+        uses: actions/checkout@v3

      - name: Set up QEMU
-        if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
-        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
-        with:
-          image: tonistiigi/binfmt:qemu-v10.2.1
+        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
+        uses: docker/setup-buildx-action@v2

-      - name: Log in to Docker Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      - name: Determine image metadata
-        id: meta
-        shell: bash
-        run: |
-          set -euo pipefail
-
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
-          PREFIX="${IMAGE_REPO}:"
-          PLATFORM="${{ matrix.config.platforms }}"
-          ARCH_SUFFIX="${PLATFORM#linux/}"
-
-          # list all tags possible
-          tags="${{ matrix.config.tag }}"
-          for tag in $tags; do
-              if [[ "$tag" == "cpu" ]]; then
-                  TYPE=""
-              else
-                  TYPE="-$tag"
-              fi
-              CACHETAG="${PREFIX}buildcache${TYPE}-${ARCH_SUFFIX}"
-          done
-
-          SAFE_TAGS="$(echo "$tags" | tr ' ' '_')"
-
-          echo "image_repo=$IMAGE_REPO" >> $GITHUB_OUTPUT
-          echo "arch_suffix=$ARCH_SUFFIX" >> $GITHUB_OUTPUT
-          echo "cache_output_tag=$CACHETAG" >> $GITHUB_OUTPUT
-          echo "digest_artifact_suffix=${SAFE_TAGS}-${ARCH_SUFFIX}" >> $GITHUB_OUTPUT
-          echo "cache_output_tag=$CACHETAG"  # print out for debugging
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
-      - name: Get build date
-        id: build_date
-        run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
-
-      - name: Free Disk Space (Ubuntu)
-        if: ${{ matrix.config.free_disk_space == true }}
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: false
-
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
-      - name: Build and push Full Docker image by digest
-        id: build_full
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
-        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
+      - name: Build and push Docker image (versioned)
+        if: github.event_name == 'push'
+        uses: docker/build-push-action@v4
        with:
          context: .
-          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
+          push: true
+          platforms: linux/amd64,linux/arm64
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}
-          target: full
-          provenance: false
-          build-args: |
-            BUILD_DATE=${{ steps.build_date.outputs.date }}
-            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
-            APP_REVISION=${{ steps.checkout.outputs.commit }}
-            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
-            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          annotations: |
-            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
-            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
-            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
-            manifest:org.opencontainers.image.title=llama.cpp
-            manifest:org.opencontainers.image.description=LLM inference in C/C++
-            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
-            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
-          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
-          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max

-      - name: Build and push Light Docker image by digest
-        id: build_light
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
-        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
        with:
          context: .
-          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
+          push: ${{ github.event_name == 'push' }}
+          platforms: linux/amd64,linux/arm64
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
-          target: light
-          provenance: false
-          build-args: |
-            BUILD_DATE=${{ steps.build_date.outputs.date }}
-            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
-            APP_REVISION=${{ steps.checkout.outputs.commit }}
-            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
-            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          annotations: |
-            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
-            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
-            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
-            manifest:org.opencontainers.image.title=llama.cpp
-            manifest:org.opencontainers.image.description=LLM inference in C/C++
-            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
-            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
-          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
-          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
-
-      - name: Build and push Server Docker image by digest
-        id: build_server
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
-        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
-        with:
-          context: .
-          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
-          file: ${{ matrix.config.dockerfile }}
-          target: server
-          provenance: false
-          build-args: |
-            BUILD_DATE=${{ steps.build_date.outputs.date }}
-            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
-            APP_REVISION=${{ steps.checkout.outputs.commit }}
-            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
-            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          annotations: |
-            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
-            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
-            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
-            manifest:org.opencontainers.image.title=llama.cpp
-            manifest:org.opencontainers.image.description=LLM inference in C/C++
-            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
-            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
-          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
-          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
-
-      - name: Export digest metadata
-        shell: bash
-        run: |
-            set -euo pipefail
-
-            TAGS="${{ matrix.config.tag }}"
-            ARCH_SUFFIX="${{ steps.meta.outputs.arch_suffix }}"
-            DIGEST_FILE="/tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv"
-            mkdir -p /tmp/digests
-
-            add_digest_rows() {
-                local image_type="$1"
-                local digest="$2"
-
-                if [[ -z "$digest" ]]; then
-                  echo "Missing digest for image_type=${image_type}" >&2
-                  exit 1
-                fi
-
-                for tag in $TAGS; do
-                    printf '%s\t%s\t%s\t%s\n' "$tag" "$ARCH_SUFFIX" "$image_type" "$digest" >> "$DIGEST_FILE"
-                done
-            }
-
-            if [[ "${{ matrix.config.full }}" == "true" ]]; then
-                add_digest_rows "full" "${{ steps.build_full.outputs.digest }}"
-            fi
-
-            if [[ "${{ matrix.config.light }}" == "true" ]]; then
-                add_digest_rows "light" "${{ steps.build_light.outputs.digest }}"
-            fi
-
-            if [[ "${{ matrix.config.server }}" == "true" ]]; then
-                add_digest_rows "server" "${{ steps.build_server.outputs.digest }}"
-            fi
-
-      - name: Upload digest metadata
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
-        with:
-          name: digests-${{ steps.meta.outputs.digest_artifact_suffix }}
-          path: /tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv
-          if-no-files-found: error
-
-  merge_arch_tags:
-    name: Create shared tags from digests
-    needs: [prepare_matrices, push_to_registry, create_tag]
-    runs-on: ubuntu-24.04
-    strategy:
-      fail-fast: false
-      matrix:
-        config: ${{ fromJSON(needs.prepare_matrices.outputs.merge_matrix) }}
-
-    steps:
-      - name: Check out the repo
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Get build date
-        id: build_date
-        run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
-
-      - name: Download digest metadata
-        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
-        with:
-          pattern: digests-*
-          path: /tmp/digests
-          merge-multiple: true
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
-
-      - name: Log in to Docker Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Create tags from digests
-        shell: bash
-        run: |
-          set -euo pipefail
-
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
-          PREFIX="${IMAGE_REPO}:"
-          SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
-          BUILD_DATE="${{ steps.build_date.outputs.date }}"
-          COMMIT_SHA="${{ steps.checkout.outputs.commit }}"
-          TAGS="${{ matrix.config.tag }}"
-          ARCHES="${{ matrix.config.arches }}"
-          DIGEST_GLOB="/tmp/digests/*.tsv"
-
-          if ! ls ${DIGEST_GLOB} >/dev/null 2>&1; then
-              echo "No digest metadata found in /tmp/digests" >&2
-              exit 1
-          fi
-
-          if [[ -z "$SRC_TAG" ]]; then
-              echo "Missing source tag from create_tag" >&2
-              exit 1
-          fi
-
-          find_digest() {
-              local tag_name="$1"
-              local arch="$2"
-              local image_type="$3"
-              local digest
-
-              digest="$(awk -F '\t' -v t="$tag_name" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
-
-              # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
-              if [[ -z "$digest" && "$tag_name" == "s390x" && "$arch" == "s390x" ]]; then
-                digest="$(awk -F '\t' -v t="cpu" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
-              fi
-
-              if [[ -z "$digest" ]]; then
-                echo "Missing digest for tag=${tag_name} arch=${arch} image_type=${image_type}" >&2
-                exit 1
-              fi
-
-              echo "$digest"
-          }
-
-          create_manifest_tags() {
-              local image_type="$1"
-              local tag_name="$2"
-              local suffix="$3"
-
-              local merged_tag="${PREFIX}${image_type}${suffix}"
-              local merged_versioned_tag="${merged_tag}-${SRC_TAG}"
-
-              local refs=()
-
-              for arch in $ARCHES; do
-                  local digest
-                  digest="$(find_digest "$tag_name" "$arch" "$image_type")"
-                  refs+=("${IMAGE_REPO}@${digest}")
-              done
-
-              local annotations=(
-                  --annotation "index:org.opencontainers.image.created=${BUILD_DATE}"
-                  --annotation "index:org.opencontainers.image.version=${SRC_TAG}"
-                  --annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}"
-                  --annotation "index:org.opencontainers.image.title=llama.cpp"
-                  --annotation "index:org.opencontainers.image.description=LLM inference in C/C++"
-                  --annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}"
-                  --annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}"
-              )
-
-              echo "Creating ${merged_tag} from ${refs[*]}"
-              docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}"
-
-              echo "Creating ${merged_versioned_tag} from ${refs[*]}"
-              docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}"
-          }
-
-          for tag in $TAGS; do
-              if [[ "$tag" == "cpu" ]]; then
-                  TYPE=""
-              else
-                  TYPE="-$tag"
-              fi
-
-              if [[ "${{ matrix.config.full }}" == "true" ]]; then
-                  create_manifest_tags "full" "$tag" "$TYPE"
-              fi
-
-              if [[ "${{ matrix.config.light }}" == "true" ]]; then
-                  create_manifest_tags "light" "$tag" "$TYPE"
-              fi
-
-              if [[ "${{ matrix.config.server }}" == "true" ]]; then
-                  create_manifest_tags "server" "$tag" "$TYPE"
-              fi
-          done
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
@@ -1,7 +1,6 @@
 name: EditorConfig Checker

 on:
-  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
@@ -9,16 +8,10 @@ on:
    branches:
      - master

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
  editorconfig:
-    runs-on: [self-hosted, fast]
+    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
-      - uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
-        with:
-          version: v3.0.3
+      - uses: actions/checkout@v3
+      - uses: editorconfig-checker/action-editorconfig-checker@main
      - run: editorconfig-checker
@@ -1,44 +0,0 @@
-# This workflow will upload a Python Package using Twine when a GGUF release is created
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
-
-# See `gguf-py/README.md` for how to make a release.
-
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
-name: Upload Python Package
-
-on:
-  workflow_dispatch:
-  push:
-    # Pattern matched against refs/tags
-    tags:
-      - 'gguf-v*'           # Push events to every version tag
-
-
-jobs:
-  deploy:
-
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v6
-    - name: Set up Python
-      uses: actions/setup-python@v6
-      with:
-        python-version: '3.11'
-        pip-install: poetry==2.4.0
-    - name: Install dependencies
-      run: |
-        cd gguf-py
-        poetry install
-
-    - name: Build package
-      run: cd gguf-py && poetry build
-    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
-      with:
-        password: ${{ secrets.PYPI_API_TOKEN }}
-        packages-dir: gguf-py/dist
@@ -1,82 +0,0 @@
-name: HIP quality check
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/hip-quality-check.yml',
-      '**/*.cu',
-      '**/*.cuh',
-      'scripts/hip/gcn-cdna-vgpr-check.py'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/hip-quality-check.yml',
-      '**/*.cu',
-      '**/*.cuh',
-      'scripts/hip/gcn-cdna-vgpr-check.py'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-22-hip-quality-check:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:7.2.1
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev python3
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: hip-quality-check-ubuntu-22.04
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with Werror
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGPU_TARGETS=gfx942 \
-            -DGGML_HIP=ON \
-            -DGGML_HIP_EXPORT_METRICS=Off \
-            -DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \
-            -DCMAKE_BUILD_TYPE=Release
-          cd build
-          make -j $(nproc)
-
-      - name: Check for major VGPR spills
-        id: vgpr_check
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGPU_TARGETS=gfx908 \
-            -DGGML_HIP=ON \
-            -DGGML_HIP_EXPORT_METRICS=On \
-            -DCMAKE_HIP_FLAGS="" \
-            -DCMAKE_BUILD_TYPE=Release
-          cd build
-          make -j $(nproc) 2>&1 | tee metrics.log | grep -v 'Rpass-analysis=kernel-resource-usage\|remark:\|^$'
-          python3 ../scripts/hip/gcn-cdna-vgpr-check.py metrics.log
@@ -1,17 +0,0 @@
-name: "Pull Request Labeler"
-on:
- pull_request_target
-
-jobs:
-  labeler:
-    permissions:
-      contents: read
-      pull-requests: write
-    runs-on: ubuntu-slim
-    steps:
-    - uses: actions/checkout@v6
-      with:
-        repository: "ggml-org/llama.cpp"
-    - uses: actions/labeler@v6
-      with:
-        configuration-path: '.github/labeler.yml'
@@ -1,45 +0,0 @@
-name: Check Pre-Tokenizer Hashes
-
-on:
-    push:
-        paths:
-            - 'conversion/base.py'
-            - 'convert_hf_to_gguf_update.py'
-    pull_request:
-        paths:
-            - 'conversion/base.py'
-            - 'convert_hf_to_gguf_update.py'
-
-jobs:
-    pre-tokenizer-hashes:
-        runs-on: [self-hosted, fast]
-
-        steps:
-        - name: Checkout repository
-          uses: actions/checkout@v6
-
-        - name: Set up Python
-          uses: actions/setup-python@v6
-          with:
-              python-version: '3.11'
-
-        - name: Install Python dependencies
-          run: |
-              python3 -m venv .venv
-              .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
-
-        - name: Update pre-tokenizer hashes
-          run: |
-              cp conversion/base.py /tmp
-              .venv/bin/python convert_hf_to_gguf_update.py --check-missing
-
-        - name: Check if committed pre-tokenizer hashes matches generated version
-          run: |
-              if ! diff -q conversion/base.py /tmp/base.py; then
-                  echo "Model pre-tokenizer hashes (in conversion/base.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
-                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated conversion/base.py along with your changes"
-                  echo "Differences found:"
-                  diff conversion/base.py /tmp/base.py || true
-                  exit 1
-              fi
-              echo "Model pre-tokenizer hashes are up to date."
@@ -1,33 +0,0 @@
-name: Python check requirements.txt
-
-on:
-  push:
-    paths:
-      - '.github/workflows/python-check-requirements.yml'
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - '**/requirements*.txt'
-  pull_request:
-    paths:
-      - '.github/workflows/python-check-requirements.yml'
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - '**/requirements*.txt'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  python-check-requirements:
-    runs-on: [self-hosted, CPU, fast]
-    name: check-requirements
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v6
-      - name: Set up Python environment
-        uses: actions/setup-python@v6
-        with:
-          python-version: "3.11"
-      - name: Run check-requirements.sh script
-        run:  bash scripts/check-requirements.sh
@@ -1,36 +0,0 @@
-name: flake8 Lint
-
-on:
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/python-lint.yml',
-      '**/*.py'
-    ]
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/python-lint.yml',
-      '**/*.py'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  flake8-lint:
-    runs-on: [self-hosted, fast]
-    name: Lint
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v6
-      - name: Set up Python environment
-        uses: actions/setup-python@v6
-        with:
-          python-version: "3.11"
-      - name: flake8 Lint
-        uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2
-        with:
-            plugins: "flake8-no-print"
@@ -1,43 +0,0 @@
-name: Python Type-Check
-
-on:
-  push:
-    paths:
-      - '.github/workflows/python-type-check.yml'
-      - 'ty.toml'
-      - '**.py'
-      - '**/requirements*.txt'
-      # - 'pyrightconfig.json'
-  pull_request:
-    paths:
-      - '.github/workflows/python-type-check.yml'
-      - 'ty.toml'
-      - '**.py'
-      - '**/requirements*.txt'
-      # - 'pyrightconfig.json'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  python-type-check:
-    runs-on: [self-hosted, fast]
-    name: python type-check
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v6
-      - name: Set up Python environment
-        uses: actions/setup-python@v6
-        with:
-          python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.35
-      # - name: Type-check with Pyright
-      #   uses: jakebailey/pyright-action@v2
-      #   with:
-      #     version: 1.1.382
-      #     level: warning
-      #     warnings: true
-      - name: Type-check with ty
-        run: |
-            ty check --output-format=github
@@ -1,112 +0,0 @@
-name: Server (sanitize)
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/server-sanitize.yml',
-      '**/CMakeLists.txt',
-      '**/Makefile',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      'tools/server/**.*'
-    ]
-
-env:
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-  LLAMA_ARG_LOG_VERBOSITY: 10
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  server:
-    runs-on: [self-hosted, CPU, Linux, llama-server]
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
-        build_type: [RelWithDebInfo]
-      fail-fast: false
-
-    steps:
-      #- name: Dependencies
-      #  id: depends
-      #  run: |
-      #    sudo apt-get update
-      #    sudo apt-get -y install \
-      #      build-essential \
-      #      xxd \
-      #      git \
-      #      cmake \
-      #      curl \
-      #      wget \
-      #      language-pack-en \
-      #      libssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_SCHED_NO_REALLOC=ON \
-            -DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-            -DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-            -DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
-            -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-            -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-            -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.11'
-          pip-install: -r tools/server/tests/requirements.txt
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-        run: |
-          cd tools/server/tests
-          export ${{ matrix.extra_args }}
-          pytest -v -x -m "not slow"
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd tools/server/tests
-          export ${{ matrix.extra_args }}
-          SLOW_TESTS=1 pytest -v -x
@@ -1,202 +0,0 @@
-name: Server (self-hosted)
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/server-self-hosted.yml',
-      '**/CMakeLists.txt',
-      '**/Makefile',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.swift',
-      '**/*.m',
-      'tools/server/**.*'
-    ]
-
-env:
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-  LLAMA_ARG_LOG_VERBOSITY: 10
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  server-metal:
-    runs-on: [self-hosted, llama-server, macOS, ARM64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        run: |
-          cd tools/server/tests
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-
-      - name: Tests (GPUx1)
-        id: server_integration_tests
-        if: ${{ !github.event.pull_request }}
-        run: |
-          cd tools/server/tests
-          source venv/bin/activate
-          pytest -v -x -m "not slow"
-
-      - name: Tests (GPUx1, backend-sampling)
-        id: server_integration_tests_backend_sampling
-        if: ${{ !github.event.pull_request }}
-        run: |
-          cd tools/server/tests
-          source venv/bin/activate
-          export LLAMA_ARG_BACKEND_SAMPLING=1
-          pytest -v -x -m "not slow"
-
-      - name: Tests (GPUx2)
-        id: server_integration_tests_gpu2
-        if: ${{ !github.event.pull_request }}
-        run: |
-          cd tools/server/tests
-          source venv/bin/activate
-          export GGML_METAL_DEVICES=2
-          pytest -v -x -m "not slow"
-
-      - name: Tests (GPUx2, backend-sampling)
-        id: server_integration_tests_gpu2_backend_sampling
-        if: ${{ !github.event.pull_request }}
-        run: |
-          cd tools/server/tests
-          source venv/bin/activate
-          export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
-          pytest -v -x -m "not slow"
-
-  server-cuda:
-    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config Release -j $(nproc) --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        run: |
-          cd tools/server/tests
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-
-      - name: Tests (GPUx1)
-        id: server_integration_tests
-        if: ${{ !github.event.pull_request }}
-        run: |
-          cd tools/server/tests
-          source venv/bin/activate
-          pytest -v -x -m "not slow"
-
-      - name: Tests (GPUx1, backend-sampling)
-        id: server_integration_tests_backend_sampling
-        if: ${{ !github.event.pull_request }}
-        run: |
-          cd tools/server/tests
-          source venv/bin/activate
-          export LLAMA_ARG_BACKEND_SAMPLING=1
-          pytest -v -x -m "not slow"
-
-  server-kleidiai:
-    runs-on: ah-ubuntu_22_04-c8g_8x
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          set -euxo pipefail
-          sudo apt-get update
-          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
-          apt-get install -y \
-           build-essential \
-           libssl-dev \
-           python3-venv \
-           gpg \
-           wget \
-           time \
-           git-lfs
-
-          git lfs install
-
-          # install the latest cmake
-          sudo install -d /usr/share/keyrings
-          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
-           | gpg --dearmor \
-           | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
-          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
-           | sudo tee /etc/apt/sources.list.d/kitware.list
-          sudo apt-get update
-          sudo apt-get install -y cmake
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
-          cmake --build build --config Release -j $(nproc) --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        run: |
-          cd tools/server/tests
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ !github.event.pull_request }}
-        run: |
-          cd tools/server/tests
-          source venv/bin/activate
-          pytest -v -x -m "not slow"
@@ -1,181 +0,0 @@
-name: Server
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/server.yml',
-      '**/CMakeLists.txt',
-      '**/Makefile',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.swift',
-      '**/*.m',
-      'tools/server/**.*'
-    ]
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/server.yml',
-      '**/CMakeLists.txt',
-      '**/Makefile',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.swift',
-      '**/*.m',
-      'tools/server/**.*'
-    ]
-
-env:
-  LLAMA_ARG_LOG_COLORS: 1
-  LLAMA_ARG_LOG_PREFIX: 1
-  LLAMA_ARG_LOG_TIMESTAMPS: 1
-  LLAMA_ARG_LOG_VERBOSITY: 10
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ubuntu:
-    runs-on: ubuntu-24.04-arm
-
-    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: server-ubuntu-24.04-arm
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config Release -j $(nproc) --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.11'
-          pip-install: -r tools/server/tests/requirements.txt
-
-      - name: Tests
-        id: server_integration_tests
-        run: |
-          cd tools/server/tests
-          pytest -v -x -m "not slow"
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }}
-        run: |
-          cd tools/server/tests
-          SLOW_TESTS=1 pytest -v -x
-
-      - name: Tests (Backend sampling)
-        id: server_integration_tests_backend_sampling
-        run: |
-          cd tools/server/tests
-          export LLAMA_ARG_BACKEND_SAMPLING=1
-          pytest -v -x -m "not slow"
-
-      - name: Slow tests (Backend sampling)
-        id: server_integration_tests_slow_backend_sampling
-        if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }}
-        run: |
-          cd tools/server/tests
-          export LLAMA_ARG_BACKEND_SAMPLING=1
-          SLOW_TESTS=1 pytest -v -x
-
-  windows:
-    runs-on: windows-2025
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: server-windows-2025-x64
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        shell: cmd
-        run: |
-          cmake -B build -G "Ninja Multi-Config" ^
-            -DCMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake ^
-            -DCMAKE_BUILD_TYPE=Release ^
-            -DLLAMA_BUILD_BORINGSSL=ON ^
-            -DGGML_SCHED_NO_REALLOC=ON
-          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.11'
-          pip-install: -r tools/server/tests/requirements.txt
-
-      - name: Tests
-        id: server_integration_tests
-        run: |
-          cd tools/server/tests
-          $env:PYTHONIOENCODING = ":replace"
-          pytest -v -x -m "not slow"
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }}
-        run: |
-          cd tools/server/tests
-          $env:SLOW_TESTS = "1"
-          pytest -v -x
@@ -0,0 +1,20 @@
+name: clang-tidy review post comments
+
+on:
+  workflow_dispatch:
+    workflows: ["clang-tidy-review"]
+    types:
+      - completed
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: ZedThree/clang-tidy-review/post@v0.13.0
+        # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
+        with:
+          # adjust options as necessary
+          lgtm_comment_body: ''
+          annotations: false
+          max_comments: 25
@@ -0,0 +1,23 @@
+name: clang-tidy-review
+
+on:
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  clang-tidy-review:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: ZedThree/clang-tidy-review@v0.13.0
+      id: review
+      with:
+        lgtm_comment_body: ''
+        build_dir: build
+        cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
+        split_workflow: true
+
+    - uses: ZedThree/clang-tidy-review/upload@v0.13.0
--- a/Show More
+++ b/Show More