minor layout improvements (#6572)

* minor layout improvements
* added missing file, run deps.sh locally
llama : add model types for mixtral (#6589)
2026-06-16 10:46:43 +02:00 · 2024-04-10 19:18:25 +02:00 · 2024-04-10 17:24:14 +02:00 · 2024-04-10 15:23:12 +02:00 · 2024-04-10 09:58:48 +03:00 · 2024-04-10 09:49:12 +03:00
98 changed files with 7331 additions and 4004 deletions
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -28,6 +28,8 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1

 RUN make

@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -15,6 +15,9 @@ WORKDIR /app

 COPY . .

+ENV LLAMA_CURL=1
+
+
 RUN make

 ENV LC_ALL=C.utf8
@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git libcurl4-openssl-dev

 WORKDIR /app

@@ -22,11 +22,16 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1

 RUN make

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 COPY --from=build /app/server /server

 ENTRYPOINT [ "/server" ]
@@ -4,7 +4,7 @@ FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
-    apt-get install -y git
+    apt-get install -y git libcurl4-openssl-dev

 WORKDIR /app

@@ -16,11 +16,14 @@ RUN mkdir build && \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
-    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build . --config Release --target server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 COPY --from=build /app/build/bin/server /server

 ENV LC_ALL=C.utf8
@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 RUN make

 ENTRYPOINT [ "/app/server" ]
@@ -11,12 +11,16 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
    apt update -y && \
    apt-get install -y vulkan-sdk

+# Install cURL
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 # Build it
 WORKDIR /app
 COPY . .
 RUN mkdir build && \
    cd build && \
-    cmake .. -DLLAMA_VULKAN=1 && \
+    cmake .. -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build . --config Release --target server

 # Clean up
@@ -3,16 +3,21 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git libcurl4-openssl-dev

 WORKDIR /app

 COPY . .

+ENV LLAMA_CURL=1
+
 RUN make

 FROM ubuntu:$UBUNTU_VERSION as runtime

+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 COPY --from=build /app/server /server

 ENV LC_ALL=C.utf8
@@ -24,15 +24,15 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
-  pull_request:
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  pull_request_target:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    -  cron: '04 2 * * *'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

 jobs:
@@ -42,11 +42,21 @@ jobs:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m
+
+    strategy:
+      matrix:
+        model: [phi-2]
+        ftype: [q4_0, q8_0, f16]
+        include:
+          - model: phi-2
+            ftype: q4_0
+            pr_comment_enabled: "true"
+
    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
@@ -69,12 +79,18 @@ jobs:
            sleep 0.1
          done

-      - name: Install k6
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
-          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
-          tar xzf k6*.tar.gz --strip-components=1
+          go install go.k6.io/xk6/cmd/xk6@latest
+          xk6 build master \
+              --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
@@ -108,7 +124,7 @@ jobs:

          cd examples/server/bench
          source venv/bin/activate
-          BENCH_K6_BIN_PATH=./k6 python bench.py \
+          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch ${{ github.head_ref || github.ref_name }} \
@@ -116,7 +132,7 @@ jobs:
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
              --hf-repo ggml-org/models	 \
-              --hf-file phi-2/ggml-model-q4_0.gguf \
+              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
@@ -134,7 +150,7 @@ jobs:

      - uses: actions/upload-artifact@v4
        with:
-          name: benchmark-results
+          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
@@ -143,11 +159,10 @@ jobs:

      - name: Commit status
        uses: Sibz/github-status-action@v1
-        continue-on-error: true # If not authorized on external repo
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          context: bench-server-baseline
+          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'
@@ -204,21 +219,26 @@ jobs:
      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
-        if: ${{ github.event.pull_request != '' }}
+        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
-          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
+          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+            <p align="center">

-            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
-            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
-            - ${{ env.BENCH_GRAPH_XLABEL }}
+            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            </p>

            <details>

-            <summary>Time series</summary>
+            <summary>Expand details for performance related PR only</summary>
+
+            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
+            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+            - ${{ env.BENCH_GRAPH_XLABEL }}
+

            <p align="center">

@@ -16,7 +16,7 @@ on:
    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 env:
@@ -31,7 +31,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -76,10 +76,10 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
-          path: |
-            llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+          name: llama-bin-macos-arm64.zip

  macOS-latest-cmake-x64:
    runs-on: macos-latest
@@ -87,7 +87,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -132,10 +132,10 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
-          path: |
-            llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+          name: llama-bin-macos-x64.zip

  ubuntu-focal-make:
    runs-on: ubuntu-20.04
@@ -146,7 +146,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -158,7 +158,7 @@ jobs:
        with:
          node-version: "20"

-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

@@ -181,7 +181,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -203,7 +203,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -249,7 +249,7 @@ jobs:
 #    steps:
 #      - name: Clone
 #        id: checkout
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Dependencies
 #        id: depends
@@ -283,7 +283,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -311,7 +311,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -357,7 +357,7 @@ jobs:

      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Build
        id: cmake_build
@@ -398,7 +398,7 @@ jobs:

      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Build
        id: cmake_build
@@ -418,7 +418,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -449,7 +449,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
@@ -593,7 +593,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -723,10 +723,10 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
-          path: |
-            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
+          name: llama-bin-win-${{ matrix.build }}-x64.zip

  windows-latest-cmake-cuda:
    runs-on: windows-latest
@@ -739,7 +739,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -779,10 +779,10 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
-          path: |
-            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+          name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip

      - name: Copy and pack Cuda runtime
        run: |
@@ -793,10 +793,10 @@ jobs:

      - name: Upload Cuda runtime
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
-          path: |
-            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

  windows-latest-cmake-sycl:
    runs-on: windows-latest
@@ -812,7 +812,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -844,17 +844,17 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
-          path: |
-            llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+          name: llama-bin-win-sycl-x64.zip

  ios-xcode-build:
    runs-on: macos-latest

    steps:
      - name: Checkout code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@@ -864,7 +864,7 @@ jobs:

    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Set up JDK
        uses: actions/setup-java@v3
@@ -887,7 +887,7 @@ jobs:
 #    runs-on: macos-12
 #    steps:
 #    - name: Clone
-#      uses: actions/checkout@v3
+#      uses: actions/checkout@v4
 #
 #    - name: Build
 #      uses: cross-platform-actions/action@v0.19.0
@@ -918,7 +918,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -937,7 +937,7 @@ jobs:

      - name: Download artifacts
        id: download-artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4

      - name: Create release
        id: create_release
@@ -978,7 +978,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Dependencies
 #        run: |
@@ -1002,7 +1002,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Dependencies
 #        run: |
@@ -1026,7 +1026,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Dependencies
 #        run: |
@@ -1056,7 +1056,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -1072,7 +1072,7 @@ jobs:
 #          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
 #
 #      - name: Upload binaries
-#        uses: actions/upload-artifact@v1
+#        uses: actions/upload-artifact@v4
 #        with:
 #          name: llama-bin-${{ matrix.arch }}
 #          path: build/bin/${{ matrix.build }}
@@ -1095,7 +1095,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -1127,7 +1127,7 @@ jobs:
 #
 #      - name: Upload binaries
 #        if: matrix.blas == 'ON'
-#        uses: actions/upload-artifact@v1
+#        uses: actions/upload-artifact@v4
 #        with:
 #          name: llama-blas-bin-${{ matrix.arch }}
 #          path: build/bin/${{ matrix.build }}
@@ -1141,7 +1141,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v4
 #
 #      - name: Dependencies
 #        run: |
@@ -6,7 +6,7 @@ env:
  GGML_N_THREADS: 1

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Dependencies
        run: |
@@ -16,7 +16,7 @@ on:
      - master

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -46,7 +46,7 @@ jobs:
          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
@@ -15,13 +15,13 @@ on:
      - master

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
  editorconfig:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - uses: editorconfig-checker/action-editorconfig-checker@main
      - run: editorconfig-checker
@@ -24,9 +24,9 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
      with:
        python-version: '3.9.x'
    - name: Install dependencies
@@ -18,7 +18,7 @@ on:
    paths: ['**/*.nix', 'flake.lock']

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -9,7 +9,7 @@ on:
    types: [opened, synchronize, reopened]

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -17,7 +17,7 @@ on:
      - 'requirements/*.txt'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -26,9 +26,9 @@ jobs:
    name: check-requirements
    steps:
      - name: Check out source repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      - name: Set up Python environment
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Run check-requirements.sh script
@@ -3,7 +3,7 @@ name: flake8 Lint
 on: [push, pull_request]

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -12,9 +12,9 @@ jobs:
    name: Lint
    steps:
      - name: Check out source repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      - name: Set up Python environment
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: flake8 Lint
@@ -4,6 +4,10 @@ name: Server
 on:
  workflow_dispatch: # allows manual triggering
    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
      slow_tests:
        description: 'Run slow tests'
        required: true
@@ -11,15 +15,15 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
-  pull_request:
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+  pull_request_target:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
  schedule:
-    -  cron: '0 0 * * *'
+    -  cron: '2 4 * * *'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -44,25 +48,45 @@ jobs:
      options: --cpus 4

    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-
      - name: Dependencies
        id: depends
        run: |
          apt-get update
          apt-get -y install \
            build-essential \
+            xxd \
            git \
            cmake \
            python3-pip \
+            curl \
            wget \
            language-pack-en \
            libcurl4-openssl-dev

+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Verify server deps
+        id: verify_server_deps
+        run: |
+          git config --global --add safe.directory $(realpath .)
+          cd examples/server
+          git ls-files --others --modified
+          git status
+          ./deps.sh
+          git status
+          not_ignored_files="$(git ls-files --others --modified)"
+          echo "Modified files: ${not_ignored_files}"
+          if [ -n "${not_ignored_files}" ]; then
+            echo "Repository is dirty or server deps are not built as expected"
+            echo "${not_ignored_files}"
+            exit 1
+          fi
+
      - name: Build
        id: cmake_build
        run: |
@@ -102,7 +126,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -7,7 +7,7 @@ on:
      - master

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -18,7 +18,7 @@ jobs:
        runs-on: [ubuntu-latest, macos-latest, windows-latest]
    runs-on: ${{ matrix.runs-on }}
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
        with:
          submodules: recursive
          fetch-depth: 0
@@ -0,0 +1,655 @@
+# date: Tue Apr  9 09:17:14 EEST 2024
+# this file is auto-generated by scripts/gen-authors.sh
+
+0cc4m <picard12@live.de>
+0xspringtime <110655352+0xspringtime@users.noreply.github.com>
+2f38b454 <dxf@protonmail.com>
+3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
+44670 <44670@users.noreply.github.com>
+AN Long <aisk@users.noreply.github.com>
+AT <manyoso@users.noreply.github.com>
+Aarni Koskela <akx@iki.fi>
+Aaron Miller <apage43@ninjawhale.com>
+Aaryaman Vasishta <aaryaman.vasishta@amd.com>
+Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
+Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
+Adithya Balaji <adithya.b94@gmail.com>
+AdithyanI <adithyan.i4internet@gmail.com>
+Adrian <smith.adriane@gmail.com>
+Adrian Hesketh <a-h@users.noreply.github.com>
+AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
+Aisuko <urakiny@gmail.com>
+Alberto <57916483+albbus-stack@users.noreply.github.com>
+Alex <awhill19@icloud.com>
+Alex Azarov <alex@azarov.by>
+Alex Azarov <alexander.azarov@mapbox.com>
+Alex Klinkhamer <from.github.com.917@grencez.dev>
+Alex Klinkhamer <git@grencez.dev>
+Alex Nguyen <tiendung@users.noreply.github.com>
+Alex Petenchea <alex.petenchea@gmail.com>
+Alex Renda <alexrenda@users.noreply.github.com>
+Alex von Gluck IV <kallisti5@unixzen.com>
+Alexey Parfenov <zxed@alkatrazstudio.net>
+Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
+Ali Nehzat <ali.nehzat@thanks.dev>
+Ali Tariq <ali.tariq@10xengineers.ai>
+Alon <alonfaraj@gmail.com>
+AlpinDale <52078762+AlpinDale@users.noreply.github.com>
+AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
+Ananta Bastola <anantarajbastola@gmail.com>
+Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
+András Salamon <ott2@users.noreply.github.com>
+Andrei <abetlen@gmail.com>
+Andrew Canis <andrew.canis@gmail.com>
+Andrew Duffy <a10y@users.noreply.github.com>
+Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Arik Poznanski <arikpoz@users.noreply.github.com>
+Artem <guinmoon@gmail.com>
+Artyom Lebedev <vagran.ast@gmail.com>
+Asbjørn Olling <asbjornolling@gmail.com>
+Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
+Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
+Ashraful Islam <ashraful.meche@gmail.com>
+Atsushi Tatsuma <yoshoku@outlook.com>
+Austin <77757836+teleprint-me@users.noreply.github.com>
+AustinMroz <austinmroz@utexas.edu>
+BADR <contact@pythops.com>
+Bach Le <bach@bullno1.com>
+Bailey Chittle <39804642+bachittle@users.noreply.github.com>
+BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
+Behnam M <58621210+ibehnam@users.noreply.github.com>
+Ben Garney <bengarney@users.noreply.github.com>
+Ben Siraphob <bensiraphob@gmail.com>
+Ben Williams <ben@719ben.com>
+Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
+Bernat Vadell <hounter.caza@gmail.com>
+Bodo Graumann <mail@bodograumann.de>
+Bono Lv <lvscar@users.noreply.github.com>
+Borislav Stanimirov <b.stanimirov@abv.bg>
+Branden Butler <bwtbutler@hotmail.com>
+Brian <mofosyne@gmail.com>
+Bruce MacDonald <brucewmacdonald@gmail.com>
+CJ Pais <cj@cjpais.com>
+CRD716 <crd716@gmail.com>
+Cameron <csteele@steelecameron.com>
+Cameron Kaiser <classilla@users.noreply.github.com>
+Casey Primozic <casey@cprimozic.net>
+Casey Primozic <me@ameo.link>
+CausalLM <148736309+CausalLM@users.noreply.github.com>
+Cebtenzzre <cebtenzzre@gmail.com>
+Chad Brewbaker <crb002@gmail.com>
+Cheng Shao <terrorjack@type.dance>
+Chris Kuehl <ckuehl@ckuehl.me>
+Christian Demsar <christian@github.email.demsar.us>
+Christian Demsar <crasm@git.vczf.us>
+Christian Falch <875252+chrfalch@users.noreply.github.com>
+Christian Kögler <ck3d@gmx.de>
+Clark Saben <76020733+csaben@users.noreply.github.com>
+Clint Herron <hanclinto@gmail.com>
+Cuong Trinh Manh <nguoithichkhampha@gmail.com>
+DAN™ <dranger003@gmail.com>
+Damian Stewart <d@damianstewart.com>
+Dane Madsen <dane_madsen@hotmail.com>
+DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
+Daniel Bevenius <daniel.bevenius@gmail.com>
+Daniel Drake <drake@endlessos.org>
+Daniel Hiltgen <dhiltgen@users.noreply.github.com>
+Daniel Illescas Romero <illescas.daniel@protonmail.com>
+DannyDaemonic <DannyDaemonic@gmail.com>
+Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
+Dave Della Costa <ddellacosta+github@gmail.com>
+David Friehs <david@friehs.info>
+David Kennedy <dakennedyd@gmail.com>
+David Pflug <david@pflug.email>
+David Renshaw <dwrenshaw@gmail.com>
+David Sommers <12738+databyte@users.noreply.github.com>
+David Yang <davidyang6us@gmail.com>
+Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
+Dean <Dean.Sinaean@gmail.com>
+Deins <deinsegle@gmail.com>
+Didzis Gosko <didzis@users.noreply.github.com>
+Don Mahurin <dmahurin@users.noreply.github.com>
+DooWoong Lee (David) <manics99@naver.com>
+Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
+Douglas Hanley <thesecretaryofwar@gmail.com>
+Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
+Ebey Abraham <ebey97@gmail.com>
+Ed Lee <edilee@mozilla.com>
+Ed Lepedus <ed.lepedus@googlemail.com>
+Edward Taylor <edeetee@gmail.com>
+Elbios <141279586+Elbios@users.noreply.github.com>
+Engininja2 <139037756+Engininja2@users.noreply.github.com>
+Equim <sayaka@ekyu.moe>
+Eric Sommerlade <es0m@users.noreply.github.com>
+Eric Zhang <34133756+EZForever@users.noreply.github.com>
+Erik Garrison <erik.garrison@gmail.com>
+Erik Scholz <Green-Sky@users.noreply.github.com>
+Ettore Di Giacinto <mudler@users.noreply.github.com>
+Evan Jones <evan.q.jones@gmail.com>
+Evan Miller <emmiller@gmail.com>
+Eve <139727413+netrunnereve@users.noreply.github.com>
+Evgeny Kurnevsky <kurnevsky@gmail.com>
+Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
+ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
+FK <sozforex@gmail.com>
+Fabian <cmdrf@users.noreply.github.com>
+Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
+Faez Shakil <faez.shakil@gmail.com>
+FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
+Fattire <528174+fat-tire@users.noreply.github.com>
+Felix <stenbackfelix@gmail.com>
+Finn Voorhees <finnvoorhees@gmail.com>
+Firat <firatkiral@gmail.com>
+Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
+Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
+Francisco Melo <43780565+francis2tm@users.noreply.github.com>
+FrankHB <frankhb1989@gmail.com>
+Frederik Vogel <Schaltfehler@users.noreply.github.com>
+Gabe Goodhart <gabe.l.hart@gmail.com>
+GainLee <perfecter.gen@gmail.com>
+Galunid <karolek1231456@gmail.com>
+Gary Linscott <glinscott@gmail.com>
+Gary Mulder <gjmulder@gmail.com>
+Genkagaku.GPT <hlhr202@163.com>
+Georgi Gerganov <ggerganov@gmail.com>
+Gilad S <giladgd@users.noreply.github.com>
+GiviMAD <GiviMAD@users.noreply.github.com>
+Govlzkoy <gotope@users.noreply.github.com>
+Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
+Guillaume Wenzek <gwenzek@users.noreply.github.com>
+Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
+Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
+Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
+Haohui Mai <ricetons@gmail.com>
+Haoxiang Fei <tonyfettes@tonyfettes.com>
+Harald Fernengel <harald.fernengel@here.com>
+Hatsune Miku <129688334+at8u@users.noreply.github.com>
+Henk Poley <HenkPoley@gmail.com>
+Henri Vasserman <henv@hot.ee>
+Henrik Forstén <henrik.forsten@gmail.com>
+Herman Semenov <GermanAizek@yandex.ru>
+Hesen Peng <hesen.peng@gmail.com>
+Hoang Nguyen <hugo53@users.noreply.github.com>
+Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
+Howard Su <howard0su@gmail.com>
+Hua Jiang <allenhjiang@outlook.com>
+Huawei Lin <huaweilin.cs@gmail.com>
+Ian Bull <irbull@eclipsesource.com>
+Ian Bull <irbull@gmail.com>
+Ian Scrivener <github@zilogy.asia>
+Ido S <ido.pluto@gmail.com>
+IgnacioFDM <ignaciofdm@gmail.com>
+Igor Okulist <okigan@gmail.com>
+Ikko Eltociear Ashimine <eltociear@gmail.com>
+Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
+Ionoclast Laboratories <brigham@ionoclast.com>
+Isaac McFadyen <isaac@imcf.me>
+IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
+Ivan Komarov <Ivan.Komarov@dfyz.info>
+Ivan Stepanov <ivanstepanovftw@gmail.com>
+JH23X <165871467+JH23X@users.noreply.github.com>
+Jack Mousseau <jmousseau@users.noreply.github.com>
+JackJollimore <130917767+JackJollimore@users.noreply.github.com>
+Jag Chadha <jagtesh@gmail.com>
+Jakub N <jakubniemczyk97@gmail.com>
+James Reynolds <magnusviri@users.noreply.github.com>
+Jan Boon <jan.boon@kaetemi.be>
+Jan Boon <kaetemi@gmail.com>
+Jan Ploski <jpl@plosquare.com>
+Jannis Schönleber <joennlae@gmail.com>
+Jared Van Bortel <cebtenzzre@gmail.com>
+Jared Van Bortel <jared@nomic.ai>
+Jason McCartney <jmac@theroot.org>
+Jean-Christophe Hoelt <hoelt@fovea.cc>
+Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
+Jed Fox <git@jedfox.com>
+Jeffrey Quesnelle <emozilla@nousresearch.com>
+Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
+Jhen-Jie Hong <iainst0409@gmail.com>
+Jiahao Li <liplus17@163.com>
+Jian Liao <jianliao@users.noreply.github.com>
+JidongZhang-THU <1119708529@qq.com>
+Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
+Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
+Johannes Gäßler <johannesg@5d6.de>
+Johannes Rudolph <johannes.rudolph@gmail.com>
+John <78893154+cmp-nct@users.noreply.github.com>
+John Balis <phobossystems@gmail.com>
+John Smith <67539080+kingsidelee@users.noreply.github.com>
+JohnnyB <jboero@users.noreply.github.com>
+Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
+Jorge A <161275481+jorgealias@users.noreply.github.com>
+Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
+Joseph Stahl <1269177+josephst@users.noreply.github.com>
+Joyce <joycebrum@google.com>
+Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
+Judd <foldl@users.noreply.github.com>
+Julius Arkenberg <arki05@users.noreply.github.com>
+Jun Jie <71215065+junnjiee16@users.noreply.github.com>
+Juraj Bednar <juraj@bednar.io>
+Justin Parker <jparkerweb@gmail.com>
+Justin Suess <justin.suess@westpoint.edu>
+Justine Tunney <jtunney@gmail.com>
+Juuso Alasuutari <juuso.alasuutari@gmail.com>
+KASR <karim.asrih@gmail.com>
+Kamil Tomšík <info@tomsik.cz>
+Karsten Weiss <knweiss@gmail.com>
+Karthick <j.karthic2004@gmail.com>
+Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
+Karthik Sethuraman <k.seth1993@gmail.com>
+Kasumi <90275229+kasumi-1@users.noreply.github.com>
+Kawrakow <48489457+ikawrakow@users.noreply.github.com>
+Keiichi Tabata <keiichi.tabata@outlook.com>
+Kenvix ⭐ <kenvixzure@live.com>
+Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
+Kevin Ji <1146876+kevinji@users.noreply.github.com>
+Kevin Kwok <antimatter15@gmail.com>
+Kevin Lo <kevlo@kevlo.org>
+Kolen Cheung <ickc@users.noreply.github.com>
+Konstantin Herud <konstantin.herud@denkbares.com>
+Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
+Kunshang Ji <kunshang.ji@intel.com>
+Kyle Liang <liangmanlai@gmail.com>
+Kyle Mistele <kyle@mistele.com>
+Kylin <56434533+KyL0N@users.noreply.github.com>
+Lars Grammel <lars.grammel@gmail.com>
+Laura <Tijntje_7@msn.com>
+Lee <44310445+lx200916@users.noreply.github.com>
+Lee Drake <b.lee.drake@gmail.com>
+Leng Yue <lengyue@lengyue.me>
+LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
+Leonardo Neumann <leonardo@neumann.dev.br>
+Li Tan <tanliboy@gmail.com>
+Linwei Wang <wanix1988@gmail.com>
+LoganDark <github@logandark.mozmail.com>
+LostRuins <39025047+LostRuins@users.noreply.github.com>
+Luciano <lucianostrika44@gmail.com>
+Luo Tian <lt@basecity.com>
+M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
+Maarten ter Huurne <maarten@treewalker.org>
+Mack Straight <eiz@users.noreply.github.com>
+Maël Kerbiriou <m431.kerbiriou@gmail.com>
+MaggotHATE <clay1326@gmail.com>
+Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
+Marco Matthies <71844+marcom@users.noreply.github.com>
+Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
+Marian Cepok <marian.cepok@gmail.com>
+Mark Fairbairn <thebaron88@gmail.com>
+Marko Tasic <mtasic85@gmail.com>
+Martin Krasser <krasserm@googlemail.com>
+Martin Schwaighofer <mschwaig@users.noreply.github.com>
+Marvin Gießing <marvin.giessing@gmail.com>
+Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
+Matheus C. França <matheus-catarino@hotmail.com>
+Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
+Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
+Mathijs de Bruin <mathijs@mathijsfietst.nl>
+Matt Clayton <156335168+mattjcly@users.noreply.github.com>
+Matt Pulver <matt.pulver@heavy.ai>
+Matteo Boschini <12133566+mbosc@users.noreply.github.com>
+Matthew Tejo <matthew.tejo@gmail.com>
+Matvey Soloviev <blackhole89@gmail.com>
+Maxime <672982+maximegmd@users.noreply.github.com>
+Maximilian Winter <maximilian.winter.91@gmail.com>
+Meng Zhang <meng@tabbyml.com>
+Meng, Hengyu <hengyu.meng@intel.com>
+Merrick Christensen <merrick.christensen@gmail.com>
+Michael Coppola <m18coppola@gmail.com>
+Michael Hueschen <m@mhueschen.dev>
+Michael Kesper <mkesper@schokokeks.org>
+Michael Klimenko <mklimenko29@gmail.com>
+Michael Podvitskiy <podvitskiymichael@gmail.com>
+Michael Potter <NanoTekGuy@Gmail.com>
+Michaël de Vries <vriesdemichael@gmail.com>
+Mihai <mihai.chirculescu@yahoo.com>
+Mike <ytianhui2004@gmail.com>
+Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
+Mirko185 <mirkosig@gmail.com>
+Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
+Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
+Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
+Murilo Santana <mvrilo@gmail.com>
+Musab Gultekin <musabgultekin@users.noreply.github.com>
+Nam D. Tran <42194884+namtranase@users.noreply.github.com>
+NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
+Nebula <infinitewormhole@gmail.com>
+Neo Zhang Jianyu <jianyu.zhang@intel.com>
+Neuman Vong <neuman.vong@gmail.com>
+Nexesenex <124105151+Nexesenex@users.noreply.github.com>
+Niall Coates <1349685+Niall-@users.noreply.github.com>
+Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
+Nigel Bosch <pnigelb@gmail.com>
+Niklas Korz <niklas@niklaskorz.de>
+Nindaleth <Nindaleth@users.noreply.github.com>
+Oleksandr Nikitin <oleksandr@tvori.info>
+Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
+Olivier Chafik <ochafik@users.noreply.github.com>
+Ondřej Čertík <ondrej@certik.us>
+Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
+Paul Tsochantaris <ptsochantaris@icloud.com>
+Pavol Rusnak <pavol@rusnak.io>
+Pedro Cuenca <pedro@huggingface.co>
+Peter Sugihara <peter@campsh.com>
+Phil H <5756783+phiharri@users.noreply.github.com>
+Philip Taron <philip.taron@gmail.com>
+Phillip Kravtsov <phillip@kravtsov.net>
+Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
+Pierrick Hymbert <pierrick.hymbert@gmail.com>
+Przemysław Pawełczyk <przemoc@gmail.com>
+Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
+Qingyou Meng <meng.qingyou@gmail.com>
+Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
+RJ Adriaansen <adriaansen@eshcc.eur.nl>
+Radoslav Gerganov <rgerganov@gmail.com>
+Radosław Gryta <radek.gryta@gmail.com>
+Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
+Rand Xie <randxiexyy29@gmail.com>
+Randall Fitzgerald <randall@dasaku.net>
+Reinforce-II <fate@eastal.com>
+Riceball LEE <snowyu.lee@gmail.com>
+Richard Kiss <him@richardkiss.com>
+Richard Roberson <richardr1126@gmail.com>
+Rick G <26732651+TheFlipbook@users.noreply.github.com>
+Rickard Edén <rickardeden@gmail.com>
+Rickard Hallerbäck <rickard.hallerback@gmail.com>
+Rickey Bowers Jr <bitRAKE@gmail.com>
+Riley Stewart <ristew@users.noreply.github.com>
+Rinne <AsakusaRinne@gmail.com>
+Rinne <liu_yaohui1998@126.com>
+Robert Brisita <986796+rbrisita@users.noreply.github.com>
+Robert Sung-wook Shin <edp1096@users.noreply.github.com>
+Robey Holderith <robey@flaminglunchbox.net>
+Robyn <robyngraf@users.noreply.github.com>
+Roger Meier <r.meier@siemens.com>
+Roland <14355895+rbur0425@users.noreply.github.com>
+Romain D <90720+Artefact2@users.noreply.github.com>
+Romain Neutron <romain@neutron.io>
+Roman Parykin <donderom@gmail.com>
+Ron Evans <ron@hybridgroup.com>
+Ron Jailall <rojailal@gmail.com>
+Ronny Brendel <ronnybrendel@gmail.com>
+Ronsor <ronsor@ronsor.pw>
+Rowan Hart <rowanbhart@gmail.com>
+Rune <43761327+Rune-AI@users.noreply.github.com>
+Ryan Landay <rlanday@gmail.com>
+Ryder Wishart <ryderwishart@gmail.com>
+Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
+SakuraUmi <yukinon244@gmail.com>
+Salvador E. Tropea <stropea@inti.gob.ar>
+Sam Spilsbury <smspillaz@gmail.com>
+Sami Farin <3876865+Safari77@users.noreply.github.com>
+Samuel Maynard <samwmaynard@gmail.com>
+Sang-Kil Park <sang.park@42dot.ai>
+Seb C <47074056+Sebby37@users.noreply.github.com>
+Sebastián A <sebastian.aedo29@gmail.com>
+SebastianApel <13675545+SebastianApel@users.noreply.github.com>
+Senemu <10880819+Senemu@users.noreply.github.com>
+Sergey Alirzaev <zl29ah@gmail.com>
+Sergio López <slp@sinrega.org>
+SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
+ShadovvBeast <ShadovvBeast@gmail.com>
+Shakhar Dasgupta <shakhardasgupta@gmail.com>
+Shangning Xu <32517059+xushangning@users.noreply.github.com>
+Shijie <821898965@qq.com>
+Shintarou Okada <kokuzen@gmail.com>
+Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
+Shouzheng Liu <lshzh.hi@gmail.com>
+Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
+Simon Willison <swillison@gmail.com>
+Siwen Yu <yusiwen@gmail.com>
+Sky Yan <skyan83@gmail.com>
+Slaren <2141330+slaren@users.noreply.github.com>
+Slava Primenko <primenko.s@gmail.com>
+SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
+Someone <sergei.kozlukov@aalto.fi>
+Someone Serge <sergei.kozlukov@aalto.fi>
+Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
+Spencer Sutton <spencersutton@users.noreply.github.com>
+Srinivas Billa <nivibilla@gmail.com>
+Stefan Sydow <stefan@sydow.email>
+Stephan Walter <stephan@walter.name>
+Stephen Nichols <snichols@users.noreply.github.com>
+Steve Grubb <ausearch.1@gmail.com>
+Steven Roussey <sroussey@gmail.com>
+Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
+Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
+SuperUserNameMan <yoann@terminajones.com>
+Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
+Taikono-Himazin <kazu@po.harenet.ne.jp>
+Tameem <113388789+AhmadTameem@users.noreply.github.com>
+Tamotsu Takahashi <ttakah+github@gmail.com>
+Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
+Thatcher Chamberlin <j.thatcher.c@gmail.com>
+Theia Vogel <theia@vgel.me>
+Thérence <13496987+Royalphax@users.noreply.github.com>
+Thibault Terrasson <thibault.terrasson@gmail.com>
+Thomas Klausner <wiz@gatalith.at>
+Tim Miller <drasticactions@users.noreply.github.com>
+Timmy Knight <r2d2fish@gmail.com>
+Timothy Cronin <40186632+4imothy@users.noreply.github.com>
+Ting Lou <ting.lou@gmail.com>
+Ting Sun <suntcrick@gmail.com>
+Tobias Lütke <tobi@shopify.com>
+Tom C <tom.corelis@gmail.com>
+Tom Jobbins <784313+TheBloke@users.noreply.github.com>
+Tomas <tom.tomas.36478119@gmail.com>
+Tomáš Pazdiora <tomas.pazdiora@gmail.com>
+Tristan Ross <rosscomputerguy@protonmail.com>
+Tungsten842 <886724vf@anonaddy.me>
+Tungsten842 <quantmint@protonmail.com>
+Tushar <ditsuke@protonmail.com>
+UEXTM.com <84163508+uextm@users.noreply.github.com>
+Uzo Nweke <uzoechi@gmail.com>
+Vaibhav Srivastav <vaibhavs10@gmail.com>
+Val Kharitonov <mail@kharvd.com>
+Valentin Konovalov <valle.ketsujin@gmail.com>
+Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
+Victor Z. Peng <ziliangdotme@gmail.com>
+Vlad <spitfireage@gmail.com>
+Vladimir <bogdad@gmail.com>
+Vladimir Malyutin <first-leon@yandex.ru>
+Vladimir Zorin <vladimir@deviant.guru>
+Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
+WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
+Weird Constructor <weirdconstructor@gmail.com>
+Welby Seely <welbyseely@gmail.com>
+Wentai Zhang <rchardx@gmail.com>
+WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
+Willy Tarreau <w@1wt.eu>
+Wu Jian Ping <wujjpp@hotmail.com>
+Wu Jian Ping <wujp@greatld.com>
+Xiake Sun <xiake.sun@intel.com>
+Xiang (Kevin) Li <kevinli020508@gmail.com>
+Xiao-Yong Jin <jinxiaoyong@gmail.com>
+XiaotaoChen <chenxiaotao1234@gmail.com>
+Xiaoyi Chen <cxychina@gmail.com>
+Xingchen Song(宋星辰) <xingchensong1996@163.com>
+Xuan Son Nguyen <thichthat@gmail.com>
+Yann Follet <131855179+YannFollet@users.noreply.github.com>
+Yiming Cui <conandiy@vip.qq.com>
+Yishuo Wang <MeouSker77@outlook.com>
+Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
+Yui <dev@sleepyyui.com>
+Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
+Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
+ZHAOKAI WANG <sanxianwei@163.com>
+Zane Shannon <z@zcs.me>
+Zay <95888118+isaiahbjork@users.noreply.github.com>
+Zenix <zenixls2@gmail.com>
+Zhang Peiyuan <a1286225768@gmail.com>
+ZhouYuChen <zhouyuchen@naver.com>
+Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
+Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
+Zsapi <martin1.zsapka@gmail.com>
+a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
+adel boussaken <netdur@gmail.com>
+afrideva <95653597+afrideva@users.noreply.github.com>
+akawrykow <142945436+akawrykow@users.noreply.github.com>
+alexpinel <93524949+alexpinel@users.noreply.github.com>
+alonfaraj <alonfaraj@gmail.com>
+andrijdavid <david@geek.mg>
+anon998 <131767832+anon998@users.noreply.github.com>
+anzz1 <anzz1@live.com>
+apaz <aarpazdera@gmail.com>
+apcameron <37645737+apcameron@users.noreply.github.com>
+arcrank <arcrank@gmail.com>
+arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
+at8u <129688334+at8u@users.noreply.github.com>
+automaticcat <daogiatuank54@gmail.com>
+bandoti <141645996+bandoti@users.noreply.github.com>
+beiller <beiller@gmail.com>
+bhubbb <79117352+bhubbb@users.noreply.github.com>
+bmwl <brian.marshall@tolko.com>
+bobqianic <129547291+bobqianic@users.noreply.github.com>
+bryanSwk <93190252+bryanSwk@users.noreply.github.com>
+bsilvereagle <bsilvereagle@users.noreply.github.com>
+bssrdf <merlintiger@hotmail.com>
+byte-6174 <88070277+byte-6174@users.noreply.github.com>
+cebtenzzre <cebtenzzre@gmail.com>
+chaihahaha <chai836275709@gmail.com>
+chiranko <96988916+chiranko@users.noreply.github.com>
+clibdev <52199778+clibdev@users.noreply.github.com>
+clyang <clyang@clyang.net>
+cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
+coezbek <c.oezbek@gmail.com>
+comex <comexk@gmail.com>
+compilade <113953597+compilade@users.noreply.github.com>
+crasm <crasm@git.vczf.net>
+crasm <crasm@git.vczf.us>
+daboe01 <daboe01@googlemail.com>
+david raistrick <keen99@users.noreply.github.com>
+ddpasa <112642920+ddpasa@users.noreply.github.com>
+deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
+divinity76 <divinity76@gmail.com>
+dotpy314 <33351922+dotpy314@users.noreply.github.com>
+drbh <david.richard.holtz@gmail.com>
+ds5t5 <145942675+ds5t5@users.noreply.github.com>
+dylan <canardleteer@users.noreply.github.com>
+eastriver <lee@eastriver.dev>
+ebraminio <ebraminio@gmail.com>
+eiery <19350831+eiery@users.noreply.github.com>
+eric8607242 <e0928021388@gmail.com>
+fraxy-v <65565042+fraxy-v@users.noreply.github.com>
+github-actions[bot] <github-actions[bot]@users.noreply.github.com>
+gliptic <gliptic@users.noreply.github.com>
+goerch <jhr.walter@t-online.de>
+grahameth <96447521+grahameth@users.noreply.github.com>
+gwjr <502526+gwjr@users.noreply.github.com>
+h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
+hankcs <cnhankmc@gmail.com>
+hoangmit <hoangmit@users.noreply.github.com>
+hongbo.mo <352280764@qq.com>
+howlger <eclipse@voormann.de>
+howlger <github@voormann.de>
+hutli <6594598+hutli@users.noreply.github.com>
+hutli <hutli@hutli.hu>
+hutli <jensstaermose@hotmail.com>
+hxer7963 <hxer7963@gmail.com>
+hydai <z54981220@gmail.com>
+iSma <ismail.senhaji@gmail.com>
+iacore <74560659+iacore@users.noreply.github.com>
+igarnier <igarnier@protonmail.com>
+iohub <rickyang.pro@gmail.com>
+jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
+jameswu2014 <545426914@qq.com>
+jneem <joeneeman@gmail.com>
+johnson442 <56517414+johnson442@users.noreply.github.com>
+jon-chuang <9093549+jon-chuang@users.noreply.github.com>
+jp-x-g <jpxg-dev@protonmail.com>
+jwj7140 <32943891+jwj7140@users.noreply.github.com>
+kaizau <kaizau@users.noreply.github.com>
+kalomaze <66376113+kalomaze@users.noreply.github.com>
+kang <tpdns9032100@gmail.com>
+katsu560 <118887472+katsu560@users.noreply.github.com>
+kchro3 <62481661+kchro3@users.noreply.github.com>
+khimaros <me@khimaros.com>
+kiltyj <kiltyj@gmail.com>
+klosax <131523366+klosax@users.noreply.github.com>
+kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
+kunnis <kunnis@users.noreply.github.com>
+kuronekosaiko <EvanChanJ@163.com>
+kuvaus <22169537+kuvaus@users.noreply.github.com>
+kwin1412 <42286931+kwin1412@users.noreply.github.com>
+l3utterfly <gc.pthzfoldr@gmail.com>
+ldwang <ftgreat@163.com>
+le.chang <cljs118@126.com>
+leejet <leejet714@gmail.com>
+limitedAtonement <limitedAtonement@users.noreply.github.com>
+lon <114724657+longregen@users.noreply.github.com>
+m3ndax <adrian.goessl@outlook.com>
+maddes8cht <55592906+maddes8cht@users.noreply.github.com>
+makomk <makosoft@googlemail.com>
+manikbhandari <mbbhandarimanik2@gmail.com>
+mdrokz <mohammadmunshi@gmail.com>
+mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
+minarchist <minarchist@users.noreply.github.com>
+mj-shifu <77107165+mj-shifu@users.noreply.github.com>
+mmyjona <jonathan.gonse@gmail.com>
+momonga <115213907+mmnga@users.noreply.github.com>
+moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
+mzcu <milos.cubrilo@gmail.com>
+nanahi <130121847+na-na-hi@users.noreply.github.com>
+ngc92 <7938269+ngc92@users.noreply.github.com>
+nhamanasu <45545786+nhamanasu@users.noreply.github.com>
+niansa/tuxifan <anton-sa@web.de>
+niansa/tuxifan <tuxifan@posteo.de>
+ningshanwutuobang <ningshanwutuobang@gmail.com>
+nold <Nold360@users.noreply.github.com>
+nopperl <54780682+nopperl@users.noreply.github.com>
+nusu-github <29514220+nusu-github@users.noreply.github.com>
+olexiyb <olexiyb@gmail.com>
+oobabooga <112222186+oobabooga@users.noreply.github.com>
+opparco <parco.opaai@gmail.com>
+ostix360 <55257054+ostix360@users.noreply.github.com>
+perserk <perserk@gmail.com>
+postmasters <namnguyen@google.com>
+pudepiedj <pudepiedj@gmail.com>
+qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
+qouoq <qouoq@fastmail.com>
+qunash <anzoria@gmail.com>
+rabidcopy <rabidcopy@yahoo.com>
+rankaiyx <rankaiyx@rankaiyx.com>
+rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
+rhuddleston <ryan.huddleston@percona.com>
+rimoliga <53384203+rimoliga@users.noreply.github.com>
+runfuture <runfuture@users.noreply.github.com>
+sandyiscool <sandyiscool@gmail.com>
+semidark <me@semidark.net>
+sharpHL <132747147+sharpHL@users.noreply.github.com>
+shibe2 <shibe@tuta.io>
+singularity <12184989+singularity-s0@users.noreply.github.com>
+sjinzh <sjinzh@gmail.com>
+slaren <2141330+slaren@users.noreply.github.com>
+slaren <slarengh@gmail.com>
+snadampal <87143774+snadampal@users.noreply.github.com>
+staviq <staviq@gmail.com>
+stduhpf <stephduh@live.fr>
+swittk <switt1995@gmail.com>
+takov751 <40316768+takov751@users.noreply.github.com>
+tarcey <cey.tarik@gmail.com>
+texmex76 <40733439+texmex76@users.noreply.github.com>
+thement <40525767+thement@users.noreply.github.com>
+tjohnman <tjohnman@users.noreply.github.com>
+tslmy <tslmy@users.noreply.github.com>
+ubik2 <ubik2@users.noreply.github.com>
+uint256_t <konndennsa@gmail.com>
+uint256_t <maekawatoshiki1017@gmail.com>
+unbounded <haakon@likedan.net>
+valiray <133289098+valiray@users.noreply.github.com>
+vodkaslime <646329483@qq.com>
+vvhg1 <94630311+vvhg1@users.noreply.github.com>
+vxiiduu <73044267+vxiiduu@users.noreply.github.com>
+wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
+whoreson <139810751+whoreson@users.noreply.github.com>
+wonjun Jang <strutive07@gmail.com>
+wzy <32936898+Freed-Wu@users.noreply.github.com>
+xaedes <xaedes@gmail.com>
+xaedes <xaedes@googlemail.com>
+xloem <0xloem@gmail.com>
+yangli2 <yangli2@gmail.com>
+yuiseki <yuiseki@gmail.com>
+zakkor <edward.partenie@gmail.com>
+zhouwg <6889919+zhouwg@users.noreply.github.com>
+zrm <trustiosity.zrm@gmail.com>
+源文雨 <41315874+fumiama@users.noreply.github.com>
+Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Georgi Gerganov
+Copyright (c) 2023-2024 The ggml authors

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -10,7 +10,7 @@ TEST_TARGETS = \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
 	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease                                 \
-	tests/test-json-schema-to-grammar
+	tests/test-json-schema-to-grammar tests/test-grammar-integration

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -867,6 +867,10 @@ passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
@@ -914,6 +918,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -3,7 +3,7 @@
 - [Background](#background)
 - [News](#news)
 - [OS](#os)
- [Supported Devices](#supported-devices)
+- [Hardware](#hardware)
 - [Docker](#docker)
 - [Linux](#linux)
 - [Windows](#windows)
@@ -24,19 +24,20 @@
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

 ### Llama.cpp + SYCL
-This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

-The llama.cpp SYCL backend supports:
- Intel GPUs.
- Nvidia GPUs.
+The llama.cpp SYCL backend is designed primarily to support **Intel GPUs**. Thanks to the cross-platform nature of SYCL, it can also support GPUs from other vendors: Nvidia GPUs (*AMD GPU support is coming*).

-*Upcoming support: AMD GPUs*.
+When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.

-When targetting **Intel CPUs**, it is recommended to  use llama.cpp for [x86_64](README.md#intel-onemkl) approach.
+Its design is similar to that of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast, etc.* In the initial work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

 ## News

+- 2024.4
+  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
+
 - 2024.3
+  - Release binary files of Windows.
  - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
  - Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet, it is under development.
@@ -59,16 +60,11 @@ When targetting **Intel CPUs**, it is recommended to  use llama.cpp for [x86_64]
 |Windows|Support|Windows 11|


-## Supported devices
+## Hardware

-### Intel GPUs
+### Intel GPU

-The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following:
-```sh
-source /opt/intel/oneapi/setvars.sh
-```
-
- **Tested devices**
+**Verified devices**

 |Intel GPU| Status | Verified Model|
 |-|-|-|
@@ -80,16 +76,18 @@ source /opt/intel/oneapi/setvars.sh

 *Notes:*

- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
+- **Memory**
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.

- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs.
+  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.

- If the iGPU has less than 80  EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use.
+- **Execution Unit (EU)**
+  - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.

-### Nvidia GPUs
-The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
+### Nvidia GPU
+The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)

- **Tested devices**
+**Verified devices**

 |Nvidia GPU| Status | Verified Model|
 |-|-|-|
@@ -257,10 +255,11 @@ source /opt/intel/oneapi/setvars.sh
 mkdir -p build && cd build

 # Option 1: Use FP16 for better performance in long-prompt  inference
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Then build with "cmake --build ." or run "make"

 # Option 2: Use FP32 by default
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 ```

 #### Nvidia GPU
@@ -275,17 +274,17 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 mkdir -p build && cd build

 # Option 1: Use FP16 for better performance in long-prompt  inference
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

 # Option 2: Use FP32 by default
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 ```

 ### III. Run the inference

 1. Retrieve and prepare model

-You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.

 2. Enable oneAPI running environment

@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

+- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
@@ -18,12 +19,13 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

+- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387**
+- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017
+- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187

 ----

@@ -117,6 +119,10 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Mamba](https://github.com/state-spaces/mamba)
 - [x] [Xverse](https://huggingface.co/models?search=xverse)
 - [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
+- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
+
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

 **Multimodal models:**

@@ -140,6 +146,7 @@ Typically finetunes of the base models below are supported as well.
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@@ -177,7 +184,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [Msty](https://msty.app) (proprietary)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
-
+- [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [MindMac](https://mindmac.app) (proprietary)
+- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 ---
@@ -488,7 +498,7 @@ Building the program with BLAS support may lead to some performance improvements

  This provides BLAS acceleration on HIP-supported AMD GPUs.
  Make sure to have ROCm installed.
-  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).

  - Using `make`:
    ```bash
@@ -505,7 +515,7 @@ Building the program with BLAS support may lead to some performance improvements

  - Using `make` (example for target gfx1030, build with 16 CPU threads):
    ```bash
-    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
+    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
    ```

  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
@@ -513,7 +523,7 @@ Building the program with BLAS support may lead to some performance improvements
    set PATH=%HIP_PATH%\bin;%PATH%
    mkdir build
    cd build
-    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
+    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
    cmake --build .
    ```
    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -0,0 +1,67 @@
+# Security Policy
+
+ - [**Using llama.cpp securely**](#using-llamacpp-securely)
+   - [Untrusted models](#untrusted-models)
+   - [Untrusted inputs](#untrusted-inputs)
+   - [Data privacy](#data-privacy)
+   - [Untrusted environments or networks](#untrusted-environments-or-networks)
+   - [Multi-Tenant environments](#multi-tenant-environments)
+ - [**Reporting a vulnerability**](#reporting-a-vulnerability)
+
+## Using llama.cpp securely
+
+### Untrusted models
+Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
+
+*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.
+
+> [!NOTE]
+> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
+
+### Untrusted inputs
+
+Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.
+
+For maximum security when handling untrusted inputs, you may need to employ the following:
+
+* Sandboxing: Isolate the environment where the inference happens.
+* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
+* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
+* Input Sanitization: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as:
+    * Validation: Enforce strict rules on allowed characters and data types.
+    * Filtering: Remove potentially malicious scripts or code fragments.
+    * Encoding: Convert special characters into safe representations.
+    * Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).
+
+### Data privacy
+
+To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.
+
+### Untrusted environments or networks
+
+If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
+* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
+* Encrypt your data if sending it over the network.
+
+### Multi-Tenant environments
+
+If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.
+
+1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
+
+1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
+
+1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
+
+1. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+
+## Reporting a vulnerability
+
+Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
+
+<!-- normal version -->
+However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
+
+Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
+
+This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
@@ -16,6 +16,7 @@
 #include <unordered_set>
 #include <vector>
 #include <cinttypes>
+#include <codecvt>

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -27,7 +28,6 @@
 #ifndef NOMINMAX
 #   define NOMINMAX
 #endif
-#include <codecvt>
 #include <locale>
 #include <windows.h>
 #include <fcntl.h>
@@ -1500,6 +1500,77 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    GGML_UNREACHABLE();
 }

+// Validate that a single filename component (not a path) is safe to use; returns true only if it passes every check below
+// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
+bool validate_file_name(const std::string & filename) {
+    if (!filename.length()) {
+        // Empty filename invalid
+        return false;
+    }
+    if (filename.length() > 255) {
+        // Limit at common largest possible filename on Linux filesystems
+        // to avoid unnecessary further validation
+        // (On systems with smaller limits it will be caught by the OS)
+        return false;
+    }
+
+    std::u32string filename_utf32;
+    try {
+        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+        filename_utf32 = converter.from_bytes(filename);
+
+        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
+        // or invalid encodings were encountered. Reject such attempts
+        std::string filename_reencoded = converter.to_bytes(filename_utf32);
+        if (filename_reencoded != filename) {
+            return false;
+        }
+    } catch (const std::exception &) {
+        return false;
+    }
+
+    // Check for forbidden codepoints:
+    // - Control characters
+    // - Unicode equivalents of illegal characters
+    // - UTF-16 surrogate code points (U+D800..U+DFFF)
+    // - Unicode replacement character (U+FFFD)
+    // - Byte order mark (BOM)
+    // - Illegal characters: / \ : * ? " < > |
+    for (char32_t c : filename_utf32) {
+        if (c <= 0x1F // Control characters (C0)
+            || c == 0x7F // Control characters (DEL)
+            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
+            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
+            || c == 0x2215 // Division Slash (forward slash equivalent)
+            || c == 0x2216 // Set Minus (backslash equivalent)
+            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate code points
+            || c == 0xFFFD // Replacement Character (U+FFFD)
+            || c == 0xFEFF // Byte Order Mark (BOM)
+            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
+            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
+            return false;
+        }
+    }
+
+    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
+    // Unicode and other whitespace is not affected, only 0x20 space
+    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
+        return false;
+    }
+
+    // Reject any ".." anywhere in the name (currently stricter than necessary, it should be fine to just check for == ".." instead)
+    if (filename.find("..") != std::string::npos) {
+        return false;
+    }
+
+    // Reject "."
+    if (filename == ".") {
+        return false;
+    }
+
+    return true;
+}
+
 //
 // String utils
 //
@@ -1928,11 +1999,6 @@ struct llama_model * llama_load_model_from_url(
        return NULL;
    }

-    if (!curl) {
-        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
-        return NULL;
-    }
-
    if (!llama_download_file(curl, model_url, path_model)) {
        return NULL;
    }
@@ -2146,23 +2212,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
+                        bool   add_special,
+                        bool   parse_special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }

 std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
+                        bool   add_special,
+                        bool   parse_special) {
    // upper limit for the number of tokens
-    int n_tokens = text.length() + add_bos;
+    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@@ -179,6 +179,8 @@ std::string gpt_random_prompt(std::mt19937 & rng);

 void process_escapes(std::string& input);

+bool validate_file_name(const std::string & filename);
+
 //
 // String utils
 //
@@ -221,14 +223,14 @@ void llama_batch_add(
 std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);

 std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);

 // tokenizes a token into a piece
 // should work similar to Python's `tokenizer.id_to_piece`
@@ -129,7 +129,7 @@ llama_token llama_sampling_sample(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        struct llama_context * ctx_cfg,
-        int idx = 0);
+        int idx = -1);

 // Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
 llama_token_data_array llama_sampling_prepare(
@@ -160,7 +160,7 @@ class Model(ABC):
                data = data.astype(np.float32)

            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
@@ -227,15 +227,14 @@ class Model(ABC):
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

-    def _set_vocab_gpt2(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
+    # used for GPT-2 BPE and WordPiece vocabs
+    def get_basic_vocab(self) -> tuple[list[str], list[int]]:
        tokens: list[str] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
@@ -255,11 +254,15 @@ class Model(ABC):
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

+        return tokens, toktypes
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes = self.get_basic_vocab()
        self.gguf_writer.add_tokenizer_model("gpt2")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_qwen(self):
@@ -323,8 +326,7 @@ class Model(ABC):
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
-            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
-            sys.exit(1)
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
@@ -511,6 +513,17 @@ class BloomModel(Model):
 class MPTModel(Model):
    model_arch = gguf.MODEL_ARCH.MPT

+    def set_vocab(self):
+        try:
+            self._set_vocab_gpt2()
+        except Exception:
+            # Fallback for SEA-LION model
+            self._set_vocab_sentencepiece()
+            self.gguf_writer.add_add_bos_token(False)
+            self.gguf_writer.add_pad_token_id(3)
+            self.gguf_writer.add_eos_token_id(1)
+            self.gguf_writer.add_unk_token_id(0)
+
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
@@ -524,7 +537,10 @@ class MPTModel(Model):
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
-        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
+        if self.hparams["attn_config"]["alibi"]:
+            self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
+        else:
+            self.gguf_writer.add_max_alibi_bias(0.0)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
@@ -1216,6 +1232,8 @@ class LlamaModel(Model):
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
        n_head = self.hparams.get("num_attention_heads")
        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_experts = self.hparams.get("num_local_experts")
+        experts = dict()
        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
@@ -1236,6 +1254,153 @@ class LlamaModel(Model):

            data = data.squeeze()

+            # process the experts separately
+            if name.find("block_sparse_moe.experts") != -1:
+                experts[name] = data
+                if len(experts) >= n_experts:
+                    # merge the experts into a single 3d tensor
+                    for bid in range(block_count):
+                        for wid in range(1, 4):
+                            full = True
+                            for xid in range(n_experts):
+                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
+                                if ename not in experts:
+                                    full = False
+                                    break
+                            if not full:
+                                continue
+
+                            datas = []
+                            for xid in range(n_experts):
+                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
+                                datas.append(experts[ename])
+                                del experts[ename]
+
+                            data = np.stack(datas, axis=0)
+                            data_dtype = data.dtype
+
+                            if self.ftype == 0 and data_dtype == np.float16:
+                                data = data.astype(np.float32)
+
+                            if self.ftype == 1 and data_dtype == np.float32:
+                                data = data.astype(np.float16)
+
+                            merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"
+
+                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+                            if new_name is None:
+                                print(f"Can not map tensor {name!r}")
+                                sys.exit()
+
+                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+                            self.gguf_writer.add_tensor(new_name, data)
+                continue
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # 1d tensors need to be converted to float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+        if len(experts) > 0:
+            raise ValueError(f"Unprocessed experts: {experts.keys()}")
+
+
+@Model.register("GrokForCausalLM")
+class GrokModel(Model):
+    model_arch = gguf.MODEL_ARCH.GROK
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_name("Grok")
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_experts = self.hparams.get("num_local_experts")
+        experts = dict()
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # process the experts separately
+            if name.find(".moe.") != -1:
+                experts[name] = data
+                if len(experts) >= n_experts:
+                    # merge the experts into a single 3d tensor
+                    for bid in range(block_count):
+                        for wid in ["linear", "linear_1", "linear_v"]:
+                            full = True
+                            for xid in range(n_experts):
+                                ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
+                                if ename not in experts:
+                                    full = False
+                                    break
+                            if not full:
+                                continue
+
+                            datas = []
+                            for xid in range(n_experts):
+                                ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
+                                datas.append(experts[ename])
+                                del experts[ename]
+
+                            data = np.stack(datas, axis=0)
+                            data_dtype = data.dtype
+
+                            if self.ftype == 0 and data_dtype == np.float16:
+                                data = data.astype(np.float32)
+
+                            if self.ftype == 1 and data_dtype == np.float32:
+                                data = data.astype(np.float16)
+
+                            merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
+
+                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+                            if new_name is None:
+                                print(f"Can not map tensor {name!r}")
+                                sys.exit()
+
+                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+                            self.gguf_writer.add_tensor(new_name, data)
+                continue
+
            # map tensor names
            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
@@ -1262,21 +1427,6 @@ class LlamaModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


-@Model.register("GrokForCausalLM")
-class GrokModel(Model):
-    model_arch = gguf.MODEL_ARCH.GROK
-
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_name("Grok")
-
-
@Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
    model_arch = gguf.MODEL_ARCH.MINICPM
@@ -1896,34 +2046,25 @@ class BertModel(Model):
            self.gguf_writer.add_pooling_type(pooling_type)

    def set_vocab(self):
-        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
+        tokens, toktypes = self.get_basic_vocab()
+        self.vocab_size = len(tokens)

        # we need this to validate the size of the token_type embeddings
        # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
+        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"

        # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                return tok
-            if tok.startswith(b"##"):
+            if tok.startswith("##"):
                return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))

        # add vocab to gguf
        self.gguf_writer.add_tokenizer_model("bert")
        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        # handle special tokens
@@ -1995,16 +2136,6 @@ class NomicBertModel(BertModel):
        super().set_gguf_parameters()
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])

-    def get_tensors(self):
-        assert self.vocab_size is not None
-        for name, data in super().get_tensors():
-            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
-                data = data[:self.vocab_size, :]
-            yield name, data
-

@Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
@@ -2180,7 +2311,8 @@ class MambaModel(Model):
                data = data.astype(np.float32)

            # if f16 desired, convert big float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+            new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
+            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+from __future__ import annotations
+
 import argparse
 import os
 import sys
@@ -33,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf

 if TYPE_CHECKING:
-    from typing import TypeAlias
+    from typing_extensions import Self, TypeAlias

 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
    faulthandler.register(signal.SIGUSR1)
@@ -139,7 +139,8 @@ class GGMLFileType(enum.IntEnum):
        dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
        if dt is None:
            raise ValueError(self)
-        # 1D tensors are always F32.
+        # Convert all 1D tensors to F32.  Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the output tensors are F32.
+        # Also, 1D tensors aren't much of a performance/size issue, so instead of having separate F32 and F16 implementations of both, just convert everything to F32 for now.
        return dt if len(tensor.shape) > 1 else DT_F32


@@ -516,7 +517,7 @@ class LlamaHfVocab(Vocab):
    tokenizer_model = "llama"
    name = "hfft"

-    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
+    def __init__(self, base_path: Path):
        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding='utf-8') as f:
@@ -524,9 +525,7 @@ class LlamaHfVocab(Vocab):

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if ignore_nonllama:
-            pass  # workaround incorrect use of this class for WordPiece
-        elif (
+        if (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
@@ -646,16 +645,17 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:


 class Tensor(ABC):
+    ndarray: NDArray
    data_type: DataType

    @abstractmethod
-    def astype(self, data_type: DataType) -> Tensor: ...
+    def astype(self, data_type: DataType) -> Self: ...
    @abstractmethod
-    def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
+    def permute(self, n_head: int, n_head_kv: int) -> Self: ...
    @abstractmethod
-    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
    @abstractmethod
-    def part(self, n_part: int) -> UnquantizedTensor: ...
+    def part(self, n_part: int) -> Self: ...
    @abstractmethod
    def to_ggml(self) -> GGMLCompatibleTensor: ...

@@ -672,13 +672,13 @@ class UnquantizedTensor(Tensor):
        self.ndarray = ndarray
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]

-    def astype(self, data_type: DataType) -> Tensor:
+    def astype(self, data_type: DataType) -> UnquantizedTensor:
        dtype = data_type.dtype
        if self.data_type == DT_BF16:
            self.ndarray = bf16_to_fp32(self.ndarray)
        return UnquantizedTensor(self.ndarray.astype(dtype))

-    def to_ggml(self) -> UnquantizedTensor:
+    def to_ggml(self) -> Self:
        return self

    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -828,6 +828,15 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)


+def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
+    def load() -> Tensor:
+        tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
+        return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
+    s = lazy_tensors[0].shape.copy()
+    s.insert(0, len(lazy_tensors))
+    return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
+
+
 # Functionality that simulates `torch.load` but where individual tensors are
 # only loaded into memory on demand, not all at once.
 # PyTorch can't do this natively as of time of writing:
@@ -1246,6 +1255,22 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->

    tmp = model

+    # merge experts into one tensor
+    if params.n_experts and params.n_experts > 0:
+        for i_l in range(params.n_layer):
+            for w in range(1, 4):
+                experts = []
+                for e in range(params.n_experts):
+                    if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
+                        experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
+                        del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
+                    elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
+                        experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
+                        del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
+                    else:
+                        raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
+                tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
+
    # HF models permut or pack some of the tensors, so we need to undo that
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
@@ -1325,7 +1350,7 @@ def load_some_model(path: Path) -> ModelPlus:
    # Be extra-friendly and accept either a file or a directory:
    if path.is_dir():
        # Check if it's a set of safetensors files first
-        globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+        globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
        files = [file for glob in globs for file in path.glob(glob)]
        if not files:
            # Try the PyTorch patterns too, with lower priority
@@ -0,0 +1,117 @@
+## Add a new model architecture to `llama.cpp`
+
+Adding a model requires a few steps:
+
+1. Convert the model to GGUF
+2. Define the model architecture in `llama.cpp`
+3. Build the GGML graph implementation
+
+After following these steps, you can open a PR.
+
+Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
+- [main](../examples/main)
+- [imatrix](../examples/imatrix)
+- [quantize](../examples/quantize)
+- [server](../examples/server)
+
+### 1. Convert the model to GGUF
+
+This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
+Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+
+The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
+
+The required steps to implement for an HF model are:
+
+1. Define the model `Model.register` annotation in a new `Model` subclass, example:
+
+```python
+@Model.register("MyModelForCausalLM")
+class MyModel(Model):
+    model_arch = gguf.MODEL_ARCH.GROK
+```
+
+2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
+
+Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
+
+Example for `falcon` model:
+```python
+    MODEL_ARCH.FALCON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ]
+```
+
+3. Map the original tensor names to the standardized equivalent in GGUF
+
+As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
+
+Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
+
+If the tensor name is part of a repetitive layer/block, the keyword `bid` substitutes for the block index.
+
+Example for the normalization tensor in attention layers:
+
+```python
+block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
+        # Attention norm
+        MODEL_TENSOR.ATTN_NORM: (
+            "gpt_neox.layers.{bid}.input_layernorm",                # gptneox
+            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact qwen
+            "transformer.blocks.{bid}.norm_1",                      # mpt
+            ...
+        )
+}
+```
+
+`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
+
+Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
+- `Model#set_gguf_parameters`
+- `Model#set_vocab`
+- `Model#write_tensors`
+
+NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` expect it in order to process the weights.
+
+### 2. Define the model architecture in `llama.cpp`
+
+The model params and tensors layout must be defined in `llama.cpp`:
+1. Define a new `llm_arch`
+2. Define the tensors layout in `LLM_TENSOR_NAMES`
+3. Add any non standard metadata in `llm_load_hparams`
+4. Create the tensors for inference in `llm_load_tensors`
+5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+
+NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
+
+### 3. Build the GGML graph implementation
+
+This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
+
+Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+
+When implementing a new graph, please note that the underlying `ggml` backends might not support all of its operations; support for missing backend operations can be added in another PR.
+
+## GGUF specification
+
+https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+
+## Resources
+
+- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
+- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
+- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
+- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
+- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
+- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
+- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
+- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
+- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

 ```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>

 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
+./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99

 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99

 # custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 128,256,512 128,256 1,2,4,8,16,32
 ```

 ## Sample results
@@ -32,13 +32,15 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
        return 1 ;
    }

    int n_kv_max     = 2048;
+    int n_batch      = 2048;
+    int n_ubatch     = 512;
    int is_pp_shared = 0;
    int n_gpu_layers = 0;

@@ -56,23 +58,31 @@ int main(int argc, char ** argv) {
    }

    if (argc >= 4) {
-        is_pp_shared = std::atoi(argv[3]);
+        n_batch = std::atoi(argv[3]);
    }

    if (argc >= 5) {
-        n_gpu_layers = std::atoi(argv[4]);
+        n_ubatch = std::atoi(argv[4]);
    }

    if (argc >= 6) {
-        n_pp = parse_list(argv[5]);
+        is_pp_shared = std::atoi(argv[5]);
    }

    if (argc >= 7) {
-        n_tg = parse_list(argv[6]);
+        n_gpu_layers = std::atoi(argv[6]);
    }

    if (argc >= 8) {
-        n_pl = parse_list(argv[7]);
+        n_pp = parse_list(argv[7]);
+    }
+
+    if (argc >= 9) {
+        n_tg = parse_list(argv[8]);
+    }
+
+    if (argc >= 10) {
+        n_pl = parse_list(argv[9]);
    }

    // init LLM
@@ -100,7 +110,8 @@ int main(int argc, char ** argv) {

    ctx_params.seed      = 1234;
    ctx_params.n_ctx     = n_kv_max;
-    ctx_params.n_batch   = 512;
+    ctx_params.n_batch   = n_batch;
+    ctx_params.n_ubatch  = n_ubatch;

    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -158,7 +169,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
@@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
        inputs.push_back(inp);
    }

-    // add eos if not present
+    // add SEP if not present
    for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_token_eos(model)) {
-            inp.push_back(llama_token_eos(model));
+        if (inp.empty() || inp.back() != llama_token_sep(model)) {
+            inp.push_back(llama_token_sep(model));
        }
    }

@@ -0,0 +1,5 @@
+# Build script for the gbnf-validator example: a single-source executable.
+set(TARGET gbnf-validator)
+add_executable(${TARGET} gbnf-validator.cpp)
+# Install the resulting binary as a runtime artifact.
+install(TARGETS ${TARGET} RUNTIME)
+# Link against the project libraries used by gbnf-validator.cpp plus the platform thread library.
+target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
+# The source is compiled as C++11.
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -0,0 +1,132 @@
+#define LLAMA_API_INTERNAL
+
+#include "grammar-parser.h"
+#include "ggml.h"
+#include "llama.h"
+#include "unicode.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+// Feed input_str to the grammar one code point at a time.
+//
+// Returns true when, after all code points were accepted, at least one of the
+// grammar's stacks is empty.  Otherwise returns false and fills in:
+//   error_pos - index (in code points) of the rejected code point, or the number
+//               of consumed code points when the input ended too early
+//   error_msg - human readable description of the failure
+// When a code point is rejected, grammar->stacks is restored to its state before
+// that code point.
+static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+    auto decoded = decode_utf8(input_str, {});
+    const auto & cpts = decoded.first;
+
+    // the last element is not fed to the grammar
+    // (presumably a terminator appended by decode_utf8 - TODO confirm)
+    const size_t n_cpts = cpts.size() - 1;
+
+    size_t idx = 0;
+    for (; idx < n_cpts; ++idx) {
+        // keep a copy so the grammar can be rolled back if this code point is rejected
+        auto saved_stacks = grammar->stacks;
+        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, cpts[idx]);
+        if (grammar->stacks.empty()) {
+            error_pos = idx;
+            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpts[idx]) + "'";
+            grammar->stacks = saved_stacks;
+            return false;
+        }
+    }
+
+    // the input is accepted if any stack has nothing left on it
+    bool accepted = false;
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            accepted = true;
+            break;
+        }
+    }
+
+    if (!accepted) {
+        error_pos = idx;
+        error_msg = "Unexpected end of input";
+    }
+    return accepted;
+}
+
+// Map a code-point index to the byte offset of that code point in a UTF-8 string.
+// Returns str.size() when the string holds fewer than cpt_index + 1 code points.
+static size_t utf8_byte_offset(const std::string & str, size_t cpt_index) {
+    size_t n_cpts = 0;
+    for (size_t i = 0; i < str.size(); ++i) {
+        // every byte that is not a continuation byte (10xxxxxx) starts a new code point
+        if ((static_cast<unsigned char>(str[i]) & 0xC0) != 0x80) {
+            if (n_cpts == cpt_index) {
+                return i;
+            }
+            ++n_cpts;
+        }
+    }
+    return str.size();
+}
+
+// Print a validation failure to stdout: the error message, its position, and the
+// input with the offending character in bold red and the remainder in red.
+//
+//   input_str - the raw (UTF-8) input that was validated
+//   error_pos - position of the failure, counted in code points (this is how
+//               llama_sample_grammar_string reports it)
+//   error_msg - description of the failure
+//
+// error_pos is converted to a byte range before slicing input_str, so that
+// multi-byte characters are highlighted at the right place and never split.
+// For pure ASCII input the output is the same as indexing by error_pos directly.
+// NOTE(review): assumes input_str is valid UTF-8; for malformed input the
+// highlighted position may differ from what decode_utf8 counted.
+static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
+    const size_t err_begin = utf8_byte_offset(input_str, error_pos);
+    const size_t err_end   = utf8_byte_offset(input_str, error_pos + 1);
+
+    fprintf(stdout, "Input string is invalid according to the grammar.\n");
+    fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "Input string:\n");
+    fprintf(stdout, "%s", input_str.substr(0, err_begin).c_str());
+    if (err_begin < input_str.size()) {
+        // the whole offending character, however many bytes it takes
+        fprintf(stdout, "\033[1;31m%s", input_str.substr(err_begin, err_end - err_begin).c_str());
+        if (err_end < input_str.size()) {
+            fprintf(stdout, "\033[0;31m%s", input_str.substr(err_end).c_str());
+        }
+        fprintf(stdout, "\033[0m\n");
+    }
+}
+
+// Read the entire contents of `filename` into `out`.
+// Returns false if the file cannot be opened or its size cannot be determined.
+static bool read_file_to_string(const std::string & filename, std::string & out) {
+    FILE * file = fopen(filename.c_str(), "r");
+    if (!file) {
+        return false;
+    }
+
+    fseek(file, 0, SEEK_END);
+    const long size = ftell(file);
+    fseek(file, 0, SEEK_SET);
+
+    if (size < 0) {
+        // ftell failed; do not turn -1 into a huge allocation size
+        fclose(file);
+        return false;
+    }
+
+    out.assign((size_t) size, ' ');
+    size_t n_read = 0;
+    if (size > 0) {
+        n_read = fread(&out[0], 1, (size_t) size, file);
+    }
+    fclose(file);
+
+    // fread may deliver fewer bytes than ftell reported (e.g. text-mode newline
+    // translation or a read error); drop the unread tail instead of leaving
+    // filler characters in the buffer
+    out.resize(n_read);
+    return true;
+}
+
+// Validate an input file against a GBNF grammar file.
+//
+// Usage: <program> <grammar_filename> <input_filename>
+//
+// Returns 1 on usage errors, unreadable files or an unusable grammar, and 0 once
+// the validation has run (the verdict itself is printed to stdout).
+int main(int argc, char** argv) {
+    if (argc != 3) {
+        fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string grammar_filename = argv[1];
+    const std::string input_filename = argv[2];
+
+    // Read the GBNF grammar file
+    std::string grammar_str;
+    if (!read_file_to_string(grammar_filename, grammar_str)) {
+        fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
+        return 1;
+    }
+
+    // Parse the GBNF grammar
+    auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // will be empty (default) if there are parse errors
+    if (parsed_grammar.rules.empty()) {
+        fprintf(stdout, "%s: failed to parse grammar\n", __func__);
+        return 1;
+    }
+
+    // Ensure that there is a "root" node.
+    if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
+        fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
+        return 1;
+    }
+
+    // Read the input file before creating the grammar object, so that a failure
+    // here cannot leak it
+    std::string input_str;
+    if (!read_file_to_string(input_filename, input_str)) {
+        fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
+        return 1;
+    }
+
+    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+
+    // Create the LLAMA grammar
+    auto grammar = llama_grammar_init(
+            grammar_rules.data(),
+            grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    // Validate the input string against the grammar
+    size_t error_pos;
+    std::string error_msg;
+    bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
+
+    if (is_valid) {
+        fprintf(stdout, "Input string is valid according to the grammar.\n");
+    } else {
+        print_error_message(input_str, error_pos, error_msg);
+    }
+
+    // Clean up
+    llama_grammar_free(grammar);
+
+    return 0;
+}
@@ -98,35 +98,38 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();

+    // this has been adapted to the new format of storing merged experts in a single 3d tensor
+    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        const int idx  = ((int32_t *) t->op_params)[0];
-        const int n_as = ((int32_t *) t->op_params)[1];
+        const ggml_tensor * ids = t->src[2];
+        const int n_as = src0->ne[2];

-        // the top-k selected expert ids are stored in the src0 tensor
-        // for simplicity, always copy src0 to host, because it is small
-        // take into account that src0 is not contiguous!
-        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
-        GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
-        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
-        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
+        // the top-k selected expert ids are stored in the ids tensor
+        // for simplicity, always copy ids to host, because it is small
+        // take into account that ids is not contiguous!
+        GGML_ASSERT(ids->ne[1] == src1->ne[1]);
+        GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
+        m_ids.resize(ggml_nbytes(ids)/sizeof(int));
+        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
+
+        auto & e = m_stats[wname];
+
+        ++e.ncall;
+        // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
+        //       using the following line, we can correct for that if needed by replacing the line above with:
+        //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

        // loop over all possible experts, regardless if they are used or not in the batch
-        // this is necessary to guarantee equal number of "ncall" for each tensor
        for (int ex = 0; ex < n_as; ++ex) {
-            src0 = t->src[2 + ex];
-            wname = filter_tensor_name(src0->name);
-            auto& e = m_stats[wname];
+            size_t e_start = ex*src1->ne[0];
            if (e.values.empty()) {
-                e.values.resize(src1->ne[0], 0);
+                e.values.resize(src1->ne[0]*n_as, 0);
            }
-            else if (e.values.size() != (size_t)src1->ne[0]) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+            else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
                exit(1); //GGML_ASSERT(false);
            }
-            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
-            //       using the following line, we can correct for that if needed
-            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
-            ++e.ncall;
            if (m_params.verbosity > 1) {
                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
            }
@@ -136,7 +139,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                if (excur != ex) continue;
                const float * x = data + row * src1->ne[0];
                for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                    e.values[j] += x[j]*x[j];
+                    e.values[e_start + j] += x[j]*x[j];
                }
            }
            if (e.ncall > m_last_call) {
@@ -346,12 +349,13 @@ static void process_logits(
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
    const int n_ctx = llama_n_ctx(ctx);

    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s\n", get_system_info(params).c_str());
    }
    const bool add_bos = llama_should_add_bos_token(model);
+    GGML_ASSERT(llama_add_eos_token(model) != 1);
    LOG("add_bos: %d\n", add_bos);

    bool suff_rm_leading_spc = params.escape;
@@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
    if (ctx_guidance) {
        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

        original_prompt_len = original_inp.size();
@@ -146,7 +146,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    int n_past = 0;

    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));

    std::string system_prompt, user_prompt;
    size_t image_pos = prompt.find("<image>");
@@ -180,7 +179,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        }
    }

-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

@@ -64,13 +64,10 @@ int main(int argc, char ** argv) {
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // Tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
    std::vector<llama_token> inp;
    std::vector<llama_token> all;

-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
    all = inp;

    const int max_context_size     = llama_n_ctx(ctx);
@@ -28,10 +28,8 @@ int main(int argc, char ** argv){
    GGML_ASSERT(model != nullptr);

    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
    fprintf(stderr, "%s: tokenization done\n", __func__);


@@ -34,11 +34,8 @@ int main(int argc, char ** argv){
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);

    llama_ngram_cache ngram_cache_context;
    llama_ngram_cache ngram_cache_dynamic;
@@ -42,11 +42,8 @@ int main(int argc, char ** argv){
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);

    llama_ngram_cache ngram_cache_context;
    llama_ngram_cache ngram_cache_dynamic;
@@ -235,7 +235,7 @@ int main(int argc, char ** argv) {
            // The file exists and is not empty
            session_tokens.resize(n_ctx);
            size_t n_token_count_out = 0;
-            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+            if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                return 1;
            }
@@ -246,6 +246,7 @@ int main(int argc, char ** argv) {
    }

    const bool add_bos = llama_should_add_bos_token(model);
+    GGML_ASSERT(llama_add_eos_token(model) != 1);
    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;
@@ -255,7 +256,7 @@ int main(int argc, char ** argv) {
        if (params.chatml) {
            params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
        }
-        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
    } else {
        LOG("use session tokens\n");
        embd_inp = session_tokens;
@@ -277,10 +278,10 @@ int main(int argc, char ** argv) {
    if (ctx_guidance) {
        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

        original_prompt_len = original_inp.size();
@@ -339,14 +340,14 @@ int main(int argc, char ** argv) {
    }

    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false,   true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true,  true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false, true);

    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());

    // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
+    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);

    LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
@@ -693,7 +694,7 @@ int main(int argc, char ** argv) {
            // optionally save the session on first sample (for faster prompt loading next time)
            if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
                need_to_save_session = false;
-                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+                llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

                LOG("saved session to %s\n", path_session.c_str());
            }
@@ -935,7 +936,7 @@ int main(int argc, char ** argv) {

    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
        LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
-        llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

    llama_print_timings(ctx);
@@ -315,10 +315,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    // BOS tokens will be added for each chunk before eval

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);

    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);

@@ -454,6 +455,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    // BOS tokens will be added for each chunk before eval

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);

    std::ofstream logits_stream;
    if (!params.logits_file.empty()) {
@@ -470,7 +472,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -771,9 +773,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
    fprintf(stderr, "================================= is_spm = %d\n", is_spm);

-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
    // The tasks should be randomized so the score stabilizes quickly.
    bool randomize_tasks = true;

@@ -818,7 +817,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
        for (size_t j = 0; j < 4; j++) {
            hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
+            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
        }

        // determine the common prefix of the endings
@@ -837,7 +836,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
            hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;

-        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
+        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());

        // Delete the selected random example from the prompt
        if (randomize_tasks) {
@@ -1110,12 +1109,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

    fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);

-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
    for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
+        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);

        task.common_prefix = 0;
        for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1130,8 +1126,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            task.seq_tokens[0].size() - task.common_prefix +
            task.seq_tokens[1].size() - task.common_prefix;

-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
+        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
+        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
    }

    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1322,7 +1318,7 @@ struct multiple_choice_task {
    std::vector<float> log_probs;
 };

-static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
+static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
    if (task.question.empty() || task.mc1.answers.empty()) {
        if (log_error) {
            printf("%s: found bad task with empty question and/or answers\n", __func__);
@@ -1337,7 +1333,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
            }
            return false;
        }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
+        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
    }
    auto min_len = task.seq_tokens.front().size();
    for (auto& seq : task.seq_tokens) {
@@ -1436,9 +1432,6 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        n_task = params.multiple_choice_tasks;
    }

-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
    printf("%s: preparing task data", __func__);
    fflush(stdout);
    if (n_task > 500) {
@@ -1446,7 +1439,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        fflush(stdout);
        std::atomic<int> counter(0);
        std::atomic<int> n_bad(0);
-        auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
+        auto prepare = [&counter, &n_bad, &tasks, ctx] () {
            int num_tasks = tasks.size();
            int n_bad_local = 0;
            while (true) {
@@ -1457,7 +1450,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                }
                int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
                for (int i = first; i < last; ++i) {
-                    if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
+                    if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
                }
            }
        };
@@ -1479,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        int i_task = 0;
        for (auto& task : tasks) {
            ++i_task;
-            if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
+            if (!multiple_choice_prepare_one_task(ctx, task, true)) {
                return;
            }
            if (i_task%n_dot == 0) {
@@ -1715,6 +1708,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
    const int nv = 2*((n_vocab + 1)/2) + 4;
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);

    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -116,13 +116,13 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
    if (!in) {
        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
-        return;
+        exit(1);
    }
    int n_entries;
    in.read((char *)&n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
-        return;
+        exit(1);
    }
    for (int i = 0; i < n_entries; ++i) {
        int len; in.read((char *)&len, sizeof(len));
@@ -130,11 +130,11 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
-            return;
+            exit(1);
        }
        name_as_vec[len] = 0;
        std::string name{name_as_vec.data()};
-        auto & e = imatrix_data[std::move(name)];
+        auto & e = imatrix_data[name];
        int ncall;
        in.read((char *)&ncall, sizeof(ncall));
        int nval;
@@ -142,18 +142,22 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
        if (in.fail() || nval < 1) {
            printf("%s: failed reading number of values for entry %d\n", __func__, i);
            imatrix_data = {};
-            return;
+            exit(1);
        }
        e.resize(nval);
        in.read((char *)e.data(), nval*sizeof(float));
        if (in.fail()) {
            printf("%s: failed reading data for entry %d\n", __func__, i);
            imatrix_data = {};
-            return;
+            exit(1);
        }
        if (ncall > 0) {
            for (auto& v : e) v /= ncall;
        }
+
+        if (getenv("LLAMA_TRACE")) {
+            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
+        }
    }
    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
 }
@@ -24,6 +24,7 @@ int main(int argc, char ** argv) {

    std::string result0;
    std::string result1;
+    std::string result2;

    // init
    llama_model * model;
@@ -44,8 +45,8 @@ int main(int argc, char ** argv) {

    // save state (rng, logits, embedding and kv_cache) to file
    {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
-        const size_t written = llama_copy_state_data(ctx, state_mem.data());
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
+        const size_t written = llama_state_get_data(ctx, state_mem.data());

        FILE *fp_write = fopen("dump_state.bin", "wb");
        fwrite(state_mem.data(), 1, written, fp_write);
@@ -97,13 +98,13 @@ int main(int argc, char ** argv) {

    // load state (rng, logits, embedding and kv_cache) from file
    {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));

        FILE * fp_read = fopen("dump_state.bin", "rb");
        const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
        fclose(fp_read);

-        if (read != llama_set_state_data(ctx2, state_mem.data())) {
+        if (read != llama_state_set_data(ctx2, state_mem.data())) {
            fprintf(stderr, "\n%s : failed to read state\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
@@ -141,16 +142,104 @@ int main(int argc, char ** argv) {
        n_past += 1;
    }

-    printf("\n");
+    printf("\n\n");

    llama_free(ctx2);
-    llama_free_model(model);

    if (result0 != result1) {
        fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
        return 1;
    }

+    // make new context
+    auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+    printf("\nsingle seq run: %s", params.prompt.c_str());
+
+    // load state (rng, logits, embedding and kv_cache) from file
+    {
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+
+        FILE * fp_read = fopen("dump_state.bin", "rb");
+        const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        fclose(fp_read);
+
+        if (read != llama_state_set_data(ctx3, state_mem.data())) {
+            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+
+        fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
+    }
+
+    // restore state (last tokens)
+    n_past = n_past_saved;
+
+    // save seq 0 and load into seq 1
+    {
+        // save kv of seq 0
+        std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
+        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+        if (ncopy != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
+
+        // erase whole kv
+        llama_kv_cache_clear(ctx3);
+        fprintf(stderr, "%s : kv cache cleared\n", __func__);
+
+        // restore kv into seq 1
+        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+        if (nset != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
+    }
+
+    // third run with seq 1 instead of 0
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto * logits = llama_get_logits(ctx3);
+        auto n_vocab = llama_n_vocab(model);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx3, &candidates_p);
+        auto next_token_str = llama_token_to_piece(ctx3, next_token);
+
+        printf("%s", next_token_str.c_str());
+        result2 += next_token_str;
+
+        if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        n_past += 1;
+    }
+
+    printf("\n");
+
+    llama_free(ctx3);
+    llama_free_model(model);
+
+    if (result0 != result2) {
+        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
+        return 1;
+    }
+
    fprintf(stderr, "\n%s : success\n", __func__);

    return 0;
@@ -16,52 +16,51 @@ The project is under active development, and we are [looking for feedback and co

 **Command line options:**

- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching, this parameter is used only if one token is to be processed on CPU backend.
+- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
+- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused).
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused).
- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
+- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused
+- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of `4096`.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
+- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance.
+- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`
+- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa STRATEGY`: Attempt one of the below optimization strategies  that help on some NUMA systems
+- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
 - `--numa distribute`: Spread execution evenly over all nodes
 - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl
-if run without this previously, it is recommended to drop the system page cache before using this
-see https://github.com/ggerganov/llama.cpp/issues/1437
+- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
+page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437

- `--numa`: Attempt optimizations that help on some NUMA systems.
+- `--numa`: Attempt optimizations that may help on some NUMA systems.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
- `--port`: Set the port to listen. Default: `8080`.
- `--path`: path from which to serve static files (default: disabled)
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
- `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`
+- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`
+- `--port`: Set the port to listen. Default: `8080`
+- `--path`: Path from which to serve static files. Default: disabled
+- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
+- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
+- `--embedding`: Enable embedding extraction. Default: disabled
+- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
+- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching).  Default: disabled
+- `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
+- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled.
+- `--grp-attn-w`: Set the group attention width to extend context size through self-extend.  Used together with group attention factor `--grp-attn-n`. Default: `512`
+- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, not to `llama.log`. default: enabled.
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
+- `--metrics`: Enable Prometheus-compatible `/metrics` endpoint. Default: disabled
+- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
+- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name.  Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
+- `--log-format FORMAT`: Define the log output to FORMAT: json or text. Default: `json`

 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@@ -69,7 +68,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437

 ## Build

-server is build alongside everything else from the root of the project
+`server` is built alongside everything else from the root of the project

 - Using `make`:

@@ -85,7 +84,7 @@ server is build alongside everything else from the root of the project

 ## Build with SSL

-server can also be built with SSL support using OpenSSL 3
+`server` can also be built with SSL support using OpenSSL 3

 - Using `make`:

@@ -135,7 +134,7 @@ docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/

 ## Testing with CURL

-Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
+Using [curl](https://curl.se/). On Windows, `curl.exe` should be available in the base OS.

 ```sh
 curl --request POST \
@@ -159,7 +158,7 @@ mkdir llama-client
 cd llama-client
 ```

-Create a index.js file and put inside this:
+Create an index.js file and put this inside:

 ```javascript
 const prompt = `Building a website can be done in 10 simple steps:`;
@@ -190,8 +189,8 @@ node index.js
  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
  - 500 -> `{"status": "error"}` if the model failed to load.
  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
-  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
-  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.
+  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
+  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.

  If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.

@@ -205,75 +204,75 @@ node index.js
      - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
      - The system prompt is empty

-    `temperature`: Adjust the randomness of the generated text (default: 0.8).
+    `temperature`: Adjust the randomness of the generated text. Default: `0.8`

-    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).
+    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]`. Default: `0.0`, which is disabled.

-    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
+    `dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`

-    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
+    `top_k`: Limit the next token selection to the K most probable tokens.  Default: `40`

-    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
+    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`

-    `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05).
+    `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`

-    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
+    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.

    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
-    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.
+    By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.

    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

    `stop`: Specify a JSON array of stopping strings.
-    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
+    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`

-    `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
+    `tfs_z`: Enable tail free sampling with parameter z. Default: `1.0`, which is disabled.

-    `typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
+    `typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.

-    `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).
+    `repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`

-    `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
+    `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.

-    `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).
+    `penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`

-    `presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).
+    `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.

-    `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);
+    `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.

-    `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens (default: `null` = use the original `prompt`).
+    `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.

-    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
+    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.

-    `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+    `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`

-    `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+    `mirostat_eta`: Set the Mirostat learning rate, parameter eta.  Default: `0.1`

-    `grammar`: Set grammar for grammar-based sampling (default: no grammar)
+    `grammar`: Set grammar for grammar-based sampling.  Default: no grammar

-    `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
+    `seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.

-    `ignore_eos`: Ignore end of stream token and continue generating (default: false).
+    `ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`

-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`

-    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
+    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`

-    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)
+    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`

    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

-    `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
+    `id_slot`: Assign the completion task to a specific slot. If it is `-1`, the task will be assigned to an idle slot. Default: `-1`

-    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)
+    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch.  Default: `false`

    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

-    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values)
+    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.

 ### Result JSON

- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
+- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until the end of the completion.

 - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:

@@ -287,7 +286,7 @@ node index.js
    },
    {
      "prob": float,
-      "tok_str": "<second most likely tonen>"
+      "tok_str": "<second most likely token>"
    },
    ...
  ]
@@ -357,14 +356,14 @@ Notice that each `probs` is an array of length `n_probs`.

 - `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
 - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
+- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)

 - **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used.

    *Options:*

-    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported.
+    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.

    *Examples:*

@@ -514,16 +513,67 @@ Available metrics:
 - `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
 - `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
 - `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
+- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. `1` means 100 percent usage.
 - `llamacpp:kv_cache_tokens`: KV-cache tokens.
- `llamacpp:requests_processing`: Number of request processing.
- `llamacpp:requests_deferred`: Number of request deferred.
+- `llamacpp:requests_processing`: Number of requests being processed.
+- `llamacpp:requests_deferred`: Number of requests deferred.
+
+- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
+
+    *Options:*
+
+    `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_saved": 1745,
+    "n_written": 14309796,
+    "timings": {
+        "save_ms": 49.865
+    }
+}
+```
+
+- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
+
+    *Options:*
+
+    `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_restored": 1745,
+    "n_read": 14309796,
+    "timings": {
+        "restore_ms": 42.937
+    }
+}
+```
+
+- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "n_erased": 1745
+}
+```

 ## More examples

 ### Change system prompt on runtime

-To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
+To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.

 `prompt`: Specify a context that you want all connecting clients to respect.

@@ -562,11 +612,11 @@ bash chat.sh

 ### OAI-like API

-The HTTP server supports OAI-like API: https://github.com/openai/openai-openapi
+The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi

 ### API errors

-Server returns error in the same format as OAI: https://github.com/openai/openai-openapi
+`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi

 Example of an error:

@@ -2,13 +2,15 @@

 Benchmark is using [k6](https://k6.io/).

-##### Install k6
+##### Install k6 and sse extension

-Follow instruction from: https://k6.io/docs/get-started/installation/
+SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.

-Example for ubuntu:
+Example:
 ```shell
-snap install k6
+go install go.k6.io/xk6/cmd/xk6@latest
+xk6 build master \
+--with github.com/phymbert/xk6-sse
 ```

 #### Download a dataset
@@ -46,7 +48,7 @@ server --host localhost --port 8080 \

 For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
 ```shell
-k6 run script.js --duration 10m --iterations 500 --vus 8
+./k6 run script.js --duration 10m --iterations 500 --vus 8
 ```

 The benchmark values can be overridden with:
@@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with:
 ```shell
 curl http://localhost:8080/metrics
 ```
+
+### Using the CI python script
+The `bench.py` script does several steps:
+- start the server
+- set the appropriate variables for k6
+- run k6 script
+- extract metrics from prometheus
+
+It is intended to be used in the CI, but you can also run it manually:
+
+```shell
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+              --runner-label local \
+              --name local \
+              --branch `git rev-parse --abbrev-ref HEAD` \
+              --commit `git rev-parse HEAD` \
+              --scenario script.js \
+              --duration 5m \
+              --hf-repo ggml-org/models	 \
+              --hf-file phi-2/ggml-model-q4_0.gguf \
+              --model-path-prefix models \
+              --parallel 4 \
+              -ngl 33 \
+              --batch-size 2048 \
+              --ubatch-size	256 \
+              --ctx-size 4096 \
+              --n-prompts 200 \
+              --max-prompt-tokens 256 \
+              --max-tokens 256
+```
@@ -16,6 +16,7 @@ import matplotlib
 import matplotlib.dates
 import matplotlib.pyplot as plt
 import requests
+from statistics import mean


 def main(args_in: list[str] | None = None) -> None:
@@ -75,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None:
                            data['metrics'][metric_name][metric_metric]=value
                            github_env.write(
                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
-                token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
                iterations = data['root_group']['checks']['success completion']['passes']

    except Exception:
@@ -109,6 +109,7 @@ def main(args_in: list[str] | None = None) -> None:

    # Prometheus
    end_time = time.time()
+    prometheus_metrics = {}
    if is_server_listening("0.0.0.0", 9090):
        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
@@ -127,6 +128,7 @@ def main(args_in: list[str] | None = None) -> None:
                values = metric_data['data']['result'][0]['values']
                timestamps, metric_values = zip(*values)
                metric_values = [float(value) for value in metric_values]
+                prometheus_metrics[metric] = metric_values
                timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
                plt.figure(figsize=(16, 10), dpi=80)
                plt.plot(timestamps_dt, metric_values, label=metric)
@@ -176,17 +178,20 @@ xychart-beta

    # 140 chars max for commit status description
    bench_results = {
+        "i": iterations,
        "req": {
-            "p90": data['metrics']["http_req_duration"]["p(90)"],
-            "avg": data['metrics']["http_req_duration"]["avg"],
+            "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
+            "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
        },
        "pp": {
-            "p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
-            "avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
+            "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
+            "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
+            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
        },
        "tg": {
-            "p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
-            "avg": data['metrics']["llamacpp_tokens_second"]["avg"],
+            "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
+            "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
+            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
        },
    }
    with open("results.github.env", 'a') as github_env:
@@ -200,7 +205,7 @@ xychart-beta


 def start_benchmark(args):
-    k6_path = 'k6'
+    k6_path = './k6'
    if 'BENCH_K6_BIN_PATH' in os.environ:
        k6_path = os.environ['BENCH_K6_BIN_PATH']
    k6_args = [
@@ -1,4 +1,4 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {

 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')

 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -86,35 +88,62 @@ export default function () {
            }
        ],
        "model": model,
-        "stream": false,
+        "stream": true,
+        "seed": 42,
        "max_tokens": max_tokens
    }

-    const body = JSON.stringify(payload)
+    const params = {method: 'POST', body: JSON.stringify(payload)};

-    let res = http.post(`${server_url}/chat/completions`, body, {
-        headers: {'Content-Type': 'application/json'},
-        timeout: '300s'
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+            }
+
+            let chunk = JSON.parse(event.data)
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
+
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
+
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error());
+            throw e;
+        })
    })

    check(res, {'success completion': (r) => r.status === 200})

-    if (res.status === 200) {
-        const completions = res.json()
+    const endTime = new Date()

-        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
-
-        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
-        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
-
-        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
-    } else {
-        console.error(`response: ${res.body} request=${payload}`)
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
    }

+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+    }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
+
    sleep(0.3)
 }
@@ -51,6 +51,26 @@
      margin-bottom: 0.5em;
    }

+  button, input, textarea, .button, a.button, select {
+      color: #666;
+      border: 1px solid #ddd;
+      border-radius: 4px;
+      line-height: 1.5em;
+      padding: 0.25em 0.25em;
+      text-decoration: none;
+      font-size: 1.1rem;
+    }
+
+    button {
+      border: 1px solid #2a8aad;
+      background: #3584e4;
+      font-weight: normal;
+      color: #fff;
+    }
+    button:disabled {
+      background: #9cbce5;
+    }
+
    #write form {
      margin: 1em 0 0 0;
      display: flex;
@@ -222,6 +242,7 @@
      temperature: 0.7,
      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
      repeat_penalty: 1.18, // 1.0 = disabled
+      penalize_nl: false,
      top_k: 40, // <= 0 to use vocab size
      top_p: 0.95, // 1.0 = disabled
      min_p: 0.05, // 0 = disabled
@@ -405,7 +426,7 @@
        throw new Error("already running");
      }
      controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: document.baseURI.replace(/\/+$/, '') })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
        const data = chunk.data;

        if (data.stop) {
@@ -566,7 +587,7 @@
        runCompletion();
      }
      return html`
-        <div>
+        <div class="right">
          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
          <button onclick=${reset}>Reset</button>
@@ -627,6 +648,7 @@
      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
+      const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }

      const grammarJsonSchemaPropOrder = signal('')
      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
@@ -670,6 +692,15 @@
        `
      };

+      const BoolField = ({ label, name, value }) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} />
+          </div>
+        `
+      };
+
      const userTemplateReset = (e) => {
        e.preventDefault();
        userTemplateResetToDefaultAndApply()
@@ -769,6 +800,7 @@
            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
+            ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
            ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
@@ -1003,6 +1035,10 @@
    }

    function App(props) {
+      useEffect(() => {
+        const query = new URLSearchParams(location.search).get("q");
+        if (query) chat(query);
+      }, []);

      return html`
        <div class="mode-${session.value.type}">
@@ -61,7 +61,10 @@ enum server_task_type {
    SERVER_TASK_TYPE_COMPLETION,
    SERVER_TASK_TYPE_CANCEL,
    SERVER_TASK_TYPE_NEXT_RESPONSE,
-    SERVER_TASK_TYPE_METRICS
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
 };

 struct server_task {
@@ -128,6 +131,7 @@ struct server_params {

    bool slots_endpoint   = true;
    bool metrics_endpoint = false;
+    std::string slot_save_path;
 };

 struct server_slot {
@@ -685,6 +689,7 @@ struct server_context {
        n_ctx = llama_n_ctx(ctx);

        add_bos_token = llama_should_add_bos_token(model);
+        GGML_ASSERT(llama_add_eos_token(model) != 1);

        return true;
    }
@@ -754,7 +759,7 @@ struct server_context {
        metrics.init();
    }

-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
        // TODO: currently, we tokenize using special tokens by default
        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
        //       but it's better compared to completely ignoring ChatML and other chat templates
@@ -772,7 +777,7 @@ struct server_context {

                    std::vector<llama_token> p;
                    if (first) {
-                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
                        first = false;
                    } else {
                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
@@ -789,7 +794,7 @@ struct server_context {
            }
        } else {
            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
        }

        return prompt_tokens;
@@ -1054,7 +1059,7 @@ struct server_context {
        system_tokens.clear();

        if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, true);

            llama_batch_clear(batch);

@@ -1612,6 +1617,107 @@ struct server_context {
                    }
                    queue_results.send(res);
                } break;
+            case SERVER_TASK_TYPE_SLOT_SAVE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const size_t token_count = slot->cache_tokens.size();
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];
+
+                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_save_ms = (t_end - t_start) / 1000.0;
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot",   id_slot },
+                        { "filename",  filename },
+                        { "n_saved",   token_count }, // tokens saved
+                        { "n_written", nwrite },      // bytes written
+                        { "timings", {
+                            { "save_ms", t_save_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_RESTORE: // reload a slot's token cache and sequence state from a file
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"]; // echoed back in the result only
+                    std::string filepath = task.data["filepath"]; // the path actually opened
+
+                    slot->cache_tokens.resize(slot->n_ctx); // size the buffer to n_ctx entries before loading; trimmed to token_count below
+                    size_t token_count = 0;
+                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count); // NOTE(review): sequence id is slot->id + 1, as in the save/erase cases - presumably seq 0 is reserved; confirm
+                    if (nread == 0) {
+                        slot->cache_tokens.resize(0); // drop the pre-sized buffer so the slot is left with an empty token cache
+                        send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+                    slot->cache_tokens.resize(token_count); // keep only the tokens reported as loaded
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_restore_ms = (t_end - t_start) / 1000.0; // elapsed time scaled by 1/1000 for the "restore_ms" field
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot",    id_slot },
+                        { "filename",   filename },
+                        { "n_restored", token_count }, // tokens restored
+                        { "n_read",     nread },       // bytes read
+                        { "timings", {
+                            { "restore_ms", t_restore_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_ERASE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    // Erase token cache
+                    const size_t n_erased = slot->cache_tokens.size();
+                    llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
+                    slot->cache_tokens.clear();
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot",  id_slot },
+                        { "n_erased", n_erased }
+                    };
+                    queue_results.send(result);
+                } break;
        }
    }

@@ -1809,7 +1915,7 @@ struct server_context {
                            prefix_tokens.push_back(llama_token_middle(model));
                            prompt_tokens = prefix_tokens;
                        } else {
-                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token);  // add BOS if there isn't system prompt
+                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                        }

                        slot.n_past = 0;
@@ -2189,8 +2295,6 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf("                            KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
    printf("  -b N, --batch-size N      logical maximum batch size (default: %d)\n", params.n_batch);
    printf("  -ub N, --ubatch-size N    physical maximum batch size (default: %d)\n", params.n_ubatch);
-    printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
    if (llama_supports_mlock()) {
        printf("  --mlock                   force system to keep model in RAM rather than swapping or compressing\n");
    }
@@ -2213,6 +2317,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
        printf("                            fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
        printf("  -mg i, --main-gpu i       the GPU to use for the model (with split-mode = none),\n");
        printf("                            or for intermediate results and KV (with split-mode = row)\n");
+        printf("  -nkvo, --no-kv-offload\n");
+        printf("                            disable KV offload\n");
    }
    printf("  -m FNAME, --model FNAME\n");
    printf("                            model path (default: %s)\n", params.model.c_str());
@@ -2249,6 +2355,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf("  --log-disable             disables logging to a file.\n");
    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
    printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
+    printf("  --slot-save-path PATH     path to save slot kv cache (default: disabled)\n");
    printf("\n");
    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
    printf("  --override-kv KEY=TYPE:VALUE\n");
@@ -2498,6 +2605,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                    "See main README.md for information on enabling GPU BLAS support",
                    {{"n_gpu_layers", params.n_gpu_layers}});
            }
+        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+            params.no_kv_offload = true;
        } else if (arg == "--split-mode" || arg == "-sm") {
            if (++i >= argc) {
                invalid_param = true;
@@ -2655,6 +2764,16 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
            sparams.slots_endpoint = false;
        } else if (arg == "--metrics") {
            sparams.metrics_endpoint = true;
+        } else if (arg == "--slot-save-path") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.slot_save_path = argv[i];
+            // guarantee a trailing DIRECTORY_SEPARATOR so a file name can be appended to the path as-is
+            if (!sparams.slot_save_path.empty() && sparams.slot_save_path.back() != DIRECTORY_SEPARATOR) {
+                sparams.slot_save_path += DIRECTORY_SEPARATOR;
+            }
        } else if (arg == "--chat-template") {
            if (++i >= argc) {
                invalid_param = true;
@@ -3157,6 +3276,112 @@ int main(int argc, char ** argv) {
        res.status = 200; // HTTP OK
    };

+    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data["filename"];
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_SAVE;
+        task.data = {
+            { "id_slot", id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data["filename"];
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
+        task.data = {
+            { "id_slot", id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_erase = [&ctx_server, &res_error](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_ERASE;
+        task.data = {
+            { "id_slot", id_slot },
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { // parses the slot id and "action" parameter, then forwards to the save/restore/erase handler
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+        std::string id_slot_str = req.path_params.at("id_slot"); // taken from the request path parameters
+        int id_slot;
+
+        try {
+            id_slot = std::stoi(id_slot_str);
+        } catch (const std::exception &) { // std::stoi could not produce an int from the path value
+            res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        std::string action = req.get_param_value("action"); // selects which handler runs below
+
+        if (action == "save") {
+            handle_slots_save(req, res, id_slot);
+        } else if (action == "restore") {
+            handle_slots_restore(req, res, id_slot);
+        } else if (action == "erase") {
+            handle_slots_erase(req, res, id_slot);
+        } else { // any other (or missing) action value is rejected
+            res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
+        }
+    };
+
    const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        json data = {
@@ -3519,6 +3744,10 @@ int main(int argc, char ** argv) {
    svr->Post("/v1/embeddings",       handle_embeddings);
    svr->Post("/tokenize",            handle_tokenize);
    svr->Post("/detokenize",          handle_detokenize);
+    if (!sparams.slot_save_path.empty()) {
+        // only enable slot endpoints if slot_save_path is set
+        svr->Post("/slots/:id_slot",  handle_slots_action);
+    }

    //
    // Start the server
@@ -0,0 +1,58 @@
+@llama.cpp
+@slotsave
+Feature: llama.cpp server slot management
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   prompt caching is enabled
+    And   2 slots
+    And   . as slot save path
+    And   2048 KV cache size
+    And   42 as server seed
+    And   24 max tokens to predict
+    Then  the server is starting
+    Then  the server is healthy
+
+  Scenario: Save and Restore Slot
+    # First prompt in slot 1 should be fully processed
+    Given a user prompt "What is the capital of France?"
+    And   using slot id 1
+    And   a completion request with no api error
+    Then  24 tokens are predicted matching (Lily|cake)
+    And   22 prompt tokens are processed
+    When  the slot 1 is saved with filename "slot1.bin"
+    Then  the server responds with status code 200
+    # Since we have cache, this should only process the last tokens
+    Given a user prompt "What is the capital of Germany?"
+    And   a completion request with no api error
+    Then  24 tokens are predicted matching (Thank|special)
+    And   7 prompt tokens are processed
+    # Loading the original cache into slot 0,
+    # we should only be processing 1 prompt token and get the same output
+    When  the slot 0 is restored with filename "slot1.bin"
+    Then  the server responds with status code 200
+    Given a user prompt "What is the capital of France?"
+    And   using slot id 0
+    And   a completion request with no api error
+    Then  24 tokens are predicted matching (Lily|cake)
+    And   1 prompt tokens are processed
+    # For verification that slot 1 was not corrupted during slot 0 load, same thing
+    Given a user prompt "What is the capital of Germany?"
+    And   using slot id 1
+    And   a completion request with no api error
+    Then  24 tokens are predicted matching (Thank|special)
+    And   1 prompt tokens are processed
+
+  Scenario: Erase Slot
+    Given a user prompt "What is the capital of France?"
+    And   using slot id 1
+    And   a completion request with no api error
+    Then  24 tokens are predicted matching (Lily|cake)
+    And   22 prompt tokens are processed
+    When  the slot 1 is erased
+    Then  the server responds with status code 200
+    Given a user prompt "What is the capital of France?"
+    And   a completion request with no api error
+    Then  24 tokens are predicted matching (Lily|cake)
+    And   22 prompt tokens are processed
@@ -49,6 +49,9 @@ def step_server_config(context, server_fqdn, server_port):
    context.n_predict = None
    context.n_prompts = 0
    context.n_server_predict = None
+    context.slot_save_path = None
+    context.id_slot = None
+    context.cache_prompt = None
    context.n_slots = None
    context.prompt_prefix = None
    context.prompt_suffix = None
@@ -119,6 +122,21 @@ def step_server_n_predict(context, n_predict):
    context.n_server_predict = n_predict


+@step('{slot_save_path} as slot save path')
+def step_slot_save_path(context, slot_save_path):
+    context.slot_save_path = slot_save_path
+
+
+@step('using slot id {id_slot:d}')
+def step_id_slot(context, id_slot):
+    context.id_slot = id_slot
+
+
+@step('prompt caching is enabled')
+def step_enable_prompt_cache(context):
+    context.cache_prompt = True
+
+
@step('continuous batching')
 def step_server_continuous_batching(context):
    context.server_continuous_batching = True
@@ -212,6 +230,8 @@ async def step_request_completion(context, api_error):
                                          context.base_url,
                                          debug=context.debug,
                                          n_predict=context.n_predict,
+                                          cache_prompt=context.cache_prompt,
+                                          id_slot=context.id_slot,
                                          seed=await completions_seed(context),
                                          expect_api_error=expect_api_error,
                                          user_api_key=context.user_api_key)
@@ -711,12 +731,48 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
    await asyncio.sleep(0.1)


+@step('the slot {slot_id:d} is saved with filename "{filename}"')
+@async_run_until_complete
+async def step_save_slot(context, slot_id, filename):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
+                                json={"filename": filename},
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the slot {slot_id:d} is restored with filename "{filename}"')
+@async_run_until_complete
+async def step_restore_slot(context, slot_id, filename):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
+                                json={"filename": filename},
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the slot {slot_id:d} is erased')
+@async_run_until_complete
+async def step_erase_slot(context, slot_id):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the server responds with status code {status_code:d}')
+def step_server_responds_with_status_code(context, status_code):
+    assert context.response.status == status_code
+
+
 async def request_completion(prompt,
                             base_url,
                             debug=False,
                             prompt_prefix=None,
                             prompt_suffix=None,
                             n_predict=None,
+                             cache_prompt=False,
+                             id_slot=None,
                             seed=None,
                             expect_api_error=None,
                             user_api_key=None):
@@ -738,6 +794,8 @@ async def request_completion(prompt,
                                    "prompt": prompt,
                                    "input_suffix": prompt_suffix,
                                    "n_predict": n_predict if n_predict is not None else -1,
+                                    "cache_prompt": cache_prompt,
+                                    "id_slot": id_slot,
                                    "seed": seed if seed is not None else 42
                                },
                                headers=headers,
@@ -1104,6 +1162,8 @@ def start_server_background(context):
        server_args.extend(['--parallel', context.n_slots])
    if context.n_server_predict:
        server_args.extend(['--n-predict', context.n_server_predict])
+    if context.slot_save_path:
+        server_args.extend(['--slot-save-path', context.slot_save_path])
    if context.server_api_key:
        server_args.extend(['--api-key', context.server_api_key])
    if context.n_ga:
@@ -49,12 +49,23 @@ extern bool server_log_json;
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
+
 template <typename T>
 static T json_value(const json &body, const std::string &key, const T &default_value) {
    // Fallback null to default value
-    return body.contains(key) && !body.at(key).is_null()
-        ? body.value(key, default_value)
-        : default_value;
+    if (body.contains(key) && !body.at(key).is_null()){ // key present and not null: try to read it as T
+        try {
+            return body.value(key, default_value);
+        }
+        catch (json::type_error const&){ // public alias of the library's type_error; avoids hard-coding the versioned ABI namespace
+            std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
+            server_log("WARN", __func__, __LINE__, message.c_str(), body); // report the offending body, then fall back instead of propagating
+            return default_value;
+        }
+    } else {
+        return default_value; // key missing or explicitly null
+    }
 }

 static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
@@ -556,6 +567,15 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
        {"model",   modelname},
        {"object",  "chat.completion.chunk"}
    };
+    if (!finish_reason.empty()) {
+        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+        int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+        ret.push_back({"usage", json {
+            {"completion_tokens", num_tokens_predicted},
+            {"prompt_tokens",     num_prompt_tokens},
+            {"total_tokens",      num_tokens_predicted + num_prompt_tokens}
+        }});
+    }

    return std::vector<json>({ret});
 }
@@ -76,6 +76,28 @@ int main(int argc, char ** argv) {
    params.n_threads_batch = params.n_threads_batch_draft;
    std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

+    const int vocab_type_tgt = llama_vocab_type(model_tgt); // keep the full value: a bool would collapse every non-zero vocab type to 1
+    LOG("vocab_type tgt: %d\n", vocab_type_tgt);
+
+    const int vocab_type_dft = llama_vocab_type(model_dft); // same reasoning as for the target model
+    LOG("vocab_type dft: %d\n", vocab_type_dft);
+
+    if (vocab_type_tgt != vocab_type_dft) { // with int storage, two different non-zero vocab types are now detected as a mismatch
+        fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
+        fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+        return 1;
+    }
+
+    if (
+        llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
+        llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
+        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
+    ) {
+        fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
+        return 1;
+    }
+
    {
        const int n_vocab_tgt = llama_n_vocab(model_tgt);
        const int n_vocab_dft = llama_n_vocab(model_dft);
@@ -105,20 +127,8 @@ int main(int argc, char ** argv) {


    // Tokenize the prompt
-    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
-    LOG("add_bos tgt: %d\n", add_bos_tgt);
-
-    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
-    LOG("add_bos dft: %d\n", add_bos_dft);
-
-    if (add_bos_tgt != add_bos_dft) {
-        fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
-        fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
-        return 1;
-    }
-
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);

    const int max_context_size     = llama_n_ctx(ctx_tgt);
    const int max_tokens_list_size = max_context_size - 4;
@@ -26,11 +26,9 @@ int main(int argc, char ** argv) {
    llama_context_params ctx_params = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-    const bool add_bos = llama_should_add_bos_token(model);
-
    std::vector<llama_token> tokens;

-    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+    tokens = ::llama_tokenize(model, prompt, true, true);

    for (int i = 0; i < (int) tokens.size(); i++) {
        if (printing_ids) {
@@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1709336216,
-        "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
+        "lastModified": 1712014858,
+        "narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
+        "rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1711163522,
-        "narHash": "sha256-YN/Ciidm+A0fmJPWlHBGvVkcarYWSC+s3NTPk/P+q3c=",
+        "lastModified": 1712163089,
+        "narHash": "sha256-Um+8kTIrC19vD4/lUCN9/cU9kcOsD1O1m+axJqQPyMM=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "44d0940ea560dee511026a53f0e2e2cde489b4d4",
+        "rev": "fd281bd6b7d3e32ddfa399853946f782553163b5",
        "type": "github"
      },
      "original": {
@@ -37,11 +37,11 @@
    "nixpkgs-lib": {
      "locked": {
        "dir": "lib",
-        "lastModified": 1709237383,
-        "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
+        "lastModified": 1711703276,
+        "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
+        "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
        "type": "github"
      },
      "original": {
@@ -137,7 +137,7 @@ extern "C" {
    /*
      Example usage:

-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
        // preferrably to run on the same backend as the buffer
        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

@@ -447,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_

 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_SYCL)
+
 #include <cstdint>

-#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
-#define GGML_TABLE_END() });
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };

 #define GGML_COMMON_IMPL
 #endif
@@ -401,10 +401,8 @@ GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t
 GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+    if (tensor->view_src != NULL) {
        assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
        return;
    }

@@ -1227,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(

    // the main device has a larger memory buffer to hold the results from all GPUs
    // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = id == ctx.device ? ne0 : row_diff;
+    int64_t ldc = id == ctx.device ? ne0 : row_diff;

    const int compute_capability = ggml_cuda_info().devices[id].cc;

@@ -1379,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
    const int64_t ne0 = dst->ne[0];
    const int64_t ne1 = dst->ne[1];

-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    const int64_t nb2 = dst->nb[2];
+    const int64_t nb3 = dst->nb[3];

    GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
@@ -1962,227 +1960,49 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    }
 }

-#if 0
-template<typename ... Srcs>
-static __global__ void k_compute_batched_ptrs_id(
-        const void ** ptrs_src, void ** ptrs_dst,
-        int ne12, int ne13,
-        int ne23,
-        int nb02, int nb03,
-        int nb12, int nb13,
-        int nb2, int nb3,
-        int r2, int r3,
-        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
-        const half * src1_f16, half * dst_f16,
-        const int32_t * ids, const int id,
-        Srcs... src0s) {
-
-    int i = ids[id];
-
-    half * src0_f16;
-    const void * srcs_ar[] = { (const half *) src0s... };
-    if (src0_type == GGML_TYPE_F16) {
-        src0_f16 = (half *) srcs_ar[i];
-    } else {
-        src0_f16 = src0_as_f16;
-        if (threadIdx.x == 0 && threadIdx.y == 0) {
-            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
-            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
-        }
-    }
-
-    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int i03 = i13 / r3;
-    int i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
-}
-
-static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src00 = dst->src[2];
-
-    const int id = dst->op_params[0];
-
-    GGML_ASSERT(!ggml_is_transposed(src00));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src00->ne[1];
-    const int64_t ne02 = src00->ne[2];
-    const int64_t ne03 = src00->ne[3];
-
-    //const int64_t nb01 = src00->nb[1];
-    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    //const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
-
-    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    //void * src0_ddq = src0_extra->data_device[g_main_device];
-    //half * src0_as_f16 = (half *) src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
-
-    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    const half alpha_f16 = 1.0f;
-    const half beta_f16  = 0.0f;
-
-    // use cublasGemmBatchedEx
-    const int ne23 = ne12*ne13;
-
-    const void ** ptrs_src = nullptr;
-          void ** ptrs_dst = nullptr;
-
-    size_t ptrs_src_s = 0;
-    size_t ptrs_dst_s = 0;
-
-    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
-
-    int64_t src0_ne = ggml_nelements(src00);
-    half * src0_as_f16 = nullptr;
-    size_t src0_as = 0;
-    if (src00->type != GGML_TYPE_F16) {
-        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
-    }
-
-    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
-    dim3 block_dims(ne13, ne12);
-    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
-            ptrs_src, ptrs_dst,
-            ne12, ne13,
-            ne23,
-            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
-            nb12, nb13,
-            dst->nb[2], dst->nb[3],
-            r2, r3,
-            src00->type, src0_as_f16, src0_ne,
-            src1_as_f16, dst_f16,
-            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
-            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    CUBLAS_CHECK(
-    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-            ne01, ne11, ne10,
-            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
-                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
-            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
-            ne23,
-            CUBLAS_COMPUTE_16F,
-            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-    if (src0_as != 0) {
-        ggml_cuda_pool_free(src0_as_f16, src0_as);
-    }
-    if (ptrs_src_s != 0) {
-        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-    }
-    if (ptrs_dst_s != 0) {
-        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-    }
-
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
-}
-#endif
-
 static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-#if 0
-    ggml_cuda_mul_mat_id_cublas(dst);
-    // TODO: mmq/mmv support
-#endif
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * ids  = dst->src[2];
+
+    GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");

    cudaStream_t stream = ctx.stream();

    const size_t nb11 = src1->nb[1];
    const size_t nb1  =  dst->nb[1];

-    const struct ggml_tensor * ids = src0;
    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
+    const int32_t n_as = src0->ne[2];

    std::vector<char> ids_host(ggml_nbytes(ids));
    const char * ids_dev = (const char *) ids->data;
    CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

+    ggml_tensor src0_row = *src0;
    ggml_tensor src1_row = *src1;
    ggml_tensor dst_row = *dst;

+    char * src0_original = (char *) src0->data;
    char * src1_original = (char *) src1->data;
    char * dst_original  = (char *)  dst->data;

+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[3] = src0->nb[2];
+
    if (src1->ne[1] == 1) {
        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

            GGML_ASSERT(row_id >= 0 && row_id < n_as);

-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
+            src0_row.data = src0_original + row_id*src0->nb[2];
            src1_row.data = src1_original + i01*src1->nb[1];
            dst_row.data  =  dst_original + i01*dst->nb[1];

-            ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
+            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
        }
    } else {
        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
@@ -2192,8 +2012,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
        dst_row.data  =  dst_contiguous.get();

        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
            int64_t num_src1_rows = 0;
            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@@ -2213,6 +2031,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
                continue;
            }

+            src0_row.data = src0_original + row_id*src0->nb[2];
+
            src1_row.ne[1] = num_src1_rows;
            dst_row.ne[1] = num_src1_rows;

@@ -2224,7 +2044,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
            dst_row.nb[2] = num_src1_rows*nb1;
            dst_row.nb[3] = num_src1_rows*nb1;

-            ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
+            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);

            num_src1_rows = 0;
            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -2389,7 +2209,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
-        GGML_ASSERT(false);
+        CUDA_CHECK(err);
    }

    return true;
@@ -2797,6 +2617,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
        return false;
    }

+#if CUDART_VERSION >= 11010 // cudaHostRegisterReadOnly needs CUDA 11.1+; CUDART_VERSION is major*1000 + minor*10, so 11.1 == 11010
    cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
    if (err != cudaSuccess) {
        // clear the error
@@ -2807,6 +2628,9 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
        return false;
    }
    return true;
+#else
+    return false;
+#endif
 }

 GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
@@ -8,32 +8,41 @@ static inline __device__ void ggml_cuda_swap(T & a, T & b) {
 }

 template<ggml_sort_order order>
-static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) { // ncols_pad: ncols rounded up to a power of two by the launcher
    // bitonic sort
    int col = threadIdx.x;
    int row = blockIdx.y;

-    if (col >= ncols) return;
+    if (col >= ncols_pad) {
+        return;
+    }

    const float * x_row = x + row * ncols;
-    int * dst_row = dst + row * ncols;
+    extern __shared__ int dst_row[]; // index scratch in dynamic shared memory; its size comes from the launch's shared-memory argument

    // initialize indices
-    if (col < ncols) {
-        dst_row[col] = col;
-    }
+    dst_row[col] = col; // slots with col >= ncols hold padding indices that have no element in x_row
+
    __syncthreads();

-    for (int k = 2; k <= ncols; k *= 2) {
+    for (int k = 2; k <= ncols_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            if (ixj > col) {
                if ((col & k) == 0) {
-                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+                    if (dst_row[col] >= ncols || // a padding index in the lower slot is always swapped toward the higher slot
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ? // x_row is only read when both indices are real (< ncols)
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                } else {
-                    if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+                    if (dst_row[ixj] >= ncols || // opposite direction: a padding index in the higher slot is always swapped toward the lower slot
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ? // x_row is only read when both indices are real (< ncols)
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                }
@@ -41,18 +50,35 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
            __syncthreads();
        }
    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col]; // only the first ncols sorted indices of this row are written out
+    }
+}
+
+static int next_power_of_2(int x) { // smallest power of two that is >= x; returns 1 when x <= 1
+    int n = 1;
+    while (n < x) {
+        n *= 2; // NOTE(review): overflows int if x > 2^30 - presumably unreachable for real column counts; confirm
+    }
+    return n;
 }

 static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
    // bitonic sort requires ncols to be power of 2
-    GGML_ASSERT((ncols & (ncols - 1)) == 0);
+    const int ncols_pad = next_power_of_2(ncols); // pad up to a power of two instead of rejecting other sizes

-    const dim3 block_dims(ncols, 1, 1);
+    const dim3 block_dims(ncols_pad, 1, 1); // NOTE(review): one thread per padded column; the device's max threads per block is not checked here
    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int); // one int per padded column for the kernel's extern __shared__ index array
+
+    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb); // abort before launch if the request exceeds the device's recorded smpb value
+
    if (order == GGML_SORT_ORDER_ASC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else if (order == GGML_SORT_ORDER_DESC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else {
        GGML_ASSERT(false);
    }
@@ -394,7 +394,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);


 //////////////////////
@@ -4,14 +4,14 @@
 #define CUDA_Q8_0_NE_ALIGN 2048

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = 2*((int64_t)blockDim.x*blockIdx.x + threadIdx.x); // widen before multiplying: the 32-bit unsigned product would wrap before the int64_t store

    if (i >= k) {
        return;
    }

-    const int ib = i/qk; // block index
+    const int64_t ib = i/qk; // block index
    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
+    const int64_t iybs = i - i%qk; // y block start index (64-bit: it is on the order of i, which may exceed INT_MAX)
    const int y_offset = qr == 1 ? 1 : qk/2;
@@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
 }

 template <bool need_check>
-static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
 #if __CUDA_ARCH__ >= CC_PASCAL
    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

@@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
 template<typename dst_t>
 static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
-    const int ib = 8*i + ir;
+    const int64_t ib = 8*i + ir;
    if (ib >= nb32) {
        return;
    }
@@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

    // assume 32 threads
    const int tid = threadIdx.x;
    const int il  = tid/8;
    const int ir  = tid%8;
-    const int ib = 8*i + ir;
+    const int64_t ib = 8*i + ir;
    if (ib >= nb32) {
        return;
    }
@@ -313,14 +313,14 @@ template<typename dst_t>
 static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const block_q6_K * x = (const block_q6_K *) vx;

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
 #if QK_K == 256

    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int ip  = tid/32;   // ip is 0 or 1
-    const int il  = tid - 32*ip; // 0...32
-    const int is  = 8*ip + il/16;
+    const int64_t tid = threadIdx.x;
+    const int64_t ip  = tid/32;   // ip is 0 or 1
+    const int64_t il  = tid - 32*ip; // 0...32
+    const int64_t is  = 8*ip + il/16;

    dst_t * y = yy + i*QK_K + 128*ip + il;

@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 #else

    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
+    const int64_t tid = threadIdx.x;
+    const int64_t ip  = tid/16;         // 0 or 1
+    const int64_t il  = tid - 16*ip;    // 0...15

    dst_t * y = yy + i*QK_K + 16*ip + il;

@@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
 #endif

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
        const bool need_check = false;
@@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
 }

 template<typename dst_t>
-static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
 #if QK_K == 256
    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
 #if QK_K == 256
    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }

 template<typename dst_t>
-static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb32 = k / 32;
    const int nb = (k + 255) / 256;
    dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }

 template<typename dst_t>
-static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
 #if QK_K == 256
    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
 #if QK_K == 256
    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
 #if QK_K == 64
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
@@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
 }

 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
@@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
 }

 template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
@@ -3,7 +3,7 @@
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);

 typedef to_t_cuda_t<float> to_fp32_cuda_t;
 typedef to_t_cuda_t<half> to_fp16_cuda_t;
@@ -1,6 +1,6 @@
 #include "common.cuh"

-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q4_0 * x = (const block_q4_0 *) vx;

    const dfloat d = x[ib].d;
@@ -19,7 +19,7 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
@@ -39,7 +39,7 @@ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q5_0 * x = (const block_q5_0 *) vx;

    const dfloat d = x[ib].d;
@@ -62,7 +62,7 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;

    const dfloat d = __low2half(x[ib].dm);
@@ -86,7 +86,7 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const block_q8_0 * x = (const block_q8_0 *) vx;

    const dfloat d = x[ib].d;
@@ -565,7 +565,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
    }
 }

-static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const half * x = (const half *) vx;

    // automatic half -> float type cast if dfloat == float
@@ -577,7 +577,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
    // qk = quantized weights per x block
    // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
@@ -598,7 +598,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons

    for (int i = 0; i < ncols; i += iter_stride) {
        const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
+        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
        const int iqs = (col%qk)/qr; // x quant index
        const int iybs = col - col%qk; // y block start index

@@ -1,20 +1,20 @@
 #include "quantize.cuh"

-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
-    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
+    const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

    if (ix >= kx_padded) {
        return;
    }

-    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+    const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;

-    const int i_padded = iy*kx_padded + ix;
+    const int64_t i_padded = (int64_t)iy*kx_padded + ix;

    block_q8_1 * y = (block_q8_1 *) vy;

-    const int ib = i_padded / QK8_1; // block index
-    const int iqs = i_padded % QK8_1; // quant index
+    const int64_t ib = i_padded / QK8_1; // block index
+    const int64_t iqs = i_padded % QK8_1; // quant index

    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
    float amax = fabsf(xi);
@@ -36,8 +36,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

-void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
-    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
+    const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ky, 1);
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
@@ -2,4 +2,4 @@

 #define CUDA_QUANTIZE_BLOCK_SIZE 256

-void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream);
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
@@ -1685,37 +1685,31 @@ static enum ggml_status ggml_metal_graph_compute(
                    {
                        //GGML_ASSERT(ne00 == ne10);
                        //GGML_ASSERT(ne03 == ne13);
-
-                        GGML_ASSERT(src0t == GGML_TYPE_I32);
-
-                        const int n_as = ((int32_t *) dst->op_params)[1];
-
-                        // TODO: make this more general
-                        GGML_ASSERT(n_as <= 8);
+                        const int n_as = src0->ne[2];

                        // max size of the src1ids array in the kernel shared buffer
                        GGML_ASSERT(ne11 <= 4096);

-                        const int64_t  ne20 = src2 ? src2->ne[0] : 0;
-                        const int64_t  ne21 = src2 ? src2->ne[1] : 0;
-                        const int64_t  ne22 = src2 ? src2->ne[2] : 0;
-                        const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
+                        // src2 = ids
+                        const int64_t  ne20 = src2->ne[0]; GGML_UNUSED(ne20);
+                        const int64_t  ne21 = src2->ne[1];
+                        const int64_t  ne22 = src2->ne[2]; GGML_UNUSED(ne22);
+                        const int64_t  ne23 = src2->ne[3]; GGML_UNUSED(ne23);

-                        const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
-                        const uint64_t nb21 = src2 ? src2->nb[1] : 0;
-                        const uint64_t nb22 = src2 ? src2->nb[2] : 0;
-                        const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
+                        const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20);
+                        const uint64_t nb21 = src2->nb[1];
+                        const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22);
+                        const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23);

-                        const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
+                        const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);

-                        GGML_ASSERT(!ggml_is_transposed(src2));
+                        GGML_ASSERT(src2t == GGML_TYPE_I32);
+
+                        GGML_ASSERT(!ggml_is_transposed(src0));
                        GGML_ASSERT(!ggml_is_transposed(src1));

                        GGML_ASSERT(src1t == GGML_TYPE_F32);

-                        const uint r2 = ne12/ne22;
-                        const uint r3 = ne13/ne23;
-
                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                        // to the matrix-vector kernel
                        int ne11_mm_min = n_as;
@@ -1723,7 +1717,10 @@ static enum ggml_status ggml_metal_graph_compute(
                        const int idx = ((int32_t *) dst->op_params)[0];

                        // batch size
-                        GGML_ASSERT(ne01 == ne11);
+                        GGML_ASSERT(ne21 == ne11); // ?
+                        GGML_ASSERT(ne12 == 1 && ne13 == 1); // no broadcasting
+                        const uint r2 = 1;
+                        const uint r3 = 1;

                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@@ -1732,7 +1729,7 @@ static enum ggml_status ggml_metal_graph_compute(
                        //       indirect matrix multiplication
                        // !!!
                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                            ne20 % 32 == 0 && ne20 >= 64 &&
+                            ne00 % 32 == 0 && ne00 >= 64 &&
                            ne11 > ne11_mm_min) {

                            // some Metal matrix data types require aligned pointers
@@ -1745,7 +1742,7 @@ static enum ggml_status ggml_metal_graph_compute(

                            id<MTLComputePipelineState> pipeline = nil;

-                            switch (src2->type) {
+                            switch (src0->type) {
                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
@@ -1774,36 +1771,27 @@ static enum ggml_status ggml_metal_graph_compute(
                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:3];
-                            [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
-                            [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:5];
-                            [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
-                            [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:7];
-                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
-                            [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:9];
-                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:10];
-                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
-                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
-                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
-                            [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
-                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
-                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
-                            [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
-                            // TODO: how to make this an array? read Metal docs
-                            for (int j = 0; j < 8; ++j) {
-                                // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
-
-                                size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
-
-                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
-                            }
+                            [encoder setBuffer:id_src2 offset:offs_src2    atIndex:3];
+                            [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:4];
+                            [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:5];
+                            [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:6];
+                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:7];
+                            [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:8];
+                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:9];
+                            [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:10];
+                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:11];
+                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:12];
+                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:13];
+                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:14];
+                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:15];
+                            [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:16];
+                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:17];
+                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:18];
+                            [encoder setBytes:&idx     length:sizeof(idx)  atIndex:19];

                            [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];

-                            [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                        } else {
                            int nth0 = 32;
                            int nth1 = 1;
@@ -1813,7 +1801,7 @@ static enum ggml_status ggml_metal_graph_compute(
                            id<MTLComputePipelineState> pipeline = nil;

                            // use custom matrix x vector kernel
-                            switch (src2t) {
+                            switch (src0t) {
                                case GGML_TYPE_F32:
                                    {
                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
@@ -1947,8 +1935,8 @@ static enum ggml_status ggml_metal_graph_compute(
                                    }
                            };

-                            if (ggml_is_quantized(src2t)) {
-                                GGML_ASSERT(ne20 >= nth0*nth1);
+                            if (ggml_is_quantized(src0t)) {
+                                GGML_ASSERT(ne00 >= nth0*nth1);
                            }

                            const int64_t _ne1 = 1; // kernels needs a reference in constant memory
@@ -1957,75 +1945,66 @@ static enum ggml_status ggml_metal_graph_compute(
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
-                            [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
-                            [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
-                            [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
-                            [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
-                            [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
-                            [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
-                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:19];
-                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:20];
-                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
-                            [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
-                            // TODO: how to make this an array? read Metal docs
-                            for (int j = 0; j < 8; ++j) {
-                                // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
+                            [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
+                            [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:4];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:5];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:6];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:7];
+                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:8];
+                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9];
+                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10];
+                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:12];
+                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
+                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:19];
+                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:20];
+                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:21];
+                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:22];
+                            [encoder setBytes:&idx  length:sizeof(idx)  atIndex:23];

-                                size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
-
-                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
+                            if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
+                                src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
+                                src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
-
-                            if (src2t == GGML_TYPE_Q4_0  || src2t == GGML_TYPE_Q4_1  || src2t == GGML_TYPE_Q5_0 ||
-                                src2t == GGML_TYPE_Q5_1  || src2t == GGML_TYPE_Q8_0  || src2t == GGML_TYPE_Q2_K ||
-                                src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ1_M || src2t == GGML_TYPE_IQ2_S) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                            }
-                            else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
-                                const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                            else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
+                                const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
-                            else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) {
-                                const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
+                            else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
+                                const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
-                            else if (src2t == GGML_TYPE_IQ4_NL || src2t == GGML_TYPE_IQ4_XS) {
+                            else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
                                const int mem_size = 32*sizeof(float);
                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
-                            else if (src2t == GGML_TYPE_Q4_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            else if (src0t == GGML_TYPE_Q4_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
-                            else if (src2t == GGML_TYPE_Q3_K) {
+                            else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #else
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #endif
                            }
-                            else if (src2t == GGML_TYPE_Q5_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            else if (src0t == GGML_TYPE_Q5_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
-                            else if (src2t == GGML_TYPE_Q6_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            else if (src0t == GGML_TYPE_Q6_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            } else {
                                const int64_t ny = (_ne1 + nrows - 1)/nrows;
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                        }
                    } break;
@@ -2432,6 +2411,16 @@ static enum ggml_status ggml_metal_graph_compute(

                        enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

+                        // bitonic sort requires the number of elements to be a power of 2
+                        int64_t ne00_padded = 1;
+                        while (ne00_padded < ne00) {
+                            ne00_padded *= 2;
+                        }
+
+                        // Metal kernels require the buffer size to be a multiple of 16 bytes
+                        // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
+                        const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16);
+
                        id<MTLComputePipelineState> pipeline = nil;

                        switch (order) {
@@ -2441,11 +2430,13 @@ static enum ggml_status ggml_metal_graph_compute(
                        };

                        [encoder setComputePipelineState:pipeline];
-                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                        [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst      offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3];
+                        [encoder setThreadgroupMemoryLength:mem_size atIndex:0];

-                        [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
+                        [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)];
                    } break;
                case GGML_OP_LEAKY_RELU:
                    {
@@ -12,70 +12,70 @@ extern "C" {
 #endif

 // Quantization
-void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
-void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
-void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
-void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
-void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
-void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
+void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);

-void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
-void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
-void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
-void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
-void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
-void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
+void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);

-void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
-void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int k);
-void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int k);
-void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int k);
-void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int k);
+void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);

-void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

-void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

-void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

-void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

-void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq1_m  (const block_iq1_m   * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq1_m  (const block_iq1_m   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -101,26 +101,26 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

-size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
-size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

 void iq2xs_init_impl(enum ggml_type type);
 void iq2xs_free_impl(enum ggml_type type);
@@ -338,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
    return GGML_FP32_TO_FP16(x);
 }

-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
-    for (int i = 0; i < n; i++) {
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
+    for (int64_t i = 0; i < n; i++) {
        y[i] = GGML_FP16_TO_FP32(x[i]);
    }
 }

-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
-    int i = 0;
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
 #if defined(__F16C__)
    for (; i + 7 < n; i += 8) {
        __m256 x_vec = _mm256_loadu_ps(x + i);
@@ -4573,45 +4573,38 @@ void ggml_mul_mat_set_prec(

 // ggml_mul_mat_id

+// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
+//       this will allow computing all the used experts in a single matrix multiplication
 struct ggml_tensor * ggml_mul_mat_id(
        struct ggml_context * ctx,
-        struct ggml_tensor  * const as[],
-        int                   n_as,
+        struct ggml_tensor  * as,
        struct ggml_tensor  * ids,
        int                   id,
        struct ggml_tensor  * b) {

    GGML_ASSERT(ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
-    GGML_ASSERT(ids->ne[1] == b->ne[1]);
+    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
+    GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat

    bool is_node = false;

-    if (as[0]->grad || b->grad) {
+    if (as->grad || b->grad) {
        is_node = true;
    }

-    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

    ggml_set_op_params_i32(result, 0, id);
-    ggml_set_op_params_i32(result, 1, n_as);

    result->op   = GGML_OP_MUL_MAT_ID;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = ids;
+    result->src[0] = as;
    result->src[1] = b;
-
-    for (int i = 0; i < n_as; i++) {
-        struct ggml_tensor * a = as[i];
-        GGML_ASSERT(ggml_are_same_shape(as[0], a));
-        GGML_ASSERT(ggml_can_mul_mat(a, b));
-        GGML_ASSERT(!ggml_is_transposed(a));
-        result->src[i + 2] = a;
-    }
+    result->src[2] = ids;

    return result;
 }
@@ -10948,10 +10941,9 @@ static void ggml_compute_forward_mul_mat_id(
        const struct ggml_compute_params * params,
              struct ggml_tensor * dst) {

-    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];
-
-    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
+    const struct ggml_tensor * ids = dst->src[2];

    GGML_TENSOR_BINARY_OP_LOCALS

@@ -10981,13 +10973,13 @@ static void ggml_compute_forward_mul_mat_id(
    GGML_ASSERT(nb1 <= nb2);
    GGML_ASSERT(nb2 <= nb3);

-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
+    // broadcast is not supported with mmid
+    assert(ne12 == 1);
+    assert(ne13 == 1);

    // row groups
    const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = ggml_get_op_params_i32(dst, 1);
+    const int n_as = src0->ne[2];

    char * wdata_src1_end = (src1->type == vec_dot_type) ?
            (char *) params->wdata :
@@ -11047,7 +11039,7 @@ static void ggml_compute_forward_mul_mat_id(
            continue;
        }

-        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
+        size_t src0_offset = cur_a*src0->nb[2];

        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11082,9 +11074,6 @@ static void ggml_compute_forward_mul_mat_id(
            continue;
        }

-        assert(ne12 % ne02 == 0);
-        assert(ne13 % ne03 == 0);
-
        // block-tiling attempt
        const int64_t blck_0 = 16;
        const int64_t blck_1 = 16;
@@ -11101,14 +11090,14 @@ static void ggml_compute_forward_mul_mat_id(
                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);

                    // broadcast src0 into src1
-                    const int64_t i03 = i13/r3;
-                    const int64_t i02 = i12/r2;
+                    //const int64_t i03 = i13/r3;
+                    //const int64_t i02 = i12/r2;

                    const int64_t i1 = i11;
                    const int64_t i2 = i12;
                    const int64_t i3 = i13;

-                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
+                    const char * src0_row = (const char *) src0->data + src0_offset;

                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -18464,13 +18453,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
            case GGML_OP_MUL_MAT_ID:
                {
                    cur = 0;
-                    const struct ggml_tensor * src0 = node->src[2];
+                    const struct ggml_tensor * src0 = node->src[0];
                    const struct ggml_tensor * src1 = node->src[1];
                    const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                    if (src1->type != vec_dot_type) {
                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
                    }
-                    const int n_as = ggml_get_op_params_i32(node, 1);
+                    const int n_as = src0->ne[2];
                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
@@ -20342,11 +20331,11 @@ size_t ggml_quantize_chunk(
        enum ggml_type   type,
           const float * src,
                  void * dst,
-                   int   start,
-                   int   nrows,
-                   int   n_per_row,
+               int64_t   start,
+               int64_t   nrows,
+               int64_t   n_per_row,
           const float * imatrix) {
-    const int n = nrows * n_per_row;
+    const int64_t n = (int64_t) nrows * n_per_row;

    if (ggml_quantize_requires_imatrix(type)) {
        GGML_ASSERT(imatrix != NULL);
@@ -332,8 +332,8 @@ extern "C" {
    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);

    struct ggml_object;
    struct ggml_context;
@@ -1164,8 +1164,7 @@ extern "C" {
    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
            struct ggml_context * ctx,
-            struct ggml_tensor  * const as[],
-            int                   n_as,
+            struct ggml_tensor  * as,
            struct ggml_tensor  * ids,
            int                   id,
            struct ggml_tensor  * b);
@@ -2211,9 +2210,9 @@ extern "C" {
            enum ggml_type   type,
               const float * src,
                      void * dst,
-                       int   start,
-                       int   nrows,
-                       int   n_per_row,
+                   int64_t   start,
+                   int64_t   nrows,
+                   int64_t   n_per_row,
               const float * imatrix);

    //
@@ -2378,8 +2377,8 @@ extern "C" {
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                      const void * GGML_RESTRICT y, size_t by, int nrc);

@@ -24,6 +24,7 @@ class Keys:
        ALIGNMENT            = "general.alignment"
        NAME                 = "general.name"
        AUTHOR               = "general.author"
+        VERSION              = "general.version"
        URL                  = "general.url"
        DESCRIPTION          = "general.description"
        LICENSE              = "general.license"
@@ -221,9 +222,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
    MODEL_TENSOR.FFN_ACT:         "blk.{bid}.ffn",
-    MODEL_TENSOR.FFN_GATE_EXP:    "blk.{bid}.ffn_gate.{xid}",
-    MODEL_TENSOR.FFN_DOWN_EXP:    "blk.{bid}.ffn_down.{xid}",
-    MODEL_TENSOR.FFN_UP_EXP:      "blk.{bid}.ffn_up.{xid}",
+    MODEL_TENSOR.FFN_GATE_EXP:    "blk.{bid}.ffn_gate_exps",
+    MODEL_TENSOR.FFN_DOWN_EXP:    "blk.{bid}.ffn_down_exps",
+    MODEL_TENSOR.FFN_UP_EXP:      "blk.{bid}.ffn_up_exps",
    MODEL_TENSOR.LAYER_OUT_NORM:  "blk.{bid}.layer_output_norm",
    MODEL_TENSOR.SSM_IN:          "blk.{bid}.ssm_in",
    MODEL_TENSOR.SSM_CONV1D:      "blk.{bid}.ssm_conv1d",
@@ -367,6 +368,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_ACT,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.POS_EMBD,
    ],
    MODEL_ARCH.GPTJ: [
        MODEL_TENSOR.TOKEN_EMBD,
@@ -635,6 +639,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
    ],
    # TODO
 }
@@ -296,6 +296,9 @@ class GGUFWriter:
    def add_author(self, author: str) -> None:
        self.add_string(Keys.General.AUTHOR, author)

+    def add_version(self, version: str) -> None:
+        self.add_string(Keys.General.VERSION, version)
+
    def add_tensor_data_layout(self, layout: str) -> None:
        self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

@@ -305,6 +308,9 @@ class GGUFWriter:
    def add_description(self, description: str) -> None:
        self.add_string(Keys.General.DESCRIPTION, description)

+    def add_licence(self, licence: str) -> None:
+        self.add_string(Keys.General.LICENSE, licence)
+
    def add_source_url(self, url: str) -> None:
        self.add_string(Keys.General.SOURCE_URL, url)

@@ -231,9 +231,8 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.FFN_UP_EXP: (
-            "layers.{bid}.feed_forward.experts.{xid}.w3",           # mixtral
-            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear_v",   # Grok
+            "layers.{bid}.feed_forward.experts.w3",                 # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_v",         # Grok (merged)
        ),

        # AWQ-activation gate
@@ -252,9 +251,8 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.{xid}.w1",           # mixtral
-            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear"      # Grok
+            "layers.{bid}.feed_forward.experts.w1",                 # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear"            # Grok (merged)
        ),

        # Feed-forward down
@@ -280,20 +278,22 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.{xid}.w2",           # mixtral
-            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear_1",   # Grok
-
+            "layers.{bid}.feed_forward.experts.w2",                 # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_1",         # Grok (merged)
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
            "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
+            "model.layers.{bid}.self_attn.q_norm",                            # cohere
+            "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
        ),

        MODEL_TENSOR.ATTN_K_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
            "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
+            "model.layers.{bid}.self_attn.k_norm",                            # cohere
+            "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
        ),

        MODEL_TENSOR.ROPE_FREQS: (
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.8.0"
+version = "0.9.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -37,10 +37,14 @@

 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 5

+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 1
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -523,6 +527,7 @@ extern "C" {
            struct llama_context * ctx);

    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
    // seq_id < 0 : match any sequence
    // p0 < 0     : [0,  p1]
    // p1 < 0     : [p0, inf)
@@ -594,34 +599,92 @@ extern "C" {

    // Returns the maximum size in bytes of the state (rng, logits, embedding
    // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+        "use llama_state_get_size instead");

    // Copies the state to the specified destination address.
    // Destination needs to have allocated enough memory.
    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
+    LLAMA_API size_t llama_state_get_data(
            struct llama_context * ctx,
                         uint8_t * dst);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+                         uint8_t * dst),
+        "use llama_state_get_data instead");

    // Set the state reading from the specified address
    // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
+    LLAMA_API size_t llama_state_set_data(
            struct llama_context * ctx,
                   const uint8_t * src);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+                   const uint8_t * src),
+        "use llama_state_set_data instead");

    // Save/load session file
-    LLAMA_API bool llama_load_session_file(
+    LLAMA_API bool llama_state_load_file(
            struct llama_context * ctx,
                      const char * path_session,
                     llama_token * tokens_out,
                          size_t   n_token_capacity,
                          size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out),
+        "use llama_state_load_file instead");

-    LLAMA_API bool llama_save_session_file(
+    LLAMA_API bool llama_state_save_file(
            struct llama_context * ctx,
                      const char * path_session,
               const llama_token * tokens,
                          size_t   n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+               const llama_token * tokens,
+                          size_t   n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                    llama_seq_id   seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                    llama_seq_id   dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   seq_id,
+               const llama_token * tokens,
+                          size_t   n_token_count);
+
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   dest_seq_id,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);

    //
    // Decoding
@@ -684,8 +747,9 @@ extern "C" {
    // Cols: n_vocab
    LLAMA_API float * llama_get_logits(struct llama_context * ctx);

-    // Logits for the ith token. Equivalent to:
+    // Logits for the ith token. For positive indices, equivalent to:
    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indices can be used to access logits in reverse order, -1 is the last logit.
    // returns NULL for invalid ids.
    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);

@@ -697,8 +761,9 @@ extern "C" {
    // Otherwise, returns NULL.
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

-    // Get the embeddings for the ith token. Equivalent to:
+    // Get the embeddings for the ith token. For positive indices, equivalent to:
    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
    // shape: [n_embd] (1-dimensional)
    // returns NULL for invalid ids.
    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@@ -721,6 +786,8 @@ extern "C" {
    // Special tokens
    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
+    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

    // Returns -1 if unknown, 1 for true or 0 for false.
@@ -743,16 +810,16 @@ extern "C" {
    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
    /// @return Returns the number of tokens on success, no more than n_tokens_max
    /// @return Returns a negative number on failure - the number of tokens that would have been returned
-    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
-    ///                Does not insert a leading space.
+    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
+    ///                      as plaintext. Does not insert a leading space.
    LLAMA_API int32_t llama_tokenize(
        const struct llama_model * model,
                      const char * text,
                         int32_t   text_len,
                     llama_token * tokens,
                         int32_t   n_tokens_max,
-                            bool   add_bos,
-                            bool   special);
+                            bool   add_special,
+                            bool   parse_special);

    // Token Id -> Piece.
    // Uses the vocabulary in the provided context.
@@ -1007,10 +1074,38 @@ extern "C" {

 struct ggml_tensor;

+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar {
+    const std::vector<std::vector<llama_grammar_element>>   rules;
+    std::vector<std::vector<const llama_grammar_element *>> stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8                                      partial_utf8;
+};
+
+struct llama_grammar_candidate {
+    size_t               index;
+    const uint32_t     * code_points;
+    llama_partial_utf8   partial_utf8;
+};
+
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
 );

+std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
+        const std::vector<std::vector<llama_grammar_element>>         & rules,
+        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
+        const uint32_t                                                  chr);
+
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8   partial_start);
+
 #endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H
@@ -178,6 +178,9 @@ def get_commit_hexsha8(name):
    for t in repo.tags:
        if t.name == name:
            return t.commit.hexsha[:8]
+    for c in repo.iter_commits("--all"):
+        if c.hexsha[:8] == name[:8]:
+            return c.hexsha[:8]
    return None


@@ -224,7 +227,7 @@ if known_args.compare is not None:
        hexsha8_compare = get_commit_hexsha8(known_args.compare)
        name_compare = known_args.compare
    if hexsha8_compare is None:
-        print(f"ERROR: cannot find data for baseline={known_args.compare}.")
+        print(f"ERROR: cannot find data for compare={known_args.compare}.")
        sys.exit(1)
 # Otherwise, search for the commit for llama-bench was most recently run
 # and that is not a parent of master:
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+printf "# date: $(date)\n" > AUTHORS
+printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS
+
+git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS
+
+# if necessary, update your name here. for example: jdoe -> John Doe
+sed -i '' 's/^jdoe/John Doe/g' AUTHORS
@@ -60,11 +60,14 @@ while read c; do
        src/ggml*.m \
        src/ggml*.metal \
        src/ggml*.cu \
+        src/ggml-cuda/* \
        tests/test-opt.cpp \
        tests/test-grad0.cpp \
        tests/test-quantize-fns.cpp \
        tests/test-quantize-perf.cpp \
        tests/test-backend-ops.cpp \
+        LICENSE \
+        scripts/gen-authors.sh \
        >> $SRC_LLAMA/ggml-src.patch
 done < $SRC_LLAMA/ggml-commits

@@ -122,6 +125,9 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    # tests/test-quantize-fns.cpp  -> tests/test-quantize-fns.cpp
    # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
    # tests/test-backend-ops.cpp   -> tests/test-backend-ops.cpp
+    #
+    # LICENSE                      -> LICENSE
+    # scripts/gen-authors.sh       -> scripts/gen-authors.sh

    cat ggml-src.patch | sed \
        -e 's/src\/ggml\.c/ggml.c/g' \
@@ -155,6 +161,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
        -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
        -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \
        -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \
+        -e 's/LICENSE/LICENSE/g' \
+        -e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
        > ggml-src.patch.tmp
    mv ggml-src.patch.tmp ggml-src.patch

@@ -1 +1 @@
-43a6d4af1971ee2912ff7bc2404011ff327b6a60
+98875cdb7e9ceeb726d1c196d2fecb3cbb59b93a
@@ -31,3 +31,6 @@ cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
 cp -rpv ../ggml/tests/test-opt.cpp         ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp       ./tests/test-grad0.cpp
 cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp
+
+cp -rpv ../LICENSE                         ./LICENSE
+cp -rpv ../ggml/scripts/gen-authors.sh     ./scripts/gen-authors.sh
@@ -59,6 +59,7 @@ llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-gpt2             AR

 llama_test(test-grammar-parser.cpp)
 llama_test(test-llama-grammar.cpp)
+llama_test(test-grammar-integration.cpp)
 llama_test(test-grad0.cpp)
 # llama_test(test-opt.cpp) # SLOW
 llama_test(test-backend-ops.cpp)
@@ -979,17 +979,13 @@ struct test_mul_mat_id : public test_case {

    ggml_tensor * build_graph(ggml_context * ctx) override {
        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
-        std::vector<ggml_tensor *> mats;
-        for (int i = 0; i < n_mats; i++) {
-            ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
-            mats.push_back(a);
-        }
+        ggml_tensor * mats = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
        ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
        if (v) {
            ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
        }
        ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
-        ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
+        ggml_tensor * out = ggml_mul_mat_id(ctx, mats, ids, v ? id/2 : id, b);
        return out;
    }

@@ -1477,91 +1473,6 @@ struct test_leaky_relu : public test_case {
    }
 };

-// Mixtral MOE
-struct test_moe : public test_case {
-    const int n_experts;
-    const int n_experts_per_tok;
-    const int n_tokens;
-    const int n_embd;
-    const int n_ff;
-
-    std::string op_desc(ggml_tensor * t) override {
-        return "MOE";
-
-        GGML_UNUSED(t);
-    }
-
-    std::string vars() override {
-        return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
-    }
-
-    test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
-        : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
-    }
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
-
-        std::vector<ggml_tensor *> ffn_up_exp(n_experts);
-        std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
-        std::vector<ggml_tensor *> ffn_down_exp(n_experts);
-
-        for (int i = 0; i < n_experts; ++i) {
-            ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-            ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-            ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
-        }
-
-        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
-
-        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);
-
-        // select experts
-        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
-
-        ggml_tensor * weights = ggml_get_rows(ctx,
-                ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
-
-        weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
-
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);
-
-        weights = ggml_div(ctx, weights, weights_sum);
-
-        // compute expert outputs
-        ggml_tensor * moe_out = nullptr;
-
-        for (int i = 0; i < n_experts_per_tok; ++i) {
-            ggml_tensor * cur_expert;
-
-            ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);
-
-            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);
-
-            cur_gate = ggml_silu(ctx, cur_gate);
-
-            cur_expert = ggml_mul(ctx, cur_up, cur_gate);
-
-            cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);
-
-            cur_expert = ggml_mul(ctx, cur_expert,
-                    ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-
-            if (i == 0) {
-                moe_out = cur_expert;
-            } else {
-                moe_out = ggml_add(ctx, moe_out, cur_expert);
-            }
-        }
-
-        cur = moe_out;
-
-        return cur;
-    }
-};
-
-
 enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
@@ -2169,6 +2080,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
+        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
    }

    test_cases.emplace_back(new test_sum_rows());
@@ -2182,11 +2094,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

    // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
-#if !defined(__SANITIZE_THREAD__)
-    // FIXME: these tests use too much memory with thread sanitizer
-    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
-    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
-#endif
    test_cases.emplace_back(new test_llama(1));
    test_cases.emplace_back(new test_llama(2));
    test_cases.emplace_back(new test_falcon(1));
@@ -33,6 +33,18 @@ int main(void) {
        "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
        // OrionStarAI/Orion-14B-Chat
        "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
+        // openchat/openchat-3.5-0106
+        // The included chat_template differs from the author's suggestions here: https://huggingface.co/openchat/openchat_3.5/discussions/5#65448109b4a3f3a2f486fd9d
+        // So we match against the included template but implement the suggested version.
+        "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
+        // deepseek-ai/deepseek-coder-33b-instruct
+        "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
+        // eachadea/vicuna-13b-1.1
+        // No template included in tokenizer_config.json, so this template likely needs to be manually set.
+        "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '' + message['content'] + '\n\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
+        // Orca-Vicuna
+        // No template included in tokenizer_config.json, so this template likely needs to be manually set.
+        "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
    };
    std::vector<std::string> expected_output = {
        // teknium/OpenHermes-2.5-Mistral-7B
@@ -49,6 +61,14 @@ int main(void) {
        "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
        // OrionStarAI/Orion-14B-Chat
        "Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s>   I am an assistant   </s>Human: Another question\n\nAssistant: </s>",
+        // openchat/openchat-3.5-0106
+        "You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant:    I am an assistant   <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:",
+        // deepseek-ai/deepseek-coder-33b-instruct
+        "You are a helpful assistant### Instruction:\nHello\n### Response:\nHi there\n<|EOT|>\n### Instruction:\nWho are you\n### Response:\n   I am an assistant   \n<|EOT|>\n### Instruction:\nAnother question\n### Response:\n",
+        // eachadea/vicuna-13b-1.1
+        "You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT:    I am an assistant   </s>\nUSER: Another question\nASSISTANT:",
+        // Orca-Vicuna
+        "SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT:    I am an assistant   </s>\nUSER: Another question\nASSISTANT:",
    };
    std::vector<char> formatted_chat(1024);
    int32_t res;
@@ -0,0 +1,243 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#define LLAMA_API_INTERNAL
+
+#include "ggml.h"
+#include "llama.h"
+#include "grammar-parser.h"
+#include "unicode.h"
+#include <cassert>
+#include <string>
+
+// Parses a small arithmetic grammar, feeds the string "123+456" through it one
+// code point at a time, and asserts that every code point is accepted and that
+// at least one grammar stack is empty (i.e. fully consumed) at the end.
+static void test_simple_grammar() {
+    // Test case for a simple grammar
+    const std::string grammar_str = R"""(root ::= expr
+expr ::= term ("+" term)*
+term ::= number
+number ::= [0-9]+)""";
+
+    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // Ensure we parsed correctly
+    assert(!parsed_grammar.rules.empty());
+
+    // Ensure we have a root node
+    assert(parsed_grammar.symbol_ids.find("root") != parsed_grammar.symbol_ids.end());
+
+    std::vector<const llama_grammar_element*> grammar_rules(parsed_grammar.c_rules());
+    llama_grammar* grammar = llama_grammar_init(
+        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    std::string input = "123+456";
+
+    auto decoded = decode_utf8(input, {});
+
+    const auto & code_points = decoded.first;
+
+    // The final element of code_points is deliberately not fed to the grammar
+    // (presumably a terminator appended by decode_utf8 - TODO confirm).
+    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+        // No snapshot of the previous stacks is taken: it was never read and
+        // cost a full copy of the stack set for every code point.
+        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        assert(!grammar->stacks.empty());
+    }
+
+    // The input matches the grammar only if some stack has been fully consumed
+    bool completed_grammar = false;
+
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            completed_grammar = true;
+            break;
+        }
+    }
+
+    assert(completed_grammar);
+
+    // Clean up allocated memory
+    llama_grammar_free(grammar);
+}
+
+// Builds an arithmetic-expression grammar (with variables, parentheses and
+// function calls) and checks it against two lists of strings:
+//   - test_strings_pass: every code point must be accepted and the grammar
+//     must end in a completed state;
+//   - test_strings_fail: the string must either be rejected part-way through
+//     or leave the grammar in a non-completed state.
+// The grammar stacks are restored to their initial value after every string.
+static void test_complex_grammar() {
+    // Test case for a more complex grammar, with both failure strings and success strings
+    const std::string grammar_str = R"""(root ::= expression
+expression ::= term ws (("+"|"-") ws term)*
+term ::= factor ws (("*"|"/") ws factor)*
+factor ::= number | variable | "(" expression ")" | function-call
+number ::= [0-9]+
+variable ::= [a-zA-Z_][a-zA-Z0-9_]*
+function-call ::= variable ws "(" (expression ("," ws expression)*)? ")"
+ws ::= [ \t\n\r]?)""";
+
+    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // Ensure we parsed correctly
+    assert(!parsed_grammar.rules.empty());
+
+    // Ensure we have a root node
+    assert(parsed_grammar.symbol_ids.find("root") != parsed_grammar.symbol_ids.end());
+
+    std::vector<const llama_grammar_element*> grammar_rules(parsed_grammar.c_rules());
+    llama_grammar* grammar = llama_grammar_init(
+        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    // Save the original grammar stacks so that we can reset after every new string we want to test
+    auto original_stacks = grammar->stacks;
+
+    // Test a few strings
+    std::vector<std::string> test_strings_pass = {
+        "42",
+        "1*2*3*4*5",
+        "x",
+        "x+10",
+        "x1+y2",
+        "(a+b)*(c-d)",
+        "func()",
+        "func(x,y+2)",
+        "a*(b+c)-d/e",
+        "f(g(x),h(y,z))",
+        "x + 10",
+        "x1 + y2",
+        "(a + b) * (c - d)",
+        "func()",
+        "func(x, y + 2)",
+        "a * (b + c) - d / e",
+        "f(g(x), h(y, z))",
+        "123+456",
+        "123*456*789-123/456+789*123",
+        "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456"
+    };
+
+    std::vector<std::string> test_strings_fail = {
+        "+",
+        "/ 3x",
+        "x + + y",
+        "a * / b",
+        "func(,)",
+        "func(x y)",
+        "(a + b",
+        "x + y)",
+        "a + b * (c - d",
+        "42 +",
+        "x +",
+        "x + 10 +",
+        "(a + b) * (c - d",
+        "func(",
+        "func(x, y + 2",
+        "a * (b + c) - d /",
+        "f(g(x), h(y, z)",
+        "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/",
+    };
+
+    // Passing strings
+    for (const auto & test_string : test_strings_pass) {
+        auto decoded = decode_utf8(test_string, {});
+
+        const auto & code_points = decoded.first;
+
+        // The final element of code_points is deliberately not fed to the grammar
+        // (presumably a terminator appended by decode_utf8 - TODO confirm).
+        int pos = 0;
+        for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+            ++pos;
+            grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+
+            // Expect that each code point will not cause the grammar to fail.
+            // The whole diagnostic goes to stderr so its three lines stay together
+            // and in order right before the assert below aborts.
+            if (grammar->stacks.empty()) {
+                fprintf(stderr, "Error at position %d\n", pos);
+                fprintf(stderr, "Unexpected character '%s'\n", unicode_cpt_to_utf8(*it).c_str());
+                fprintf(stderr, "Input string is %s:\n", test_string.c_str());
+            }
+            assert(!grammar->stacks.empty());
+        }
+
+        // The string matches only if some stack has been fully consumed
+        bool completed_grammar = false;
+
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                completed_grammar = true;
+                break;
+            }
+        }
+
+        assert(completed_grammar);
+
+        // Reset the grammar stacks
+        grammar->stacks = original_stacks;
+    }
+
+    // Failing strings
+    for (const auto & test_string : test_strings_fail) {
+        auto decoded = decode_utf8(test_string, {});
+
+        const auto & code_points = decoded.first;
+        bool parse_failed = false;
+
+        for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+            grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+            // An empty stack set means this code point was rejected; stop feeding input
+            if (grammar->stacks.empty()) {
+                parse_failed = true;
+                break;
+            }
+        }
+
+        bool completed_grammar = false;
+
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                completed_grammar = true;
+                break;
+            }
+        }
+
+        // Ensure that the grammar is not completed, or that each string failed to match as-expected
+        assert((!completed_grammar) || parse_failed);
+
+        // Reset the grammar stacks
+        grammar->stacks = original_stacks;
+    }
+
+    // Clean up allocated memory
+    llama_grammar_free(grammar);
+}
+
+// Checks that a grammar whose first rule is named "rot" instead of "root"
+// still yields parsed rules, but contains no symbol called "root".
+static void test_failure_missing_root() {
+    // Grammar text with the root rule deliberately misspelled
+    const std::string grammar_str = R"""(rot ::= expr
+expr ::= term ("+" term)*
+term ::= number
+number ::= [0-9]+)""";
+
+    const auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // Parsing itself must have produced rules
+    const bool has_rules = !parsed_grammar.rules.empty();
+    assert(has_rules);
+
+    // ...but none of the parsed symbols may be named "root"
+    const bool has_root = parsed_grammar.symbol_ids.count("root") != 0;
+    assert(!has_root);
+}
+
+static void test_failure_missing_reference() {
+    // Test case for a grammar that is missing a referenced rule
+    const std::string grammar_str = R"""(root ::= expr
+expr ::= term ("+" term)*
+term ::= numero
+number ::= [0-9]+)""";
+
+    fprintf(stderr, "Expected error:  ");
+
+    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // Ensure we did NOT parsed correctly
+    assert(parsed_grammar.rules.empty());
+
+    fprintf(stderr, "End of expected error. Test successful.\n");
+}
+
+// Test driver: runs each grammar integration test in turn. The tests report
+// failure through assert(), so reaching the end means every check passed.
+int main() {
+    test_simple_grammar();
+    test_complex_grammar();
+    test_failure_missing_root();
+    test_failure_missing_reference();
+    return 0;
+}
Author	SHA1	Message	Date
Ralph Soika	b3a96f27f0	minor layout improvements (#6572 ) * minor layout improvements * added missing file, run deps.sh locally	2024-04-10 19:18:25 +02:00
slaren	4f407a0a35	llama : add model types for mixtral (#6589 )	2024-04-10 17:24:14 +02:00
slaren	65c64dc36f	convert.py : add consolidated.safetensors for mixtral 8x22b (#6587 )	2024-04-10 15:23:12 +02:00
Pierrick Hymbert	67fac4b95f	docs : how to add a model (#6565 ) * docs: how to add a model * docs: model: typo and docs * docs: model: add prevision on RoPE * docs: model: rephrasing README.md * docs: model: rephrasing README.md * docs: model: README.md fix trailing spaces * docs : some fixes * Update README.md --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-04-10 09:58:48 +03:00
Artem Zinnatullin	29122d32ac	readme : fix ROCm link (#6579 )	2024-04-10 09:49:12 +03:00
sjxx	b231b37b09	readme : update UI list (#6560 )	2024-04-10 09:34:00 +03:00
Jiří Sejkora	ba5e134e07	readme: fix typo in amdgpu target name (#6573 )	2024-04-10 00:23:02 +02:00
Jared Van Bortel	1b67731e18	BERT tokenizer fixes (#6498 ) Key changes: * BERT conversion: fix abuse of LlamaHfVocab, do not set BOS or EOS * Nomic Embed conversion: pad vocab instead of slicing embedding tensor * llama_tokenize: handle added special tokens like HF does	2024-04-09 13:44:08 -04:00
Georgi Gerganov	c4a3a4ff47	sync : ggml	2024-04-09 20:29:06 +03:00
Ed Lee	400d5d722d	server : detect search query to start webchat (#6554 )	2024-04-09 10:31:47 +02:00
Carolinabanana	5dc9dd7152	llama : add Command R Plus support (#6491 ) * Add Command R Plus GGUF * Add Command R Plus GGUF * Loading works up to LayerNorm2D * Export new tensors in 1D so they are not quantized. * Fix embedding layer based on Noeda's example * Whitespace * Add line * Fix unexpected tokens on MPS. Re-add F16 fix. ((Noeda) * dranger003: Fix block index overflow in CUDA dequantizing. * Reverted blocked multiplication code as it still has issues and could affect other Llama arches * export norms as f32 * fix overflow issues during quant and other cleanup * Type convention Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * dranger003: Fix more int overflow during quant. --------- Co-authored-by: S <seast@Ss-Mac-Studio.local> Co-authored-by: S <s@example.com> Co-authored-by: slaren <slarengh@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-04-09 11:16:13 +03:00
Georgi Gerganov	e11a8999b5	license : update copyright notice + add AUTHORS (#6405 ) * license : add AUTHORS * authors : update * scipts : add LICENSE and gen-authors.sh to sync	2024-04-09 09:23:19 +03:00
Georgi Gerganov	cc4a95426d	llama : fix attention layer count sanity check (#6550 ) * llama : fix attention layer count sanity check * llama : fix parentheses in attention layer count sanity check There was otherwise a warning when compiling. --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net>	2024-04-08 22:25:49 +03:00
kunnis	cecd8d3c98	Comment explaining a decision (#6531 )	2024-04-08 17:44:19 +02:00
Georgi Gerganov	b73e564b16	quantize : fix precedence of cli args (#6541 )	2024-04-08 16:23:01 +03:00
Rick G	e3c337d87c	llama : support negative ith in llama_get_ API (#6519 ) * llama_sampling_sample with default args is more naively usable * Batches populated by either llama_batch_get_one or llama_batch_add work with default args * Previously get_one could use the default argument * Previously add should usually have used the last index where logits[idx] == true * This hopefully encourages the use of llama_batch_add * By giving expected results when using default arguments. * Adds "negative indexing" feature to llama_get_logits_ith and llama_get_embeddings_ith * Believed to work with any currently well behaved program * Default arg now works for both cases (previously would give strange results for add case) * Any non-negative number is unaffected and behaves as previously * Negative arguments were previously invalid. * Implemented as a special case of indexing as suggested by @compilade in https://github.com/ggerganov/llama.cpp/pull/6519 * Fixed mismatch type errors * cited in macOS CI tests * Missed in original updates based on PR feedback in https://github.com/ggerganov/llama.cpp/pull/6519	2024-04-08 16:02:30 +03:00
Jan Boon	beea6e1b16	llama : save and restore kv cache for single seq id (#6341 ) * llama : save and restore kv cache for single seq id * remove trailing whitespace * respond error in case there's no space in the kv cache * add kv seq save restore to test case * add --slot-save-path arg to enable save restore and restrict save location * Returning 0 for some cases, instead of asserting. * cleanup error cases * rename sequence state functions * rename state get set functions * add previous function names back in with DEPRECATED notice * update doc * adjust endpoints to preferred style * fix restoring zero cell count * handle seq rm return value * unused param * keep in the size check * fix return types * add server test case for slot save restore * cleanup * add cake * cleanup style * add special * removing a whole sequence never fails * move sequence state file functionality from server to llama to match session api and add version tags * catch exceptions on save as well * error log messages * check types for stricter restore * update server doc * readme : update API changes date * strict filename validation * move include, reject bom as well * also reject empty filename * reject whitespace and trailing dot --------- Co-authored-by: Martin Evans <martindevans@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-04-08 15:43:30 +03:00
Abhilash Majumder	87fb5b4234	remove row=1 cond (#6532 )	2024-04-08 16:26:01 +08:00
Firat	d752327c33	Adding KodiBot to UI list (#6535 ) KodiBot is free and open source ai chat app released under the GNU General Public License.	2024-04-08 09:48:29 +02:00
Mark Fairbairn	855f54402e	Change Windows AMD example to release build to make inference much faster. (#6525 )	2024-04-07 20:52:19 +02:00
Georgi Gerganov	b909236c0b	flake.lock: Update (#6517 ) Flake lock file updates: • Updated input 'flake-parts': 'github:hercules-ci/flake-parts/f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2' (2024-03-01) → 'github:hercules-ci/flake-parts/9126214d0a59633752a136528f5f3b9aa8565b7d' (2024-04-01) • Updated input 'flake-parts/nixpkgs-lib': 'github:NixOS/nixpkgs/1536926ef5621b09bba54035ae2bb6d806d72ac8?dir=lib' (2024-02-29) → 'github:NixOS/nixpkgs/d8fe5e6c92d0d190646fb9f1056741a229980089?dir=lib' (2024-03-29) • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/d8fe5e6c92d0d190646fb9f1056741a229980089' (2024-03-29) → 'github:NixOS/nixpkgs/fd281bd6b7d3e32ddfa399853946f782553163b5' (2024-04-03) Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>	2024-04-07 11:25:30 -07:00
DAN™	e0717e751e	Add GritLM as supported models. (#6513 )	2024-04-07 19:33:59 +02:00
Georgi Gerganov	c37247796b	sync : ggml	2024-04-07 17:05:51 +03:00
Slava Primenko	f77261a7c5	ggml: bypass code incompatible with CUDA < 11.1 (whisper/2020) `cudaHostRegisterReadOnly` parameter was only introduced in CUDA 11.1 See this issue for more details: https://github.com/ggerganov/examples/whisper/whisper.cpp/issues/2007	2024-04-07 17:05:40 +03:00
Georgi Gerganov	43e8995e75	scripts : sync ggml-cuda folder	2024-04-07 16:08:12 +03:00
limitedAtonement	9472bce308	Run make to build the project (#6457 )	2024-04-07 13:05:40 +02:00
Neo Zhang Jianyu	d4f220a5cc	support/fix OPs GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M (#6521 )	2024-04-07 10:55:59 +08:00
Georgi Gerganov	54ea0698fb	sync : ggml	2024-04-06 18:27:46 +03:00
Daniel Bevenius	b66aec675c	backend : fix typo in scheduler documentation (ggml/781) Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2024-04-06 17:42:26 +03:00
Clint Herron	57dd02c44b	Tests: Added integration tests for GBNF parser (#6472 ) * Added integration tests for GBNF parser to validate correctness of parsing, as well as correctness of string matching. Intended for use to pin behavior while working on performance improvements. * Fixing whitespace errors and cleaning error message alert to be clearer. * Removing hacky include to llama.cpp from grammar integration test now that needed functions are available via internal API. * Comment cleanup. * Reorganizing tests for readability. * Cleaning up debug message to make a bit more sense.	2024-04-06 10:31:33 -04:00
Pierrick Hymbert	75cd4c7729	ci: bench: support sse and fix prompt processing time / server: add tokens usage in stream OAI response (#6495 ) * ci: bench: support sse and fix prompt processing time server: add tokens usage in stream mode * ci: bench: README.md EOL * ci: bench: remove total pp and tg as it is not accurate * ci: bench: fix case when there is no token generated * ci: bench: change to the 95 percentile for pp and tg as it is closer to what the server exports in metrics * ci: bench: fix finish reason rate	2024-04-06 05:40:47 +02:00
Brian	a8bd14d557	gguf.py : add licence and version to gguf writer (#6504 )	2024-04-05 21:41:38 +03:00
Hoang Nguyen	d0f5deebf8	readme : update UI list (#6503 ) * Add MindMac to UI list * Update proprietary description Co-authored-by: slaren <slarengh@gmail.com> --------- Co-authored-by: slaren <slarengh@gmail.com>	2024-04-05 21:39:43 +03:00
Ting Sun	87e21bbacd	bench : make n_batch and n_ubatch configurable in Batched bench (#6500 ) * bench: make n_batch and n_ubatch configurable * bench: update doc for batched bench	2024-04-05 21:34:53 +03:00
Ouadie EL FAROUKI	1b496a745c	[SYCL] Fixed minor bug when enabling FP16 for non intel targets (#6464 ) * moved INTEL_MKL guard from gemm_impl to gemm (wrapper) * Update ggml-sycl.cpp Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> --------- Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>	2024-04-05 19:05:06 +05:30
alexpinel	a307375c02	readme : add Dot to UI list (#6487 )	2024-04-04 13:22:50 -04:00
Jun Jie	b660a5729e	readme : fix typo (#6481 )	2024-04-04 13:16:37 -04:00
Ed Lepedus	0a1d889e27	server: add cURL support to server Dockerfiles (#6474 ) * server: add cURL support to `full.Dockerfile` * server: add cURL support to `full-cuda.Dockerfile` and `server-cuda.Dockerfile` * server: add cURL support to `full-rocm.Dockerfile` and `server-rocm.Dockerfile` * server: add cURL support to `server-intel.Dockerfile` * server: add cURL support to `server-vulkan.Dockerfile` * fix typo in `server-vulkan.Dockerfile` Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-04-04 18:31:22 +02:00
Minsoo Cheong	7dda1b727e	ci: exempt master branch workflows from getting cancelled (#6486 ) * ci: exempt master branch workflows from getting cancelled * apply to bench.yml	2024-04-04 18:30:53 +02:00
Ewout ter Hoeven	c666ba26c3	build CI: Name artifacts (#6482 ) Name the artifacts in the build CI, so that they get uploaded with separate names, instead of all put into the same `artifact` ZIP. It might be possible to further simplify the packing step (in future PRs).	2024-04-04 17:08:55 +02:00
Shakhar Dasgupta	2e66913e5f	server: allow penalizing repetition of newlines on server webpage (#6431 )	2024-04-04 17:03:00 +02:00
Pierrick Hymbert	8120efee1d	ci: bench fix concurrency for workflow trigger dispatch with sha1 (#6478 )	2024-04-04 16:59:04 +02:00
limitedAtonement	a74401f0e5	Correct README link (#6458 ) README is called README.md.	2024-04-04 16:30:02 +02:00
Pierrick Hymbert	7a2c92637a	ci: bench: add more ftype, fix triggers and bot comment (#6466 ) * ci: bench: change trigger path to not spawn on each PR * ci: bench: add more file type for phi-2: q8_0 and f16. - do not show the comment by default * ci: bench: add seed parameter in k6 script * ci: bench: artefact name perf job * Add iteration in the commit status, reduce again the autocomment * ci: bench: add per slot metric in the commit status * Fix trailing spaces	2024-04-04 12:57:58 +03:00
Daniel Bevenius	4bcd6b959c	common: remove duplicate check for curl (#6471 ) This commit removes one of the two identical checks for curl being NULL in llama_load_model_from_url. Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2024-04-04 09:49:21 +02:00
Clint Herron	9b84ae1806	examples : add GBNF validator program (#5948 ) * Revising GBNF validator program to be much simpler. * Changing from streams to using cstdio * Adding final newline character.	2024-04-04 10:44:28 +03:00
Georgi Gerganov	4399f13fb9	server : remove obsolete --memory-f32 option	2024-04-04 09:34:58 +03:00
Xiao-Yong Jin	1a43c7254e	server : add option to disable KV offload (#6468 )	2024-04-04 09:33:48 +03:00
Clint Herron	72d73af651	convert : fix for lint error complaining of bare except (#6470 )	2024-04-04 09:32:53 +03:00
Fattire	5fb1574c81	A few small fixes to server's README docs (#6428 ) * Typo fix to server's README.md Fix minor typo ("tonen") in server README. * server readme grammar/style fixes. Quickly went through this file to look for inconsistencies in presentation of defaults, flag options, and looked for typos and grammar issues. Not perfect, but hopefully improved. * Update README.md Remove an extra space before newline.	2024-04-03 22:22:57 +02:00
JH23X	60cdf40cc3	server : handle exception on wrong type in request (#6452 ) Co-authored-by: Jonas Holzner <jonas.holzner.external@hensoldt.net>	2024-04-03 21:09:52 +03:00
bryanSwk	bb43cf7e9d	llama : add SEA-LION support (#6448 ) * initial commit for sealion support * add sealion support * minor fix * q/k ln and pos_embd only if required * Apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * minor : clear whitespaces --------- Co-authored-by: bryan <bryansiow@aisingapore.org> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-04-03 21:05:10 +03:00
Ewout ter Hoeven	9f62c0173d	ci : update checkout, setup-python and upload-artifact to latest (#6456 ) * CI: Update actions/checkout to v4 * CI: Update actions/setup-python to v5 * CI: Update actions/upload-artifact to v4	2024-04-03 21:01:13 +03:00
Ed Lepedus	5d4f12e462	server: add cURL support to `server.Dockerfile` (#6461 )	2024-04-03 19:56:37 +02:00
Francisco Melo	154d4ee39c	readme : add feature-rich rust bindings (#6465 )	2024-04-03 20:53:37 +03:00
Joyce	e69945d953	security : create policy (#6354 ) * Create SECURITY.md Signed-off-by: Joyce <joycebrum@google.com> * Fix: link on SECURITY.md Signed-off-by: Joyce <joycebrum@google.com> * Fix: link on SECURITY.md Signed-off-by: Joyce <joycebrum@google.com> * minor * fix * fix --------- Signed-off-by: Joyce <joycebrum@google.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-04-03 20:48:07 +03:00
Abhishek Gopinath K	db214fa578	Missing tokenizer.model error during gguf conversion (#6443 ) Co-authored-by: Jared Van Bortel <jared@nomic.ai>	2024-04-03 11:42:52 -04:00
kaizau	1ff4d9f3d6	Add OpenChat, Alpaca, Vicuna chat templates (#6397 ) * Add openchat chat template * Add chat template test for openchat * Add chat template for vicuna * Add chat template for orca-vicuna * Add EOS for vicuna templates * Combine vicuna chat templates * Add tests for openchat and vicuna chat templates * Add chat template for alpaca * Add separate template name for vicuna-orca * Remove alpaca, match deepseek with jinja output * Regenerate chat template test with add_generation_prompt * Separate deepseek bos from system message * Match openchat template with jinja output * Remove BOS token from templates, unprefix openchat	2024-04-03 17:24:31 +02:00
Georgi Gerganov	076b08649e	readme : update hot topics	2024-04-03 16:11:15 +03:00
slaren	08a0c02060	ggml : mul_mat_id use the same tensor for all the experts (#6387 ) * ggml : update mul_mat_id to use the same tensor for all the experts * update cuda * minor * update metal * update test-backend-ops * fix cuda * Update ggml-metal.m Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * update convert.py * update convert-hf-to-gguf.py * update convert.py for mixtral hf models * Update convert-hf-to-gguf.py Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * cuda : support non-pow-2 number of experts * allow quantize to work for split and merged experts models in the same way * cleanup + disable mmap automatically with split tensors models * update imatrix * test-backend-ops : test qwen argsort * update grok model loading * llama : add merged experts tensors to the grok tensor map * minor * gguf : bump version * fix quantizing of merged experts * convert-hf-to-gguf.py : update grok (untested) * make linter happy * cuda/argsort : use shared memory instead of pool memory * convert : fix grok tensor names * metal : add support for non-pow-2 argsort * llama : more loader cleanup, better error checking * cuda : fix warning * llama : still use mmap for loading old models, but copy the data to a host buffer * add review note * llama : remove ffn tensor counting + add sanity check ggml-ci * convert : fix handling of n_experts == None ggml-ci * imatrix : fix ncall counters * llama : produce error if imatrix size does not match * quantize : terminate on errors + trace logs ggml-ci * metal : pad shared memory to 16 bytes --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-04-03 16:07:05 +03:00
Meng, Hengyu	52604860f9	[SYCL] Disable iqx on windows as WA (#6435 ) * disable iqx on windows as WA * array instead of global_memory	2024-04-03 10:34:40 +08:00
Georgi Gerganov	f87f7b8986	flake.lock: Update (#6402 ) Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/44d0940ea560dee511026a53f0e2e2cde489b4d4' (2024-03-23) → 'github:NixOS/nixpkgs/d8fe5e6c92d0d190646fb9f1056741a229980089' (2024-03-29) Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>	2024-04-01 09:05:57 -07:00
Johannes Gäßler	33a5244806	compare-llama-bench.py: fix long hexsha args (#6424 )	2024-04-01 13:30:43 +02:00
Pierrick Hymbert	226e819371	ci: server: verify deps are coherent with the commit (#6409 ) * ci: server: verify deps are coherent with the commit * ci: server: change the ref to build as now it's a pull event target	2024-04-01 12:36:40 +02:00
Georgi Gerganov	c50a82ce0f	readme : update hot topics	2024-03-31 11:56:30 +03:00
Pierrick Hymbert	37e7854c10	ci: bench: fix Resource not accessible by integration on PR event (#6393 )	2024-03-30 12:36:07 +02:00
Mohammadreza Hendiani	c342d070c6	Fedora build update (#6388 ) * fixed deprecated address * fixed deprecated address * fixed deprecated address * Added 'Apache-2.0' SPDX license identifier due to 'kompute.cc' submodule licensing. Explanation of licensing method: https://docs.fedoraproject.org/en-US/legal/spdx/#_and_expressions * Added 'Apache-2.0' SPDX license identifier due to 'kompute.cc' submodule licensing. Explanation of licensing method: https://docs.fedoraproject.org/en-US/legal/spdx/#_and_expressions * Added 'Apache-2.0' SPDX license identifier due to 'kompute.cc' submodule licensing. Explanation of licensing method: https://docs.fedoraproject.org/en-US/legal/spdx/#_and_expressions * reverted back to only the MIT license	2024-03-29 22:59:56 +01:00