Compare commits

...

18 Commits

Author SHA1 Message Date
Jeff Bolz 02115dcd9a vulkan: Allow up to 4096 elements for mul_mat_id row_ids (#13326)
This assert fired running Qwen_Qwen3-30B-A3B-Q2_K.gguf:

GGML_ASSERT(nei0 * nei1 <= 3072);

The tensor is 8 x 512. Increase this array size to accommodate.
2025-05-09 09:23:41 +02:00
Xuan-Son Nguyen d9c4accaff server : (webui) rename has_multimodal --> modalities (#13393)
* server : (webui) rename has_multimodal --> modalities

* allow converting SVG to PNG

* less complicated code
2025-05-09 09:06:37 +02:00
Diego Devesa 15e03282bb ci : limit write permission to only the release step + fixes (#13392)
* ci : limit write permission to only the release step

* fix win cuda file name

* fix license file copy on multi-config generators
2025-05-08 23:45:22 +02:00
Matt Clayton f05a6d71a0 mtmd : Expose helper_decode_image_chunk (#13366)
* mtmd: Expose helper_decode_image, output_embd_copy, image_tokens_copy/free

* Slim down

* Cleanups
2025-05-08 20:25:39 +02:00
Xuan-Son Nguyen ee01d71e58 server : (webui) fix a very small misalignment (#13387)
* server : (webui) fix a very small misalignment

* restore font-bold
2025-05-08 18:51:45 +02:00
Xuan-Son Nguyen 8c83449cb7 server : (webui) revamp the input area, plus many small UI improvements (#13365)
* rework the input area

* process selected file

* change all icons to heroicons

* fix thought process collapse

* move conversation more menu to sidebar

* sun icon --> moon icon

* rm default system message

* stricter upload file check, only allow image if server has mtmd

* build it

* add renaming

* better autoscroll

* build

* add conversation group

* fix scroll

* extra context first, then user input in the end

* fix <hr> tag

* clean up a bit

* build

* add mb-3 for <pre>

* throttle adjustTextareaHeight to make it less laggy

* (nits) missing padding in sidebar

* rm stray console log
2025-05-08 15:37:29 +02:00
Sigbjørn Skjæret 1a844be132 convert : support rope_scaling type and rope_type (#13349) 2025-05-08 15:34:29 +02:00
welix 0ccc121354 mtmd : fix the calculation of n_tokens for smolvlm (#13381)
Co-authored-by: Taichi Nishimura <Taichi.A.Nishimura@sony.com>
2025-05-08 15:03:53 +02:00
Georgi Gerganov 6562e5a4d6 context : allow cache-less context for embeddings (#13108)
* context : allow cache-less context for embeddings

ggml-ci

* context : enable reranking with encode()

ggml-ci

* context : encode() clears embd_seq

ggml-ci

* examples : use llama_encode() when appropriate

ggml-ci

* models : nomic bert moe does not require KV cache

* llama : update comments for llama_decode/llama_encode

ggml-ci

* context : update warning log [no ci]
2025-05-08 14:28:33 +03:00
Georgi Gerganov 51fb96b1ff context : remove logits_all flag (#13284)
* context : remove logits_all flag

ggml-ci

* llama : remove logits_all flag + reorder llama_context_params

ggml-ci
2025-05-08 14:26:50 +03:00
Diego Devesa 70a6991edf ci : move release workflow to a separate file (#13362) 2025-05-08 13:15:28 +02:00
Diego Devesa f061021206 llama : print size and type of overridden tensors (#13364) 2025-05-08 13:15:15 +02:00
Alberto Cabrera Pérez 8733e0cf6e sycl: addressing non-contiguous src1 mul_mats (nc and batched) (#13343)
* sycl: fixed non-contiguous src1 mul_mats (nc and batched)

* Fixed wrong static_cast inside kernel
2025-05-08 10:08:01 +01:00
Diego Devesa 814f795e06 docker : disable arm64 and intel images (#13356) 2025-05-07 16:36:33 +02:00
Georgi Gerganov d879433824 sync : ggml
ggml-ci
2025-05-07 17:28:36 +03:00
Daniel Bevenius 13b0a04597 whisper: remove MSVC warnings pragmas (whisper/3090)
* ggml : remove MSVC warnings pragmas

This commit removes the MSVC-specific pragmas as these are now handled
in ggml/CMakeLists.txt.

* whisper : remove MSVC warning pragmas

This commit removes the MSVC-specific pragmas. These are now handled in
the ggml/CMakeLists.txt file.
2025-05-07 17:28:36 +03:00
Jared Tweed bba9d945c1 cmake : removed stdc++fs (whisper/3097)
* removed stdc++fs

* kept line, but removed stdc++fs
2025-05-07 17:28:36 +03:00
Sigbjørn Skjæret bc4e1128f7 llama : deci : support ffn-free with attention (#13296) 2025-05-07 12:49:27 +02:00
64 changed files with 2406 additions and 1410 deletions
+22
View File
@@ -0,0 +1,22 @@
name: "Determine tag name"
description: "Determine the tag name to use for a release"
outputs:
name:
description: "The name of the tag"
value: ${{ steps.tag.outputs.name }}
runs:
using: "composite"
steps:
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
@@ -0,0 +1,67 @@
name: "Windows - Setup CUDA Toolkit"
description: "Setup CUDA Toolkit for Windows"
inputs:
cuda_version:
description: "CUDA toolkit version"
required: true
runs:
using: "composite"
steps:
- name: Install Cuda Toolkit 11.7
if: ${{ inputs.cuda_version == '11.7' }}
shell: pwsh
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install Cuda Toolkit 12.4
if: ${{ inputs.cuda_version == '12.4' }}
shell: pwsh
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+7 -713
View File
@@ -2,30 +2,19 @@ name: CI
on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: 'Create new release'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
contents: write # for creating release
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
@@ -40,8 +29,6 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -74,33 +61,6 @@ jobs:
cd build
ctest -L 'main|curl' --verbose --timeout 900
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip
macOS-latest-cmake-x64:
runs-on: macos-13
@@ -108,8 +68,6 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -143,33 +101,6 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
ubuntu-cpu-cmake:
strategy:
matrix:
@@ -185,8 +116,6 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -225,33 +154,6 @@ jobs:
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
@@ -378,8 +280,6 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -409,33 +309,6 @@ jobs:
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 2700
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.0.2
@@ -831,8 +704,6 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -935,35 +806,6 @@ jobs:
# $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
# & $sde -future -- ctest -L main -C Release --verbose --timeout 900
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}.zip
ubuntu-latest-cmake-cuda:
runs-on: ubuntu-latest
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
@@ -972,8 +814,6 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
env:
@@ -1005,77 +845,23 @@ jobs:
strategy:
matrix:
cuda: ['12.4', '11.7']
build: ['cuda']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
key: windows-cuda-${{ matrix.cuda }}
variant: ccache
evict-old-files: 1d
- name: Install Cuda Toolkit 11.7
if: ${{ matrix.cuda == '11.7' }}
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install Cuda Toolkit 12.4
if: ${{ matrix.cuda == '12.4' }}
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install Cuda Toolkit
uses: ./.github/actions/windows-setup-cuda
with:
cuda_version: ${{ matrix.cuda }}
- name: Install Ninja
id: install_ninja
@@ -1105,51 +891,6 @@ jobs:
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
run: |
echo "Cuda install location: ${{ env.CUDA_PATH }}"
$dst='.\build\bin\cudart\'
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
windows-latest-cmake-sycl:
runs-on: windows-latest
@@ -1165,8 +906,6 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -1185,52 +924,6 @@ jobs:
id: cmake_build
run: examples/sycl/win-build-sycl.bat
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Build the release package
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }}
runs-on: windows-latest
@@ -1288,110 +981,12 @@ jobs:
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
# TODO: reuse windows-latest-cmake-hip instead of duplicating this job
windows-latest-cmake-hip-release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: windows-latest
strategy:
matrix:
gpu_target: [gfx1100, gfx1101, gfx1030]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-release
evict-old-files: 1d
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DGGML_RPC=ON `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
ios-xcode-build:
runs-on: macos-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Build
id: cmake_build
@@ -1418,32 +1013,6 @@ jobs:
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework
android-build:
runs-on: ubuntu-latest
@@ -1471,283 +1040,8 @@ jobs:
- name: Build
run: |
cd examples/llama.android
./gradlew build --no-daemon
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: ubuntu-latest
needs:
- ubuntu-cpu-cmake
- ubuntu-22-cmake-vulkan
- windows-latest-cmake
- windows-2019-cmake-cuda
- windows-latest-cmake-sycl
- windows-latest-cmake-hip-release
- macOS-latest-cmake-arm64
- macOS-latest-cmake-x64
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: release
evict-old-files: 1d
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
with:
path: ./artifact
- name: Move artifacts
id: move_artifacts
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
- name: Create release
id: create_release
uses: ggml-org/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.tag.outputs.name }}
- name: Upload release
id: upload_release
uses: actions/github-script@v3
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./artifact/release')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./artifact/release/${file}`)
});
}
}
# ubuntu-latest-gcc:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# build: [Debug, Release]
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
# sudo apt-get install cmake
#
# - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
#
# - name: Build
# run: |
# make
#
# ubuntu-latest-clang:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# build: [Debug, Release]
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
# sudo apt-get install cmake
#
# - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
#
# - name: Build
# run: |
# make
#
# ubuntu-latest-gcc-sanitized:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
# sudo apt-get install cmake
#
# - name: Configure
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
#
# - name: Build
# run: |
# make
#
# windows:
# runs-on: windows-latest
#
# strategy:
# matrix:
# build: [Release]
# arch: [Win32, x64]
# include:
# - arch: Win32
# s2arc: x86
# - arch: x64
# s2arc: x64
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
#
# - name: Configure
# run: >
# cmake -S . -B ./build -A ${{ matrix.arch }}
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
#
# - name: Build
# run: |
# cd ./build
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
#
# - name: Upload binaries
# uses: actions/upload-artifact@v4
# with:
# name: llama-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
#
# windows-blas:
# runs-on: windows-latest
#
# strategy:
# matrix:
# build: [Release]
# arch: [Win32, x64]
# blas: [ON]
# include:
# - arch: Win32
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
# s2arc: x86
# - arch: x64
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
# s2arc: x64
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
#
# - name: Fetch OpenBLAS
# if: matrix.blas == 'ON'
# run: |
# C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
# 7z x blas.zip -oblas -y
# copy blas/include/cblas.h .
# copy blas/include/openblas_config.h .
# echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
#
# - name: Configure
# run: >
# cmake -S . -B ./build -A ${{ matrix.arch }}
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
#
# - name: Build
# run: |
# cd ./build
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
#
# - name: Copy libopenblas.dll
# if: matrix.blas == 'ON'
# run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
#
# - name: Upload binaries
# if: matrix.blas == 'ON'
# uses: actions/upload-artifact@v4
# with:
# name: llama-blas-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
#
# emscripten:
# runs-on: ubuntu-latest
#
# strategy:
# matrix:
# build: [Release]
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Dependencies
# run: |
# wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
# tar -xvf master.tar.gz
# emsdk-master/emsdk update
# emsdk-master/emsdk install latest
# emsdk-master/emsdk activate latest
#
# - name: Configure
# run: echo "tmp"
#
# - name: Build
# run: |
# pushd emsdk-master
# source ./emsdk_env.sh
# popd
# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# make
openEuler-latest-cmake-cann:
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:
+6 -2
View File
@@ -36,10 +36,14 @@ jobs:
matrix:
config:
# Multi-stage build
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
# Note: the arm64 images are failing, which prevents the amd64 images from being built
# https://github.com/ggml-org/llama.cpp/issues/11888
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
# Note: the intel images are failing due to an out of disk space error
# - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
+709
View File
@@ -0,0 +1,709 @@
name: Create Release
on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: 'Create new release'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
jobs:
macOS-arm64:
runs-on: macos-14
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip
macOS-x64:
runs-on: macos-13
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-x64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
run: |
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
ubuntu-22-cpu:
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-22.04-arm
runs-on: ${{ matrix.os }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-cpu-cmake
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip
ubuntu-22-vulkan:
runs-on: ubuntu-22.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-vulkan
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
- name: Build
id: cmake_build
run: |
cmake -B build \
-DGGML_VULKAN=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip
windows:
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
VULKAN_VERSION: 1.4.309.0
strategy:
matrix:
include:
- build: 'cpu-x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
#- build: 'openblas-x64'
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'vulkan-x64'
defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
- build: 'cpu-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
- build: 'opencl-adreno-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-${{ matrix.build }}
variant: ccache
evict-old-files: 1d
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
mkdir $env:RUNNER_TEMP/openblas
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'opencl-adreno-arm64' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
cmake -B build `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
cmake -B build-arm64-release `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -S . -B build ${{ matrix.defines }} `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas-x64' }}
run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}.zip
windows-cuda:
runs-on: windows-2019
strategy:
matrix:
cuda: ['12.4', '11.7']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-cuda-${{ matrix.cuda }}
variant: ccache
evict-old-files: 1d
- name: Install Cuda Toolkit
uses: ./.github/actions/windows-setup-cuda
with:
cuda_version: ${{ matrix.cuda }}
- name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
shell: cmd
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=ON ^
-DGGML_CUDA=ON ^
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
${{ env.CMAKE_ARGS }}
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
run: |
echo "Cuda install location: ${{ env.CUDA_PATH }}"
$dst='.\build\bin\cudart\'
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime
uses: actions/upload-artifact@v4
with:
path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
windows-sycl:
runs-on: windows-latest
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-sycl
variant: ccache
evict-old-files: 1d
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Build the release package
id: pack_artifacts
run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
windows-hip:
runs-on: windows-latest
strategy:
matrix:
gpu_target: [gfx1100, gfx1101, gfx1030]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-release
evict-old-files: 1d
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
${{ env.CMAKE_ARGS }}
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
ios-xcode-build:
runs-on: macos-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
- name: xcodebuild for swift package
id: xcodebuild
run: |
./build-xcframework.sh
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
contents: write # for creating release
runs-on: ubuntu-latest
needs:
- ubuntu-22-cpu
- ubuntu-22-vulkan
- windows
- windows-cuda
- windows-sycl
- windows-hip
- macOS-arm64
- macOS-x64
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
with:
path: ./artifact
- name: Move artifacts
id: move_artifacts
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
- name: Create release
id: create_release
uses: ggml-org/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.tag.outputs.name }}
- name: Upload release
id: upload_release
uses: actions/github-script@v3
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./artifact/release')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./artifact/release/${file}`)
});
}
}
-17
View File
@@ -252,20 +252,3 @@ configure_file(cmake/llama.pc.in
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
#
# copy the license files
#
# Check if running in GitHub Actions
if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")
# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
endforeach()
endif()
+24
View File
@@ -144,3 +144,27 @@ endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
#
# copy the license files
#
# Check if running in GitHub Actions
if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")
# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
add_custom_command(
POST_BUILD
TARGET ${TARGET}
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${LICENSE_FILE}"
"$<TARGET_FILE_DIR:llama>/${FILENAME}"
COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
endforeach()
endif()
-7
View File
@@ -2097,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.cache_type_v = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
add_opt(common_arg(
{"--perplexity", "--all-logits"},
string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
[](common_params & params) {
params.logits_all = true;
}
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--hellaswag"},
"compute HellaSwag score over random tasks from datafile supplied with -f",
+3 -1
View File
@@ -125,7 +125,9 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
msgs.push_back(msg);
}
} catch (const std::exception & e) {
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2));
// @ngxson : disable otherwise it's bloating the API response
// printf("%s\n", std::string("; messages = ") + messages.dump(2));
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()));
}
return msgs;
-1
View File
@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base;
-1
View File
@@ -324,7 +324,6 @@ struct common_params {
bool ctx_shift = true; // context shift on inifinite text generation
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
+58 -57
View File
@@ -1388,10 +1388,10 @@ class BaichuanModel(TextModel):
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
head_count = self.hparams["num_attention_heads"]
@@ -1512,10 +1512,10 @@ class XverseModel(TextModel):
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
@@ -1828,10 +1828,10 @@ class LlamaModel(TextModel):
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
@@ -2206,10 +2206,10 @@ class DeciModel(TextModel):
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
@@ -2449,10 +2449,10 @@ class MiniCPMModel(TextModel):
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
self.gguf_writer.add_logit_scale(logit_scale)
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
if self.hparams.get("rope_scaling") is not None:
if self.hparams["rope_scaling"].get("type") == "longrope":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
@@ -2597,11 +2597,11 @@ class Qwen2Model(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self._try_set_pooling_type()
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if self.hf_arch == "Qwen2Model":
@@ -2763,11 +2763,11 @@ class Qwen2MoeModel(TextModel):
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
# YaRN is not enabled by default
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
_experts: list[dict[str, Tensor]] | None = None
@@ -3035,7 +3035,7 @@ class Phi3MiniModel(TextModel):
scale = max_pos_embds / orig_max_pos_embds
rope_scaling_type = rope_scaling.get('type', '').lower()
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type')
@@ -3347,10 +3347,10 @@ class InternLM2Model(TextModel):
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
self.gguf_writer.add_file_type(self.ftype)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
num_heads = self.hparams["num_attention_heads"]
@@ -3425,10 +3425,10 @@ class InternLM3Model(TextModel):
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
@@ -4866,12 +4866,12 @@ class DeepseekV2Model(TextModel):
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
_experts: list[dict[str, Tensor]] | None = None
@@ -5363,11 +5363,11 @@ class Glm4Model(TextModel):
super().set_gguf_parameters()
rope_dim = self.hparams["head_dim"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
@@ -5600,10 +5600,10 @@ class ExaoneModel(TextModel):
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
if hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
@@ -5706,10 +5706,11 @@ class BailingMoeModel(TextModel):
rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if (self.hparams.get("rope_scaling") or {}).get("type") == "yarn" and "factor" in self.hparams["rope_scaling"]:
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
else:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+2 -11
View File
@@ -35,23 +35,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
const struct llama_model * model = llama_get_model(ctx);
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_self_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
// encoder-only model
if (llama_encode(ctx, batch) < 0) {
LOG_ERR("%s : failed to encode\n", __func__);
}
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
// decoder-only model
if (llama_decode(ctx, batch) < 0) {
LOG_ERR("%s : failed to decode\n", __func__);
}
if (llama_encode(ctx, batch) < 0) {
LOG_ERR("%s : failed to encode\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
+2
View File
@@ -366,6 +366,8 @@ if (MSVC)
/wd4005 # Macro redefinition
/wd4244 # Conversion from one type to another type, possible loss of data
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
/wd4996 # Disable POSIX deprecation warnings
/wd4702 # Unreachable code warnings
)
function(disable_msvc_warnings target_name)
if(TARGET ${target_name})
+1 -1
View File
@@ -214,7 +214,7 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_libraries(ggml PRIVATE dl stdc++fs)
target_link_libraries(ggml PRIVATE dl)
endif()
function(ggml_add_backend_library backend)
-2
View File
@@ -72,8 +72,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
#elif defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define UNUSED GGML_UNUSED
-6
View File
@@ -20,12 +20,6 @@
#define GROUP_MAX_EPS_IQ1_M 1e-7f
#define GROUP_MAX_EPS_IQ1_S 1e-12f
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid warnings for hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
#endif
#define UNUSED GGML_UNUSED
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
-13
View File
@@ -50,19 +50,6 @@
#include "llamafile/sgemm.h"
#endif
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
// disable POSIX deprecation warnings
// these functions are never going away, anyway
#pragma warning(disable: 4996)
// unreachable code because of multiple instances of code after GGML_ABORT
#pragma warning(disable: 4702)
#endif
// Note: once we move threading into a separate C++ file
// will use std::hardware_destructive_interference_size instead of hardcoding it here
// and we'll use C++ attribute syntax.
-13
View File
@@ -8,19 +8,6 @@
#include <float.h>
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
// disable POSIX deprecation warnings
// these functions are never going away, anyway
#pragma warning(disable: 4996)
// unreachable code because of multiple instances of code after GGML_ABORT
#pragma warning(disable: 4702)
#endif
// ggml_compute_forward_dup
static void ggml_compute_forward_dup_same_cont(
-6
View File
@@ -2,12 +2,6 @@
#include <cassert>
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
#endif
// precomputed gelu table for f16 (128 KB)
ggml_fp16_t ggml_table_gelu_f16[1 << 16];
-4
View File
@@ -130,10 +130,6 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define GGML_CUDA_MAX_STREAMS 8
[[noreturn]]
-6
View File
@@ -19,12 +19,6 @@
#define GROUP_MAX_EPS_IQ1_M 1e-7f
#define GROUP_MAX_EPS_IQ1_S 1e-12f
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid warnings for hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
#endif
#define UNUSED GGML_UNUSED
// reference implementation for deterministic creation of model files
+6 -15
View File
@@ -80,10 +80,6 @@ extern int g_ggml_sycl_disable_optimize;
// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32
#if defined(_MSC_VER)
#pragma warning(disable : 4244 4267) // possible loss of data
#endif
// dmmv = dequantize_mul_mat_vec
#ifndef GGML_SYCL_DMMV_X
#define GGML_SYCL_DMMV_X 32
@@ -118,17 +114,12 @@ static void crash() {
GGML_ABORT("SYCL error");
}
#define SYCL_CHECK(err) \
do { \
auto err_ = (err); \
if (err_ != 0) \
ggml_sycl_error( \
#err, \
__func__, \
__FILE__, \
__LINE__, \
"Meet error in this line code!"); \
} while (0)
#define SYCL_CHECK(err) \
do { \
auto err_ = (err); \
if (err_ != 0) \
ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code."); \
} while (0)
#if DPCT_COMPAT_RT_VERSION >= 11100
#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
+43 -23
View File
@@ -437,41 +437,52 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
}
template <typename src_t, typename dst_t>
static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
const sycl::nd_item<3> &item_ct1) {
static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03,
const sycl::nd_item<3> & item_ct1) {
const int64_t work_group_size = item_ct1.get_local_range(2);
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
const int64_t i01 = item_ct1.get_group(1);
const int64_t i02 = item_ct1.get_group(0) % ne02;
const int64_t i03 = item_ct1.get_group(0) / ne02;
// make each work-item deal with more elements since sycl global range can not exceed max int
const src_t * x = (const src_t *) vx;
for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
y[i] = x[i];
const src_t * x = static_cast<const src_t *>(vx);
const int64_t ix = i03 * s03 + i02 * s02 + i01 * s01;
const int64_t iy = ((i03 * ne02 + i02) * ne01 + i01) * ne00;
#pragma unroll
for (int64_t i00 = global_id; i00 < ne00; i00 += work_group_size * item_ct1.get_group_range(2)) {
y[iy + i00] = static_cast<dst_t>(x[ix + i00]);
}
}
template <typename src_t, typename dst_t>
static void convert_unary_sycl(const void *__restrict__ vx,
dst_t *__restrict__ y, const int64_t k,
dpct::queue_ptr stream) {
const int64_t num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
static void convert_unary_nc_sycl(const void * __restrict__ vx, dst_t * __restrict__ y,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t s01, const int64_t s02, const int64_t s03, dpct::queue_ptr queue) {
dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 });
sycl::range<3> global_size(ne02 * ne03, ne01, ceil_div(ne00, SYCL_DEQUANTIZE_BLOCK_SIZE));
// decrease global range when it exceeds the max int
int64_t local_size = downsample_sycl_global_range(num_blocks, SYCL_DEQUANTIZE_BLOCK_SIZE);
sycl::range<3> block_nums(1, 1, num_blocks);
sycl::range<3> local_range(1, 1, local_size);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
// TODO: Downsample logic is separated from the kernel, a rewrite is desirable
int64_t downsized_workgroup = downsample_sycl_global_range(global_size[0], SYCL_DEQUANTIZE_BLOCK_SIZE);
sycl::range<3> workgroup_size(1, 1, downsized_workgroup);
stream->parallel_for(
sycl::nd_range<3>(block_nums * local_range, local_range),
[=](sycl::nd_item<3> item_ct1) {
convert_unary<src_t>(vx, y, k, item_ct1);
});
}
queue->parallel_for(sycl::nd_range<3>(global_size * workgroup_size, workgroup_size), [=](sycl::nd_item<3> item_ct1) {
convert_unary_nc<src_t>(vx, y, ne00, ne01, ne02, s01, s02, s03, item_ct1);
});
}
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
template <typename src_t, typename dst_t>
static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr queue) {
convert_unary_nc_sycl<src_t>(vx, y, k, 1, 1, 1, k, k, k, queue);
}
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
switch (type) {
case GGML_TYPE_Q4_0:
if (dst->src[0]->extra &&
@@ -574,3 +585,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
return nullptr;
}
}
to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return convert_unary_nc_sycl<float>;
default:
return nullptr;
}
}
+14 -7
View File
@@ -1,6 +1,6 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: MIT
//
@@ -16,12 +16,19 @@
#include "common.hpp"
template <typename T>
using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
int64_t k, dpct::queue_ptr stream);
typedef to_t_sycl_t<float> to_fp32_sycl_t;
using to_t_sycl_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, dpct::queue_ptr stream);
typedef to_t_sycl_t<float> to_fp32_sycl_t;
typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst);
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst);
#endif // GGML_SYCL_CONVERT_HPP
// Nc = Non-contiguous
template <typename T>
using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
int64_t s01, int64_t s02, int64_t s03, dpct::queue_ptr queue);
typedef to_t_nc_sycl_t<sycl::half> to_fp16_nc_sycl_t;
to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type);
#endif // GGML_SYCL_CONVERT_HPP
+75 -84
View File
@@ -2694,35 +2694,31 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
const sycl::half *src1_as_f16, char *dst,
const void **ptrs_src, void **ptrs_dst,
int64_t ne12, int64_t ne13, int64_t ne23,
size_t nb02, size_t nb03, size_t nb12,
size_t nb13, size_t nbd2, size_t nbd3,
int64_t r2, int64_t r3,
const sycl::nd_item<3> &item_ct1) {
int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
item_ct1.get_local_id(2);
int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);
static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, char * dst,
const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23,
size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nbd2, size_t nbd3,
int64_t r2, int64_t r3, const sycl::nd_item<3> & item_ct1) {
const int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
const int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
if (i13 >= ne13 || i12 >= ne12) {
return;
}
int64_t i03 = i13 / r3;
int64_t i02 = i12 / r2;
const int64_t i03 = i13 / r3;
const int64_t i02 = i12 / r2;
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
const uint8_t * src0_bytes = reinterpret_cast<const uint8_t *>(src0_as_f16);
const uint8_t * src1_bytes = reinterpret_cast<const uint8_t *>(src1_as_f16);
uint8_t * dst_bytes = reinterpret_cast<uint8_t *>(dst);
ptrs_src[0 * ne23 + i12 + i13 * ne12] = src0_bytes + i02 * nb02 + i03 * nb03;
ptrs_src[1 * ne23 + i12 + i13 * ne12] = src1_bytes + i12 * nb12 + i13 * nb13;
ptrs_dst[0 * ne23 + i12 + i13 * ne12] = dst_bytes + i12 * nbd2 + i13 * nbd3;
}
static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
const ggml_tensor *src0,
const ggml_tensor *src1,
ggml_tensor *dst) try {
static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
const ggml_tensor * src1, ggml_tensor * dst) try {
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
@@ -2730,102 +2726,100 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
GGML_TENSOR_BINARY_OP_LOCALS
// TODO: see https://github.com/ggml-org/llama.cpp/pull/13155
// Batched mul_mat requires a rewrite to support both oneDNN and non-contiguous dst
GGML_ASSERT(ggml_is_contiguous(dst));
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
queue_ptr main_stream = ctx.stream();;
queue_ptr queue = ctx.stream();
void * src0_ddq = src0->data;
sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
float * src1_ddf = (float *) src1->data;
float * dst_ddf = (float *) dst->data;
dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 });
const sycl::half * src0_f16 = static_cast<const sycl::half *>(src0->data);
float * dst_ddf = static_cast<float *>(dst->data);
const sycl::half * src1_f16 = static_cast<const sycl::half *>(src1->data);
const size_t type_size_src1 = ggml_type_size(src1->type);
GGML_ASSERT(nb10 == type_size_src1);
// SRC1 strides
int64_t s11 = nb11 / type_size_src1;
int64_t s12 = nb12 / type_size_src1;
int64_t s13 = nb13 / type_size_src1;
ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
// convert src1 to fp16
ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
if (src1->type != GGML_TYPE_F16) {
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
const int64_t ne_src1 = ggml_nelements(src1);
src1_f16_alloc.alloc(ne_src1);
GGML_ASSERT(to_fp16_sycl != nullptr);
to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
src1_f16 = src1_f16_alloc.get();
s11 = ne10;
s12 = ne11 * s11;
s13 = ne12 * s12;
}
sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
: src1_f16_alloc.get();
char * dst_t;
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
char * dst_t = reinterpret_cast<char *>(dst_ddf);
dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
dpct::library_data_t mkl_compute_type = dpct::library_data_t::real_float;
dpct::library_data_t mkl_data_type = dpct::library_data_t::real_float;
// dst strides
size_t nbd2 = dst->nb[2];
size_t nbd3 = dst->nb[3];
const float alpha_f32 = 1.0f;
const float beta_f32 = 0.0f;
const float beta_f32 = 0.0f;
const void * alpha = &alpha_f32;
const void * beta = &beta_f32;
dst_t = (char *) dst_ddf;
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
const int64_t r2 = ne12 / ne02;
const int64_t r3 = ne13 / ne03;
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
(const char *) src0_as_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
(const char *) src1_f16, dpct::library_data_t::real_half, nb11 / nb10, nb12 / nb10, beta, (char *) dst_t,
cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type)));
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_t,
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
} else {
const int ne23 = ne12*ne13;
const int ne23 = ne12 * ne13;
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
ggml_sycl_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2 * ne23);
ggml_sycl_pool_alloc<void *> ptrs_dst(ctx.pool(), 1 * ne23);
ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
sycl::range<3> block_dims(1, ne12, ne13);
/*
DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if needed.
*/
{
dpct::has_capability_or_fail(main_stream->get_device(),
{sycl::aspect::fp16});
main_stream->submit([&](sycl::handler &cgh) {
const void **ptrs_src_get = ptrs_src.get();
void **ptrs_dst_get = ptrs_dst.get();
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
k_compute_batched_ptrs(
src0_as_f16, src1_f16,
dst_t, ptrs_src_get,
ptrs_dst_get, ne12, ne13, ne23,
nb02, nb03, nb12_scaled, nb13_scaled,
nbd2, nbd3, r2, r3, item_ct1);
});
queue->submit([&](sycl::handler & cgh) {
const void ** ptrs_src_get = ptrs_src.get();
void ** ptrs_dst_get = ptrs_dst.get();
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
k_compute_batched_ptrs(src0_f16, src1_f16, dst_t, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
});
}
});
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
*queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
(void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
(void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
}
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
} catch (const sycl::exception & exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
std::exit(1);
}
inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
@@ -2966,7 +2960,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
// The kernel from the if path is faster for that specific case, but does not support all mul mats.
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
}
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
@@ -3873,9 +3867,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
if (a->ne[3] != b->ne[3]) {
return false;
}
if (!ggml_is_contiguous(b)) {
return false;
}
ggml_type a_type = a->type;
if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
+2 -2
View File
@@ -1632,7 +1632,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
const uint32_t warps = warptile[0] / warptile[10];
const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size;
const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0;
const uint32_t mmid_row_ids = mul_mat_id ? 4096 * sizeof(uint32_t) : 0;
const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0;
const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size;
@@ -5260,7 +5260,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
const uint64_t nei0 = ids->ne[0];
const uint64_t nei1 = ids->ne[1];
GGML_ASSERT(nei0 * nei1 <= 3072);
GGML_ASSERT(nei0 * nei1 <= 4096);
const uint32_t nbi1 = ids->nb[1];
const uint32_t nbi2 = ids->nb[2];
@@ -103,7 +103,7 @@ shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE];
shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE];
#ifdef MUL_MAT_ID
shared u16vec2 row_ids[3072];
shared u16vec2 row_ids[4096];
#endif // MUL_MAT_ID
#define NUM_WARPS (BLOCK_SIZE / WARP)
@@ -92,7 +92,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
#ifdef MUL_MAT_ID
layout (binding = 3) readonly buffer IDS {int data_ids[];};
shared u16vec4 row_ids[3072];
shared u16vec4 row_ids[4096];
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
B_TYPE b[];
@@ -101,7 +101,7 @@ shared FLOAT_TYPE_VEC2 buf_b_ds[BN];
#define LOAD_VEC_B 4
#ifdef MUL_MAT_ID
shared u16vec2 row_ids[3072];
shared u16vec2 row_ids[4096];
#endif // MUL_MAT_ID
#define NUM_WARPS (BLOCK_SIZE / WARP)
+13 -10
View File
@@ -351,19 +351,17 @@ extern "C" {
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
// TODO: move at the end of the struct
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
// currently works only with CPU execution
ggml_abort_callback abort_callback;
void * abort_callback_data;
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
};
// model quantization parameters
@@ -924,14 +922,19 @@ extern "C" {
// Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API void llama_batch_free(struct llama_batch batch);
// Processes a batch of tokens with the ecoder part of the encoder-decoder model.
// Stores the encoder output internally for later use by the decoder cross-attention layers.
// Process a batch of tokens.
// In contrast to llama_decode() - this call does not use KV cache.
// For encode-decoder contexts, processes the batch using the encoder.
// Can store the encoder output internally for later use by the decoder's cross-attention layers.
// 0 - success
// < 0 - error. the KV cache state is restored to the state before this call
LLAMA_API int32_t llama_encode(
struct llama_context * ctx,
struct llama_batch batch);
// Process a batch of tokens.
// Requires KV cache.
// For encode-decoder contexts, processes the batch using the decoder.
// Positive return values does not mean a fatal error, but rather a warning.
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
+1 -1
View File
@@ -1 +1 @@
0482de9c63b9134eb462c7732888c0ee0dbc2755
b59bddafe278877dfa22a80e53a637513862babb
+25 -14
View File
@@ -116,8 +116,6 @@ llama_context::llama_context(
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
logits_all = params.logits_all;
if (!hparams.vocab_only) {
// GPU backends
for (auto * dev : model.devices) {
@@ -253,7 +251,7 @@ llama_context::llama_context(
}
// reserve worst-case graph
if (!hparams.vocab_only) {
if (!hparams.vocab_only && memory) {
const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
@@ -702,6 +700,8 @@ int llama_context::encode(llama_batch & inp_batch) {
t_compute_start_us = ggml_time_us();
}
embd_seq.clear();
n_queued_tokens += n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -763,12 +763,12 @@ int llama_context::encode(llama_batch & inp_batch) {
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
GGML_ASSERT(backend_embd != nullptr);
GGML_ASSERT(embd != nullptr);
switch (cparams.pooling_type) {
case LLAMA_POOLING_TYPE_NONE:
{
// extract token embeddings
GGML_ASSERT(embd != nullptr);
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
} break;
@@ -793,11 +793,18 @@ int llama_context::encode(llama_batch & inp_batch) {
} break;
case LLAMA_POOLING_TYPE_RANK:
{
// TODO: this likely should be the same logic as in llama_decoder_internal, but better to
// wait for an encoder model that requires this pooling type in order to test it
// https://github.com/ggerganov/llama.cpp/pull/9510
GGML_ABORT("RANK pooling not implemented yet");
}
// extract the rerank score - a single float per sequence
auto & embd_seq_out = embd_seq;
for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
const llama_seq_id seq_id = ubatch.seq_id[s][0];
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
continue;
}
embd_seq_out[seq_id].resize(1);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ABORT("unknown pooling type");
@@ -835,6 +842,11 @@ int llama_context::encode(llama_batch & inp_batch) {
}
int llama_context::decode(llama_batch & inp_batch) {
if (!memory) {
LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__);
return encode(inp_batch);
}
if (inp_batch.n_tokens == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
return -1;
@@ -890,7 +902,7 @@ int llama_context::decode(llama_batch & inp_batch) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0;
}
} else if (logits_all || embd_pooled) {
} else if (embd_pooled) {
n_outputs_all = n_tokens_all;
} else {
// keep last output only
@@ -1853,13 +1865,12 @@ llama_context_params llama_context_default_params() {
/*.cb_eval_user_data =*/ nullptr,
/*.type_k =*/ GGML_TYPE_F16,
/*.type_v =*/ GGML_TYPE_F16,
/*.logits_all =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
return result;
-3
View File
@@ -187,9 +187,6 @@ private:
std::unique_ptr<llama_memory_i> memory;
// TODO: remove
bool logits_all = false;
// decode output (2-dimensional array: [n_outputs][n_vocab])
size_t logits_size = 0; // capacity (of floats) for logits
float * logits = nullptr;
+12 -2
View File
@@ -1651,8 +1651,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
std::regex pattern(overrides->pattern);
if (std::regex_search(tensor_name, pattern)) {
LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
buft = overrides->buft;
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
tensor_name.c_str(),
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
ggml_backend_buft_name(buft));
break;
}
}
@@ -4792,7 +4795,7 @@ struct llm_build_deci : public llm_graph_context {
}
// FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
if (n_head == 0 && n_ff == 0) {
if (n_ff == 0) {
continue;
}
@@ -12849,6 +12852,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
llama_memory_i * res;
switch (arch) {
case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
{
res = nullptr;
} break;
case LLM_ARCH_MAMBA:
case LLM_ARCH_RWKV6:
case LLM_ARCH_RWKV6QWEN2:
-1
View File
@@ -585,7 +585,6 @@ int main(int argc, char ** argv) {
params.out_file = "imatrix.dat" ;
params.n_ctx = 512;
params.logits_all = true;
params.escape = false;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
-8
View File
@@ -99,14 +99,6 @@ int main(int argc, char ** argv) {
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+1 -1
View File
@@ -3010,7 +3010,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
n_patches /= params.proj_scale_factor;
n_patches /= (params.proj_scale_factor * params.proj_scale_factor);
} else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
int n_merge = params.spatial_merge_size;
int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
+78 -51
View File
@@ -580,6 +580,79 @@ struct decode_embd_batch {
}
};
// Helper function for decoding an image whose embeddings have already been calculated
int32_t mtmd_helper_decode_image_chunk(
mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
float * encoded_embd,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past) {
if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
return -1;
}
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
if (!image_tokens) {
LOG_ERR("failed to decode image chunk: image tokens are null\n");
return -1;
}
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
if (mtmd_decode_use_mrope(ctx)) {
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
} else {
batch_embd.set_position_normal(n_past, seq_id);
}
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, false);
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
}
while (i_batch < n_img_batches) { // split into batches
int pos_offset = i_batch*n_batch;
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
int64_t t1 = ggml_time_ms();
int32_t ret = llama_decode(lctx, batch_embd_view);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_set_causal_attn(lctx, true); // restore causal attn
return ret;
}
if (ctx->print_timings) {
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
}
i_batch++;
}
n_past += mtmd_image_tokens_get_n_pos(image_tokens);
*new_n_past = n_past;
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, true);
}
return 0;
}
int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
@@ -591,8 +664,6 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
int32_t ret;
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens;
@@ -637,57 +708,13 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
if (ctx->print_timings) {
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
}
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
if (mtmd_decode_use_mrope(ctx)) {
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
} else {
batch_embd.set_position_normal(n_past, seq_id);
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_batch_free(text_batch);
return ret;
}
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, false);
// TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image
}
while (i_batch < n_img_batches) { // split into batches
int pos_offset = i_batch*n_batch;
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_embd_view);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_set_causal_attn(lctx, true); // restore causal attn
llama_batch_free(text_batch);
return ret;
}
if (ctx->print_timings) {
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
}
i_batch++;
}
n_past += mtmd_image_tokens_get_n_pos(image_tokens);
*new_n_past = n_past;
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, true);
}
} else {
GGML_ABORT("chunk type not supported");
}
+12
View File
@@ -231,6 +231,18 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
bool logits_last,
llama_pos * new_n_past);
// helper function to decode an image whose embeddings have already been calculated
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
float * encoded_embd,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past);
/////////////////////////////////////////
// test function, to be used in test-mtmd-c-api.c
+4 -2
View File
@@ -1554,7 +1554,10 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
if (int(batch_indeces.size()) != num_answers) {
batch_indeces.resize(num_answers);
}
for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
for (int s = 0; s < num_answers; ++s) {
batch_indeces[s] = s0 + s;
}
for (size_t i = 0; i < cur_task.common_prefix; ++i) {
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
@@ -1970,7 +1973,6 @@ int main(int argc, char ** argv) {
common_params params;
params.n_ctx = 512;
params.logits_all = true;
params.escape = false;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
Binary file not shown.
+9 -2
View File
@@ -3214,7 +3214,14 @@ struct server_context {
batch.logits + i,
};
const int ret = llama_decode(ctx, batch_view);
int ret = 0;
if (params_base.embedding || params_base.reranking) {
ret = llama_encode(ctx, batch_view);
} else {
ret = llama_decode(ctx, batch_view);
}
metrics.on_decoded(slots);
if (ret != 0) {
@@ -3943,7 +3950,7 @@ int main(int argc, char ** argv) {
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
server_task_type type,
json & data,
std::function<bool()> is_connection_closed,
const std::function<bool()> & is_connection_closed,
httplib::Response & res,
oaicompat_type oaicompat) {
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
+92 -1
View File
@@ -21,6 +21,8 @@
"postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-dropzone": "^14.3.8",
"react-hot-toast": "^2.5.2",
"react-markdown": "^9.0.3",
"react-router": "^7.1.5",
"rehype-highlight": "^7.0.2",
@@ -2058,6 +2060,15 @@
"dev": true,
"license": "Python-2.0"
},
"node_modules/attr-accept": {
"version": "2.2.5",
"resolved": "https://registry.npmjs.org/attr-accept/-/attr-accept-2.2.5.tgz",
"integrity": "sha512-0bDNnY/u6pPwHDMoF0FieU354oBi0a8rD9FcsLwzcGWbc8KS8KPIi7y+s13OlVY+gMWc/9xEMUgNE6Qm8ZllYQ==",
"license": "MIT",
"engines": {
"node": ">=4"
}
},
"node_modules/autoprefixer": {
"version": "10.4.20",
"resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.20.tgz",
@@ -2804,6 +2815,18 @@
"node": ">=16.0.0"
}
},
"node_modules/file-selector": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/file-selector/-/file-selector-2.1.2.tgz",
"integrity": "sha512-QgXo+mXTe8ljeqUFaX3QVHc5osSItJ/Km+xpocx0aSqWGMSCf6qYs/VnzZgS864Pjn5iceMRFigeAV7AfTlaig==",
"license": "MIT",
"dependencies": {
"tslib": "^2.7.0"
},
"engines": {
"node": ">= 12"
}
},
"node_modules/fill-range": {
"version": "7.1.1",
"resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
@@ -2917,6 +2940,15 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/goober": {
"version": "2.1.16",
"resolved": "https://registry.npmjs.org/goober/-/goober-2.1.16.tgz",
"integrity": "sha512-erjk19y1U33+XAMe1VTvIONHYoSqE4iS7BYUZfHaqeohLmnC0FdxEh7rQU+6MZ4OajItzjZFSRtVANrQwNq6/g==",
"license": "MIT",
"peerDependencies": {
"csstype": "^3.0.10"
}
},
"node_modules/graceful-fs": {
"version": "4.2.11",
"resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
@@ -4674,6 +4706,15 @@
"node": ">=0.10.0"
}
},
"node_modules/object-assign": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
"integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/optionator": {
"version": "0.9.4",
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz",
@@ -4872,6 +4913,17 @@
"url": "https://github.com/prettier/prettier?sponsor=1"
}
},
"node_modules/prop-types": {
"version": "15.8.1",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
"integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
"license": "MIT",
"dependencies": {
"loose-envify": "^1.4.0",
"object-assign": "^4.1.1",
"react-is": "^16.13.1"
}
},
"node_modules/property-information": {
"version": "6.5.0",
"resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz",
@@ -4938,6 +4990,46 @@
"react": "^18.3.1"
}
},
"node_modules/react-dropzone": {
"version": "14.3.8",
"resolved": "https://registry.npmjs.org/react-dropzone/-/react-dropzone-14.3.8.tgz",
"integrity": "sha512-sBgODnq+lcA4P296DY4wacOZz3JFpD99fp+hb//iBO2HHnyeZU3FwWyXJ6salNpqQdsZrgMrotuko/BdJMV8Ug==",
"license": "MIT",
"dependencies": {
"attr-accept": "^2.2.4",
"file-selector": "^2.1.0",
"prop-types": "^15.8.1"
},
"engines": {
"node": ">= 10.13"
},
"peerDependencies": {
"react": ">= 16.8 || 18.0.0"
}
},
"node_modules/react-hot-toast": {
"version": "2.5.2",
"resolved": "https://registry.npmjs.org/react-hot-toast/-/react-hot-toast-2.5.2.tgz",
"integrity": "sha512-Tun3BbCxzmXXM7C+NI4qiv6lT0uwGh4oAfeJyNOjYUejTsm35mK9iCaYLGv8cBz9L5YxZLx/2ii7zsIwPtPUdw==",
"license": "MIT",
"dependencies": {
"csstype": "^3.1.3",
"goober": "^2.1.16"
},
"engines": {
"node": ">=10"
},
"peerDependencies": {
"react": ">=16",
"react-dom": ">=16"
}
},
"node_modules/react-is": {
"version": "16.13.1",
"resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
"integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==",
"license": "MIT"
},
"node_modules/react-markdown": {
"version": "9.0.3",
"resolved": "https://registry.npmjs.org/react-markdown/-/react-markdown-9.0.3.tgz",
@@ -5814,7 +5906,6 @@
"version": "2.8.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
"devOptional": true,
"license": "0BSD"
},
"node_modules/turbo-stream": {
+2
View File
@@ -24,6 +24,8 @@
"postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-dropzone": "^14.3.8",
"react-hot-toast": "^2.5.2",
"react-markdown": "^9.0.3",
"react-router": "^7.1.5",
"rehype-highlight": "^7.0.2",
+2
View File
@@ -4,6 +4,7 @@ import Sidebar from './components/Sidebar';
import { AppContextProvider, useAppContext } from './utils/app.context';
import ChatScreen from './components/ChatScreen';
import SettingDialog from './components/SettingDialog';
import { Toaster } from 'react-hot-toast';
function App() {
return (
@@ -40,6 +41,7 @@ function AppLayout() {
onClose={() => setShowSettings(false)}
/>
}
<Toaster />
</>
);
}
+1 -1
View File
@@ -12,7 +12,7 @@ export const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
// Do not use nested objects, keep it single level. Prefix the key if you need to group them.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
systemMessage: '',
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
@@ -0,0 +1,92 @@
import { DocumentTextIcon, XMarkIcon } from '@heroicons/react/24/outline';
import { MessageExtra } from '../utils/types';
import { useState } from 'react';
import { classNames } from '../utils/misc';
export default function ChatInputExtraContextItem({
items,
removeItem,
clickToShow,
}: {
items?: MessageExtra[];
removeItem?: (index: number) => void;
clickToShow?: boolean;
}) {
const [show, setShow] = useState(-1);
const showingItem = show >= 0 ? items?.[show] : undefined;
if (!items) return null;
return (
<div className="flex flex-row gap-4 overflow-x-auto py-2 px-1 mb-1">
{items.map((item, i) => (
<div
className="indicator"
key={i}
onClick={() => clickToShow && setShow(i)}
>
{removeItem && (
<div className="indicator-item indicator-top">
<button
className="btn btn-neutral btn-sm w-4 h-4 p-0 rounded-full"
onClick={() => removeItem(i)}
>
<XMarkIcon className="h-3 w-3" />
</button>
</div>
)}
<div
className={classNames({
'flex flex-row rounded-md shadow-sm items-center m-0 p-0': true,
'cursor-pointer hover:shadow-md': !!clickToShow,
})}
>
{item.type === 'imageFile' ? (
<>
<img
src={item.base64Url}
alt={item.name}
className="w-14 h-14 object-cover rounded-md"
/>
</>
) : (
<>
<div className="w-14 h-14 flex items-center justify-center">
<DocumentTextIcon className="h-8 w-14 text-base-content/50" />
</div>
<div className="text-xs pr-4">
<b>{item.name ?? 'Extra content'}</b>
</div>
</>
)}
</div>
</div>
))}
{showingItem && (
<dialog className="modal modal-open">
<div className="modal-box">
<div className="flex justify-between items-center mb-4">
<b>{showingItem.name ?? 'Extra content'}</b>
<button className="btn btn-ghost btn-sm">
<XMarkIcon className="h-5 w-5" onClick={() => setShow(-1)} />
</button>
</div>
{showingItem.type === 'imageFile' ? (
<img src={showingItem.base64Url} alt={showingItem.name} />
) : (
<div className="overflow-x-auto">
<pre className="whitespace-pre-wrap break-words text-sm">
{showingItem.content}
</pre>
</div>
)}
</div>
<div className="modal-backdrop" onClick={() => setShow(-1)}></div>
</dialog>
)}
</div>
);
}
@@ -3,7 +3,14 @@ import { useAppContext } from '../utils/app.context';
import { Message, PendingMessage } from '../utils/types';
import { classNames } from '../utils/misc';
import MarkdownDisplay, { CopyButton } from './MarkdownDisplay';
import { ChevronLeftIcon, ChevronRightIcon } from '@heroicons/react/24/outline';
import {
ArrowPathIcon,
ChevronLeftIcon,
ChevronRightIcon,
PencilSquareIcon,
} from '@heroicons/react/24/outline';
import ChatInputExtraContextItem from './ChatInputExtraContextItem';
import { BtnWithTooltips } from '../utils/common';
interface SplitMessage {
content: PendingMessage['content'];
@@ -85,10 +92,14 @@ export default function ChatMessage({
'chat-end': msg.role === 'user',
})}
>
{msg.extra && msg.extra.length > 0 && (
<ChatInputExtraContextItem items={msg.extra} clickToShow />
)}
<div
className={classNames({
'chat-bubble markdown': true,
'chat-bubble-base-300': msg.role !== 'user',
'chat-bubble bg-transparent': msg.role !== 'user',
})}
>
{/* textarea for editing message */}
@@ -133,59 +144,11 @@ export default function ChatMessage({
{/* render message as markdown */}
<div dir="auto">
{thought && (
<details
className="collapse bg-base-200 collapse-arrow mb-4"
open={isThinking && config.showThoughtInProgress}
>
<summary className="collapse-title">
{isPending && isThinking ? (
<span>
<span
v-if="isGenerating"
className="loading loading-spinner loading-md mr-2"
style={{ verticalAlign: 'middle' }}
></span>
<b>Thinking</b>
</span>
) : (
<b>Thought Process</b>
)}
</summary>
<div className="collapse-content">
<MarkdownDisplay
content={thought}
isGenerating={isPending}
/>
</div>
</details>
)}
{msg.extra && msg.extra.length > 0 && (
<details
className={classNames({
'collapse collapse-arrow mb-4 bg-base-200': true,
'bg-opacity-10': msg.role !== 'assistant',
})}
>
<summary className="collapse-title">
Extra content
</summary>
<div className="collapse-content">
{msg.extra.map(
(extra, i) =>
extra.type === 'textFile' ? (
<div key={extra.name}>
<b>{extra.name}</b>
<pre>{extra.content}</pre>
</div>
) : extra.type === 'context' ? (
<div key={i}>
<pre>{extra.content}</pre>
</div>
) : null // TODO: support other extra types
)}
</div>
</details>
<ThoughtProcess
isThinking={!!isThinking && !!isPending}
content={thought}
open={config.showThoughtInProgress}
/>
)}
<MarkdownDisplay
@@ -259,34 +222,36 @@ export default function ChatMessage({
)}
{/* user message */}
{msg.role === 'user' && (
<button
className="badge btn-mini show-on-hover"
<BtnWithTooltips
className="btn-mini show-on-hover w-8 h-8"
onClick={() => setEditingContent(msg.content)}
disabled={msg.content === null}
tooltipsContent="Edit message"
>
Edit
</button>
<PencilSquareIcon className="h-4 w-4" />
</BtnWithTooltips>
)}
{/* assistant message */}
{msg.role === 'assistant' && (
<>
{!isPending && (
<button
className="badge btn-mini show-on-hover mr-2"
<BtnWithTooltips
className="btn-mini show-on-hover w-8 h-8"
onClick={() => {
if (msg.content !== null) {
onRegenerateMessage(msg as Message);
}
}}
disabled={msg.content === null}
tooltipsContent="Regenerate response"
>
🔄 Regenerate
</button>
<ArrowPathIcon className="h-4 w-4" />
</BtnWithTooltips>
)}
</>
)}
<CopyButton
className="badge btn-mini show-on-hover mr-2"
className="btn-mini show-on-hover w-8 h-8"
content={msg.content}
/>
</div>
@@ -294,3 +259,44 @@ export default function ChatMessage({
</div>
);
}
function ThoughtProcess({
isThinking,
content,
open,
}: {
isThinking: boolean;
content: string;
open: boolean;
}) {
return (
<div
tabIndex={0}
className={classNames({
'collapse bg-none': true,
})}
>
<input type="checkbox" defaultChecked={open} />
<div className="collapse-title px-0">
<div className="btn rounded-xl">
{isThinking ? (
<span>
<span
className="loading loading-spinner loading-md mr-2"
style={{ verticalAlign: 'middle' }}
></span>
Thinking
</span>
) : (
<>Thought Process</>
)}
</div>
</div>
<div className="collapse-content text-base-content/70 text-sm p-1">
<div className="border-l-2 border-base-content/20 pl-4 mb-4">
<MarkdownDisplay content={content} />
</div>
</div>
</div>
);
}
+169 -65
View File
@@ -1,12 +1,25 @@
import { useEffect, useMemo, useState } from 'react';
import { useEffect, useMemo, useRef, useState } from 'react';
import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
import ChatMessage from './ChatMessage';
import { CanvasType, Message, PendingMessage } from '../utils/types';
import { classNames, cleanCurrentUrl, throttle } from '../utils/misc';
import { classNames, cleanCurrentUrl } from '../utils/misc';
import CanvasPyInterpreter from './CanvasPyInterpreter';
import StorageUtils from '../utils/storage';
import { useVSCodeContext } from '../utils/llama-vscode';
import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts';
import {
ArrowUpIcon,
StopIcon,
PaperClipIcon,
} from '@heroicons/react/24/solid';
import {
ChatExtraContextApi,
useChatExtraContext,
} from './useChatExtraContext.tsx';
import Dropzone from 'react-dropzone';
import toast from 'react-hot-toast';
import ChatInputExtraContextItem from './ChatInputExtraContextItem.tsx';
import { scrollToBottom, useChatScroll } from './useChatScroll.tsx';
/**
* A message display is a message node with additional information for rendering.
@@ -72,24 +85,6 @@ function getListMessageDisplay(
return res;
}
const scrollToBottom = throttle(
(requiresNearBottom: boolean, delay: number = 80) => {
const mainScrollElem = document.getElementById('main-scroll');
if (!mainScrollElem) return;
const spaceToBottom =
mainScrollElem.scrollHeight -
mainScrollElem.scrollTop -
mainScrollElem.clientHeight;
if (!requiresNearBottom || spaceToBottom < 50) {
setTimeout(
() => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }),
delay
);
}
},
80
);
export default function ChatScreen() {
const {
viewingChat,
@@ -102,10 +97,11 @@ export default function ChatScreen() {
} = useAppContext();
const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content());
const extraContext = useChatExtraContext();
useVSCodeContext(textarea, extraContext);
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
// TODO: improve this when we have "upload file" feature
const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
const msgListRef = useRef<HTMLDivElement>(null);
useChatScroll(msgListRef);
// keep track of leaf node for rendering
const [currNodeId, setCurrNodeId] = useState<number>(-1);
@@ -129,13 +125,15 @@ export default function ChatScreen() {
if (currLeafNodeId) {
setCurrNodeId(currLeafNodeId);
}
scrollToBottom(true);
// useChatScroll will handle the auto scroll
};
const sendNewMessage = async () => {
const lastInpMsg = textarea.value();
if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? ''))
if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? '')) {
toast.error('Please enter a message');
return;
}
textarea.setValue('');
scrollToBottom(false);
setCurrNodeId(-1);
@@ -146,7 +144,7 @@ export default function ChatScreen() {
currConvId,
lastMsgNodeId,
lastInpMsg,
currExtra,
extraContext.items,
onChunk
))
) {
@@ -154,7 +152,7 @@ export default function ChatScreen() {
textarea.setValue(lastInpMsg);
}
// OK
clearExtraContext();
extraContext.clearItems();
};
// for vscode context
@@ -234,10 +232,17 @@ export default function ChatScreen() {
})}
>
{/* chat messages */}
<div id="messages-list" className="grow">
<div className="mt-auto flex justify-center">
<div id="messages-list" className="grow" ref={msgListRef}>
<div className="mt-auto flex flex-col items-center">
{/* placeholder to shift the message to the bottom */}
{viewingChat ? '' : 'Send a message to start'}
{viewingChat ? (
''
) : (
<>
<div className="mb-4">Send a message to start</div>
<ServerInfo />
</>
)}
</div>
{[...messages, ...pendingMsgDisplay].map((msg) => (
<ChatMessage
@@ -248,46 +253,19 @@ export default function ChatScreen() {
onRegenerateMessage={handleRegenerateMessage}
onEditMessage={handleEditMessage}
onChangeSibling={setCurrNodeId}
isPending={msg.isPending}
/>
))}
</div>
{/* chat input */}
<div className="flex flex-row items-end pt-8 pb-6 sticky bottom-0 bg-base-100">
<textarea
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
className="textarea textarea-bordered w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
placeholder="Type a message (Shift+Enter to add a new line)"
ref={textarea.ref}
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
onKeyDown={(e) => {
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
sendNewMessage();
}
}}
id="msg-input"
dir="auto"
// Set a base height of 2 rows for mobile views
// On lg+ screens, the hook will calculate and set the initial height anyway
rows={2}
></textarea>
{isGenerating(currConvId ?? '') ? (
<button
className="btn btn-neutral ml-2"
onClick={() => stopGenerating(currConvId ?? '')}
>
Stop
</button>
) : (
<button className="btn btn-primary ml-2" onClick={sendNewMessage}>
Send
</button>
)}
</div>
<ChatInput
textarea={textarea}
extraContext={extraContext}
onSend={sendNewMessage}
onStop={() => stopGenerating(currConvId ?? '')}
isGenerating={isGenerating(currConvId ?? '')}
/>
</div>
<div className="w-full sticky top-[7em] h-[calc(100vh-9em)]">
{canvasData?.type === CanvasType.PY_INTERPRETER && (
@@ -297,3 +275,129 @@ export default function ChatScreen() {
</div>
);
}
function ServerInfo() {
const { serverProps } = useAppContext();
return (
<div className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6">
<div className="card-body">
<b>Server Info</b>
<p>
<b>Model</b>: {serverProps?.model_path?.split(/(\\|\/)/).pop()}
<br />
<b>Build</b>: {serverProps?.build_info}
<br />
</p>
</div>
</div>
);
}
function ChatInput({
textarea,
extraContext,
onSend,
onStop,
isGenerating,
}: {
textarea: ChatTextareaApi;
extraContext: ChatExtraContextApi;
onSend: () => void;
onStop: () => void;
isGenerating: boolean;
}) {
const [isDrag, setIsDrag] = useState(false);
return (
<div
className={classNames({
'flex items-end pt-8 pb-6 sticky bottom-0 bg-base-100': true,
'opacity-50': isDrag, // simply visual feedback to inform user that the file will be accepted
})}
>
<Dropzone
noClick
onDrop={(files: File[]) => {
setIsDrag(false);
extraContext.onFileAdded(files);
}}
onDragEnter={() => setIsDrag(true)}
onDragLeave={() => setIsDrag(false)}
multiple={true}
>
{({ getRootProps, getInputProps }) => (
<div
className="flex flex-col rounded-xl border-1 border-base-content/30 p-3 w-full"
{...getRootProps()}
>
{!isGenerating && (
<ChatInputExtraContextItem
items={extraContext.items}
removeItem={extraContext.removeItem}
/>
)}
<div className="flex flex-row w-full">
<textarea
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
className="text-md outline-none border-none w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
placeholder="Type a message (Shift+Enter to add a new line)"
ref={textarea.ref}
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
onKeyDown={(e) => {
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
onSend();
}
}}
id="msg-input"
dir="auto"
// Set a base height of 2 rows for mobile views
// On lg+ screens, the hook will calculate and set the initial height anyway
rows={2}
></textarea>
{/* buttons area */}
<div className="flex flex-row gap-2 ml-2">
<label
htmlFor="file-upload"
className={classNames({
'btn w-8 h-8 p-0 rounded-full': true,
'btn-disabled': isGenerating,
})}
>
<PaperClipIcon className="h-5 w-5" />
</label>
<input
id="file-upload"
type="file"
className="hidden"
disabled={isGenerating}
{...getInputProps()}
hidden
/>
{isGenerating ? (
<button
className="btn btn-neutral w-8 h-8 p-0 rounded-full"
onClick={onStop}
>
<StopIcon className="h-5 w-5" />
</button>
) : (
<button
className="btn btn-primary w-8 h-8 p-0 rounded-full"
onClick={onSend}
>
<ArrowUpIcon className="h-5 w-5" />
</button>
)}
</div>
</div>
</div>
)}
</Dropzone>
</div>
);
}
+8 -98
View File
@@ -4,10 +4,13 @@ import { useAppContext } from '../utils/app.context';
import { classNames } from '../utils/misc';
import daisyuiThemes from 'daisyui/theme/object';
import { THEMES } from '../Config';
import { useNavigate } from 'react-router';
import {
Cog8ToothIcon,
MoonIcon,
Bars3Icon,
} from '@heroicons/react/24/outline';
export default function Header() {
const navigate = useNavigate();
const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
const { setShowSettings } = useAppContext();
@@ -24,105 +27,21 @@ export default function Header() {
);
}, [selectedTheme]);
const { isGenerating, viewingChat } = useAppContext();
const isCurrConvGenerating = isGenerating(viewingChat?.conv.id ?? '');
const removeConversation = () => {
if (isCurrConvGenerating || !viewingChat) return;
const convId = viewingChat?.conv.id;
if (window.confirm('Are you sure to delete this conversation?')) {
StorageUtils.remove(convId);
navigate('/');
}
};
const downloadConversation = () => {
if (isCurrConvGenerating || !viewingChat) return;
const convId = viewingChat?.conv.id;
const conversationJson = JSON.stringify(viewingChat, null, 2);
const blob = new Blob([conversationJson], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `conversation_${convId}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
};
return (
<div className="flex flex-row items-center pt-6 pb-6 sticky top-0 z-10 bg-base-100">
{/* open sidebar button */}
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-list"
viewBox="0 0 16 16"
>
<path
fillRule="evenodd"
d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"
/>
</svg>
<Bars3Icon className="h-5 w-5" />
</label>
<div className="grow text-2xl font-bold ml-2">llama.cpp</div>
{/* action buttons (top right) */}
<div className="flex items-center">
{viewingChat && (
<div className="dropdown dropdown-end">
{/* "..." button */}
<button
tabIndex={0}
role="button"
className="btn m-1"
disabled={isCurrConvGenerating}
>
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-three-dots-vertical"
viewBox="0 0 16 16"
>
<path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0" />
</svg>
</button>
{/* dropdown menu */}
<ul
tabIndex={0}
className="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow"
>
<li onClick={downloadConversation}>
<a>Download</a>
</li>
<li className="text-error" onClick={removeConversation}>
<a>Delete</a>
</li>
</ul>
</div>
)}
<div className="tooltip tooltip-bottom" data-tip="Settings">
<button className="btn" onClick={() => setShowSettings(true)}>
{/* settings button */}
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-gear"
viewBox="0 0 16 16"
>
<path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0" />
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z" />
</svg>
<Cog8ToothIcon className="w-5 h-5" />
</button>
</div>
@@ -130,16 +49,7 @@ export default function Header() {
<div className="tooltip tooltip-bottom" data-tip="Themes">
<div className="dropdown dropdown-end dropdown-bottom">
<div tabIndex={0} role="button" className="btn m-1">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-palette2"
viewBox="0 0 16 16"
>
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z" />
</svg>
<MoonIcon className="w-5 h-5" />
</div>
<ul
tabIndex={0}
@@ -11,6 +11,8 @@ import { ElementContent, Root } from 'hast';
import { visit } from 'unist-util-visit';
import { useAppContext } from '../utils/app.context';
import { CanvasType } from '../utils/types';
import { BtnWithTooltips } from '../utils/common';
import { DocumentDuplicateIcon, PlayIcon } from '@heroicons/react/24/outline';
export default function MarkdownDisplay({
content,
@@ -81,10 +83,13 @@ const CodeBlockButtons: React.ElementType<
'display-none': !node?.position,
})}
>
<CopyButton className="badge btn-mini" content={copiedContent} />
<CopyButton
className="badge btn-mini btn-soft shadow-sm"
content={copiedContent}
/>
{canRunCode && (
<RunPyCodeButton
className="badge btn-mini ml-2"
className="badge btn-mini shadow-sm ml-2"
content={copiedContent}
/>
)}
@@ -101,16 +106,17 @@ export const CopyButton = ({
}) => {
const [copied, setCopied] = useState(false);
return (
<button
<BtnWithTooltips
className={className}
onClick={() => {
copyStr(content);
setCopied(true);
}}
onMouseLeave={() => setCopied(false)}
tooltipsContent={copied ? 'Copied!' : 'Copy'}
>
{copied ? 'Copied!' : '📋 Copy'}
</button>
<DocumentDuplicateIcon className="h-4 w-4" />
</BtnWithTooltips>
);
};
@@ -124,7 +130,7 @@ export const RunPyCodeButton = ({
const { setCanvasData } = useAppContext();
return (
<>
<button
<BtnWithTooltips
className={className}
onClick={() =>
setCanvasData({
@@ -132,9 +138,10 @@ export const RunPyCodeButton = ({
content,
})
}
tooltipsContent="Run code"
>
Run
</button>
<PlayIcon className="h-4 w-4" />
</BtnWithTooltips>
</>
);
};
+269 -30
View File
@@ -1,13 +1,26 @@
import { useEffect, useState } from 'react';
import { useEffect, useMemo, useState } from 'react';
import { classNames } from '../utils/misc';
import { Conversation } from '../utils/types';
import StorageUtils from '../utils/storage';
import { useNavigate, useParams } from 'react-router';
import {
ArrowDownTrayIcon,
EllipsisVerticalIcon,
PencilIcon,
PencilSquareIcon,
TrashIcon,
XMarkIcon,
} from '@heroicons/react/24/outline';
import { BtnWithTooltips } from '../utils/common';
import { useAppContext } from '../utils/app.context';
import toast from 'react-hot-toast';
export default function Sidebar() {
const params = useParams();
const navigate = useNavigate();
const { isGenerating } = useAppContext();
const [conversations, setConversations] = useState<Conversation[]>([]);
const [currConv, setCurrConv] = useState<Conversation | null>(null);
@@ -26,6 +39,11 @@ export default function Sidebar() {
};
}, []);
const groupedConv = useMemo(
() => groupConversationsByDate(conversations),
[conversations]
);
return (
<>
<input
@@ -47,46 +65,100 @@ export default function Sidebar() {
{/* close sidebar button */}
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
<svg
xmlns="http://www.w3.org/2000/svg"
width="16"
height="16"
fill="currentColor"
className="bi bi-arrow-bar-left"
viewBox="0 0 16 16"
>
<path
fillRule="evenodd"
d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"
/>
</svg>
<XMarkIcon className="w-5 h-5" />
</label>
</div>
{/* list of conversations */}
{/* new conversation button */}
<div
className={classNames({
'btn btn-ghost justify-start': true,
'btn-active': !currConv,
'btn btn-ghost justify-start px-2': true,
'btn-soft': !currConv,
})}
onClick={() => navigate('/')}
>
+ New conversation
<PencilSquareIcon className="w-5 h-5" />
New conversation
</div>
{conversations.map((conv) => (
<div
key={conv.id}
className={classNames({
'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === currConv?.id,
})}
onClick={() => navigate(`/chat/${conv.id}`)}
dir="auto"
>
<span className="truncate">{conv.name}</span>
{/* list of conversations */}
{groupedConv.map((group, i) => (
<div key={i}>
{/* group name (by date) */}
{group.title ? (
// we use btn class here to make sure that the padding/margin are aligned with the other items
<b className="btn btn-ghost btn-xs bg-none btn-disabled block text-xs text-base-content text-start px-2 mb-0 mt-6 font-bold">
{group.title}
</b>
) : (
<div className="h-2" />
)}
{group.conversations.map((conv) => (
<ConversationItem
key={conv.id}
conv={conv}
isCurrConv={currConv?.id === conv.id}
onSelect={() => {
navigate(`/chat/${conv.id}`);
}}
onDelete={() => {
if (isGenerating(conv.id)) {
toast.error(
'Cannot delete conversation while generating'
);
return;
}
if (
window.confirm(
'Are you sure to delete this conversation?'
)
) {
toast.success('Conversation deleted');
StorageUtils.remove(conv.id);
navigate('/');
}
}}
onDownload={() => {
if (isGenerating(conv.id)) {
toast.error(
'Cannot download conversation while generating'
);
return;
}
const conversationJson = JSON.stringify(conv, null, 2);
const blob = new Blob([conversationJson], {
type: 'application/json',
});
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `conversation_${conv.id}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
}}
onRename={() => {
if (isGenerating(conv.id)) {
toast.error(
'Cannot rename conversation while generating'
);
return;
}
const newName = window.prompt(
'Enter new name for the conversation',
conv.name
);
if (newName && newName.trim().length > 0) {
StorageUtils.updateConversationName(conv.id, newName);
}
}}
/>
))}
</div>
))}
<div className="text-center text-xs opacity-40 mt-auto mx-4">
<div className="text-center text-xs opacity-40 mt-auto mx-4 pt-8">
Conversations are saved to browser's IndexedDB
</div>
</div>
@@ -94,3 +166,170 @@ export default function Sidebar() {
</>
);
}
function ConversationItem({
conv,
isCurrConv,
onSelect,
onDelete,
onDownload,
onRename,
}: {
conv: Conversation;
isCurrConv: boolean;
onSelect: () => void;
onDelete: () => void;
onDownload: () => void;
onRename: () => void;
}) {
return (
<div
className={classNames({
'group flex flex-row btn btn-ghost justify-start items-center font-normal px-2 h-9':
true,
'btn-soft': isCurrConv,
})}
>
<div
key={conv.id}
className="w-full overflow-hidden truncate text-start"
onClick={onSelect}
dir="auto"
>
{conv.name}
</div>
<div className="dropdown dropdown-end h-5">
<BtnWithTooltips
// on mobile, we always show the ellipsis icon
// on desktop, we only show it when the user hovers over the conversation item
// we use opacity instead of hidden to avoid layout shift
className="cursor-pointer opacity-100 md:opacity-0 group-hover:opacity-100"
onClick={() => {}}
tooltipsContent="More"
>
<EllipsisVerticalIcon className="w-5 h-5" />
</BtnWithTooltips>
{/* dropdown menu */}
<ul
tabIndex={0}
className="dropdown-content menu bg-base-100 rounded-box z-[1] p-2 shadow"
>
<li onClick={onRename}>
<a>
<PencilIcon className="w-4 h-4" />
Rename
</a>
</li>
<li onClick={onDownload}>
<a>
<ArrowDownTrayIcon className="w-4 h-4" />
Download
</a>
</li>
<li className="text-error" onClick={onDelete}>
<a>
<TrashIcon className="w-4 h-4" />
Delete
</a>
</li>
</ul>
</div>
</div>
);
}
// WARN: vibe code below
export interface GroupedConversations {
title?: string;
conversations: Conversation[];
}
// TODO @ngxson : add test for this function
// Group conversations by date
// - "Previous 7 Days"
// - "Previous 30 Days"
// - "Month Year" (e.g., "April 2023")
export function groupConversationsByDate(
conversations: Conversation[]
): GroupedConversations[] {
const now = new Date();
const today = new Date(now.getFullYear(), now.getMonth(), now.getDate()); // Start of today
const sevenDaysAgo = new Date(today);
sevenDaysAgo.setDate(today.getDate() - 7);
const thirtyDaysAgo = new Date(today);
thirtyDaysAgo.setDate(today.getDate() - 30);
const groups: { [key: string]: Conversation[] } = {
Today: [],
'Previous 7 Days': [],
'Previous 30 Days': [],
};
const monthlyGroups: { [key: string]: Conversation[] } = {}; // Key format: "Month Year" e.g., "April 2023"
// Sort conversations by lastModified date in descending order (newest first)
// This helps when adding to groups, but the final output order of groups is fixed.
const sortedConversations = [...conversations].sort(
(a, b) => b.lastModified - a.lastModified
);
for (const conv of sortedConversations) {
const convDate = new Date(conv.lastModified);
if (convDate >= today) {
groups['Today'].push(conv);
} else if (convDate >= sevenDaysAgo) {
groups['Previous 7 Days'].push(conv);
} else if (convDate >= thirtyDaysAgo) {
groups['Previous 30 Days'].push(conv);
} else {
const monthName = convDate.toLocaleString('default', { month: 'long' });
const year = convDate.getFullYear();
const monthYearKey = `${monthName} ${year}`;
if (!monthlyGroups[monthYearKey]) {
monthlyGroups[monthYearKey] = [];
}
monthlyGroups[monthYearKey].push(conv);
}
}
const result: GroupedConversations[] = [];
if (groups['Today'].length > 0) {
result.push({
title: undefined, // no title for Today
conversations: groups['Today'],
});
}
if (groups['Previous 7 Days'].length > 0) {
result.push({
title: 'Previous 7 Days',
conversations: groups['Previous 7 Days'],
});
}
if (groups['Previous 30 Days'].length > 0) {
result.push({
title: 'Previous 30 Days',
conversations: groups['Previous 30 Days'],
});
}
// Sort monthly groups by date (most recent month first)
const sortedMonthKeys = Object.keys(monthlyGroups).sort((a, b) => {
const dateA = new Date(a); // "Month Year" can be parsed by Date constructor
const dateB = new Date(b);
return dateB.getTime() - dateA.getTime();
});
for (const monthKey of sortedMonthKeys) {
if (monthlyGroups[monthKey].length > 0) {
result.push({ title: monthKey, conversations: monthlyGroups[monthKey] });
}
}
return result;
}
@@ -0,0 +1,234 @@
import { useState } from 'react';
import { MessageExtra } from '../utils/types';
import toast from 'react-hot-toast';
import { useAppContext } from '../utils/app.context';
// Interface describing the API returned by the hook
export interface ChatExtraContextApi {
items?: MessageExtra[]; // undefined if empty, similar to Message['extra']
addItems: (items: MessageExtra[]) => void;
removeItem: (idx: number) => void;
clearItems: () => void;
onFileAdded: (files: File[]) => void; // used by "upload" button
}
export function useChatExtraContext(): ChatExtraContextApi {
const { serverProps } = useAppContext();
const [items, setItems] = useState<MessageExtra[]>([]);
const addItems = (newItems: MessageExtra[]) => {
setItems((prev) => [...prev, ...newItems]);
};
const removeItem = (idx: number) => {
setItems((prev) => prev.filter((_, i) => i !== idx));
};
const clearItems = () => {
setItems([]);
};
const onFileAdded = (files: File[]) => {
for (const file of files) {
const mimeType = file.type;
console.debug({ mimeType, file });
if (file.size > 10 * 1024 * 1024) {
toast.error('File is too large. Maximum size is 10MB.');
break;
}
if (mimeType.startsWith('image/')) {
if (!serverProps?.modalities?.vision) {
toast.error('Multimodal is not supported by this server or model.');
break;
}
const reader = new FileReader();
reader.onload = async (event) => {
if (event.target?.result) {
let base64Url = event.target.result as string;
if (mimeType === 'image/svg+xml') {
// Convert SVG to PNG
base64Url = await svgBase64UrlToPngDataURL(base64Url);
}
addItems([
{
type: 'imageFile',
name: file.name,
base64Url,
},
]);
}
};
reader.readAsDataURL(file);
} else if (
mimeType.startsWith('video/') ||
mimeType.startsWith('audio/')
) {
toast.error('Video and audio files are not supported yet.');
break;
} else if (mimeType.startsWith('application/pdf')) {
toast.error('PDF files are not supported yet.');
break;
} else {
// Because there can be many text file types (like code file), we will not check the mime type
// and will just check if the file is not binary.
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
const content = event.target.result as string;
if (!isLikelyNotBinary(content)) {
toast.error('File is binary. Please upload a text file.');
return;
}
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
}
};
reader.readAsText(file);
}
}
};
return {
items: items.length > 0 ? items : undefined,
addItems,
removeItem,
clearItems,
onFileAdded,
};
}
// WARN: vibe code below
// This code is a heuristic to determine if a string is likely not binary.
// It is necessary because input file can have various mime types which we don't have time to investigate.
// For example, a python file can be text/plain, application/x-python, etc.
export function isLikelyNotBinary(str: string): boolean {
const options = {
prefixLength: 1024 * 10, // Check the first 10KB of the string
suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars
maxAbsoluteNullBytes: 2,
};
if (!str) {
return true; // Empty string is considered "not binary" or trivially text.
}
const sampleLength = Math.min(str.length, options.prefixLength);
if (sampleLength === 0) {
return true; // Effectively an empty string after considering prefixLength.
}
let suspiciousCharCount = 0;
let nullByteCount = 0;
for (let i = 0; i < sampleLength; i++) {
const charCode = str.charCodeAt(i);
// 1. Check for Unicode Replacement Character (U+FFFD)
// This is a strong indicator if the string was created from decoding bytes as UTF-8.
if (charCode === 0xfffd) {
suspiciousCharCount++;
continue;
}
// 2. Check for Null Bytes (U+0000)
if (charCode === 0x0000) {
nullByteCount++;
// We also count nulls towards the general suspicious character count,
// as they are less common in typical text files.
suspiciousCharCount++;
continue;
}
// 3. Check for C0 Control Characters (U+0001 to U+001F)
// Exclude common text control characters: TAB (9), LF (10), CR (13).
// We can also be a bit lenient with BEL (7) and BS (8) which sometimes appear in logs.
if (charCode < 32) {
if (
charCode !== 9 && // TAB
charCode !== 10 && // LF
charCode !== 13 && // CR
charCode !== 7 && // BEL (Bell) - sometimes in logs
charCode !== 8 // BS (Backspace) - less common, but possible
) {
suspiciousCharCount++;
}
}
// Characters from 32 (space) up to 126 (~) are printable ASCII.
// Characters 127 (DEL) is a control character.
// Characters >= 128 are extended ASCII / multi-byte Unicode.
// If they resulted in U+FFFD, we caught it. Otherwise, they are valid
// (though perhaps unusual) Unicode characters from JS's perspective.
// The main concern is if those higher characters came from misinterpreting
// a single-byte encoding as UTF-8, which again, U+FFFD would usually flag.
}
// Check absolute null byte count
if (nullByteCount > options.maxAbsoluteNullBytes) {
return false; // Too many null bytes is a strong binary indicator
}
// Check ratio of suspicious characters
const ratio = suspiciousCharCount / sampleLength;
return ratio <= options.suspiciousCharThresholdRatio;
}
// WARN: vibe code below
// Converts a Base64URL encoded SVG string to a PNG Data URL using browser Canvas API.
function svgBase64UrlToPngDataURL(base64UrlSvg: string): Promise<string> {
const backgroundColor = 'white'; // Default background color for PNG
return new Promise((resolve, reject) => {
try {
const img = new Image();
img.onload = () => {
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
if (!ctx) {
reject(new Error('Failed to get 2D canvas context.'));
return;
}
// Use provided dimensions or SVG's natural dimensions, with fallbacks
// Fallbacks (e.g., 300x300) are for SVGs without explicit width/height
// or when naturalWidth/Height might be 0 before full processing.
const targetWidth = img.naturalWidth || 300;
const targetHeight = img.naturalHeight || 300;
canvas.width = targetWidth;
canvas.height = targetHeight;
if (backgroundColor) {
ctx.fillStyle = backgroundColor;
ctx.fillRect(0, 0, canvas.width, canvas.height);
}
ctx.drawImage(img, 0, 0, targetWidth, targetHeight);
resolve(canvas.toDataURL('image/png'));
};
img.onerror = () => {
reject(
new Error('Failed to load SVG image. Ensure the SVG data is valid.')
);
};
// Load SVG string into an Image element
img.src = base64UrlSvg;
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
const errorMessage = `Error converting SVG to PNG: ${message}`;
toast.error(errorMessage);
reject(new Error(errorMessage));
}
});
}
@@ -0,0 +1,34 @@
import React, { useEffect } from 'react';
import { throttle } from '../utils/misc';
export const scrollToBottom = (requiresNearBottom: boolean, delay?: number) => {
const mainScrollElem = document.getElementById('main-scroll');
if (!mainScrollElem) return;
const spaceToBottom =
mainScrollElem.scrollHeight -
mainScrollElem.scrollTop -
mainScrollElem.clientHeight;
if (!requiresNearBottom || spaceToBottom < 100) {
setTimeout(
() => mainScrollElem.scrollTo({ top: mainScrollElem.scrollHeight }),
delay ?? 80
);
}
};
const scrollToBottomThrottled = throttle(scrollToBottom, 80);
export function useChatScroll(msgListRef: React.RefObject<HTMLDivElement>) {
useEffect(() => {
if (!msgListRef.current) return;
const resizeObserver = new ResizeObserver((_) => {
scrollToBottomThrottled(true, 10);
});
resizeObserver.observe(msgListRef.current);
return () => {
resizeObserver.disconnect();
};
}, [msgListRef]);
}
@@ -1,35 +1,39 @@
import { useEffect, useRef, useState, useCallback } from 'react';
import { throttle } from '../utils/misc';
// Media Query for detecting "large" screens (matching Tailwind's lg: breakpoint)
const LARGE_SCREEN_MQ = '(min-width: 1024px)';
// Calculates and sets the textarea height based on its scrollHeight
const adjustTextareaHeight = (textarea: HTMLTextAreaElement | null) => {
if (!textarea) return;
const adjustTextareaHeight = throttle(
(textarea: HTMLTextAreaElement | null) => {
if (!textarea) return;
// Only perform auto-sizing on large screens
if (!window.matchMedia(LARGE_SCREEN_MQ).matches) {
// On small screens, reset inline height and max-height styles.
// This allows CSS (e.g., `rows` attribute or classes) to control the height,
// and enables manual resizing if `resize-vertical` is set.
textarea.style.height = ''; // Use 'auto' or '' to reset
textarea.style.maxHeight = '';
return; // Do not adjust height programmatically on small screens
}
// Only perform auto-sizing on large screens
if (!window.matchMedia(LARGE_SCREEN_MQ).matches) {
// On small screens, reset inline height and max-height styles.
// This allows CSS (e.g., `rows` attribute or classes) to control the height,
// and enables manual resizing if `resize-vertical` is set.
textarea.style.height = ''; // Use 'auto' or '' to reset
textarea.style.maxHeight = '';
return; // Do not adjust height programmatically on small screens
}
const computedStyle = window.getComputedStyle(textarea);
// Get the max-height specified by CSS (e.g., from `lg:max-h-48`)
const currentMaxHeight = computedStyle.maxHeight;
const computedStyle = window.getComputedStyle(textarea);
// Get the max-height specified by CSS (e.g., from `lg:max-h-48`)
const currentMaxHeight = computedStyle.maxHeight;
// Temporarily remove max-height to allow scrollHeight to be calculated correctly
textarea.style.maxHeight = 'none';
// Reset height to 'auto' to measure the actual scrollHeight needed
textarea.style.height = 'auto';
// Set the height to the calculated scrollHeight
textarea.style.height = `${textarea.scrollHeight}px`;
// Re-apply the original max-height from CSS to enforce the limit
textarea.style.maxHeight = currentMaxHeight;
};
// Temporarily remove max-height to allow scrollHeight to be calculated correctly
textarea.style.maxHeight = 'none';
// Reset height to 'auto' to measure the actual scrollHeight needed
textarea.style.height = 'auto';
// Set the height to the calculated scrollHeight
textarea.style.height = `${textarea.scrollHeight}px`;
// Re-apply the original max-height from CSS to enforce the limit
textarea.style.maxHeight = currentMaxHeight;
},
100
); // Throttle to prevent excessive calls
// Interface describing the API returned by the hook
export interface ChatTextareaApi {
@@ -65,6 +69,7 @@ export function useChatTextarea(initValue: string): ChatTextareaApi {
}
}, [textareaRef, savedInitValue]); // Depend on ref and savedInitValue
// On input change, we adjust the height of the textarea
const handleInput = useCallback(
(event: React.FormEvent<HTMLTextAreaElement>) => {
// Call adjustTextareaHeight on every input - it will decide whether to act
@@ -94,6 +99,6 @@ export function useChatTextarea(initValue: string): ChatTextareaApi {
},
ref: textareaRef,
refOnSubmit: onSubmitRef,
onInput: handleInput,
onInput: handleInput, // for adjusting height on input
};
}
+5 -2
View File
@@ -22,12 +22,15 @@ html {
all: revert;
}
pre {
@apply whitespace-pre-wrap rounded-lg p-2;
@apply whitespace-pre-wrap rounded-lg p-2 mb-3;
border: 1px solid currentColor;
}
p {
@apply mb-2;
}
hr {
@apply my-4 border-base-content/20 border-1;
}
/* TODO: fix markdown table */
}
@@ -35,7 +38,7 @@ html {
@apply md:opacity-0 md:group-hover:opacity-100;
}
.btn-mini {
@apply cursor-pointer hover:shadow-md;
@apply cursor-pointer;
}
.chat-screen {
max-width: 900px;
+25 -1
View File
@@ -3,6 +3,7 @@ import {
APIMessage,
CanvasData,
Conversation,
LlamaCppServerProps,
Message,
PendingMessage,
ViewingChat,
@@ -12,9 +13,11 @@ import {
filterThoughtFromMsgs,
normalizeMsgsForAPI,
getSSEStreamAsync,
getServerProps,
} from './misc';
import { BASE_URL, CONFIG_DEFAULT, isDev } from '../Config';
import { matchPath, useLocation, useNavigate } from 'react-router';
import toast from 'react-hot-toast';
interface AppContextValue {
// conversations and messages
@@ -46,6 +49,9 @@ interface AppContextValue {
saveConfig: (config: typeof CONFIG_DEFAULT) => void;
showSettings: boolean;
setShowSettings: (show: boolean) => void;
// props
serverProps: LlamaCppServerProps | null;
}
// this callback is used for scrolling to the bottom of the chat and switching to the last node
@@ -74,6 +80,9 @@ export const AppContextProvider = ({
const params = matchPath('/chat/:convId', pathname);
const convId = params?.params?.convId;
const [serverProps, setServerProps] = useState<LlamaCppServerProps | null>(
null
);
const [viewingChat, setViewingChat] = useState<ViewingChat | null>(null);
const [pendingMessages, setPendingMessages] = useState<
Record<Conversation['id'], PendingMessage>
@@ -85,6 +94,20 @@ export const AppContextProvider = ({
const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
const [showSettings, setShowSettings] = useState(false);
// get server props
useEffect(() => {
getServerProps(BASE_URL, config.apiKey)
.then((props) => {
console.debug('Server props:', props);
setServerProps(props);
})
.catch((err) => {
console.error(err);
toast.error('Failed to fetch server props');
});
// eslint-disable-next-line
}, []);
// handle change when the convId from URL is changed
useEffect(() => {
// also reset the canvas data
@@ -260,7 +283,7 @@ export const AppContextProvider = ({
} else {
console.error(err);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
alert((err as any)?.message ?? 'Unknown error');
toast.error((err as any)?.message ?? 'Unknown error');
throw err; // rethrow
}
}
@@ -377,6 +400,7 @@ export const AppContextProvider = ({
saveConfig,
showSettings,
setShowSettings,
serverProps,
}}
>
{children}
+29
View File
@@ -36,3 +36,32 @@ export const OpenInNewTab = ({
{children}
</a>
);
export function BtnWithTooltips({
className,
onClick,
onMouseLeave,
children,
tooltipsContent,
disabled,
}: {
className?: string;
onClick: () => void;
onMouseLeave?: () => void;
children: React.ReactNode;
tooltipsContent: string;
disabled?: boolean;
}) {
return (
<div className="tooltip tooltip-bottom" data-tip={tooltipsContent}>
<button
className={`${className ?? ''} flex items-center justify-center`}
onClick={onClick}
disabled={disabled}
onMouseLeave={onMouseLeave}
>
{children}
</button>
</div>
);
}
+16 -17
View File
@@ -1,6 +1,6 @@
import { useEffect, useState } from 'react';
import { MessageExtraContext } from './types';
import { useEffect } from 'react';
import { ChatTextareaApi } from '../components/useChatTextarea.ts';
import { ChatExtraContextApi } from '../components/useChatExtraContext.tsx';
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -15,11 +15,10 @@ interface SetTextEvData {
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
*/
export const useVSCodeContext = (textarea: ChatTextareaApi) => {
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
null
);
export const useVSCodeContext = (
textarea: ChatTextareaApi,
extraContext: ChatExtraContextApi
) => {
// Accept setText message from a parent window and set inputMsg and extraContext
useEffect(() => {
const handleMessage = (event: MessageEvent) => {
@@ -27,10 +26,14 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
const data: SetTextEvData = event.data;
textarea.setValue(data?.text);
if (data?.context && data.context.length > 0) {
setExtraContext({
type: 'context',
content: data.context,
});
extraContext.clearItems();
extraContext.addItems([
{
type: 'context',
name: 'Extra context',
content: data.context,
},
]);
}
textarea.focus();
setTimeout(() => {
@@ -41,7 +44,7 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
window.addEventListener('message', handleMessage);
return () => window.removeEventListener('message', handleMessage);
}, [textarea]);
}, [textarea, extraContext]);
// Add a keydown listener that sends the "escapePressed" message to the parent window
useEffect(() => {
@@ -55,9 +58,5 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
return () => window.removeEventListener('keydown', handleKeyDown);
}, []);
return {
extraContext,
// call once the user message is sent, to clear the extra context
clearExtraContext: () => setExtraContext(null),
};
return {};
};
+68 -7
View File
@@ -1,6 +1,11 @@
// @ts-expect-error this package does not have typing
import TextLineStream from 'textlinestream';
import { APIMessage, Message } from './types';
import {
APIMessage,
APIMessageContentPart,
LlamaCppServerProps,
Message,
} from './types';
// ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
@@ -57,19 +62,47 @@ export const copyStr = (textToCopy: string) => {
*/
export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
return messages.map((msg) => {
let newContent = '';
if (msg.role !== 'user' || !msg.extra) {
return {
role: msg.role,
content: msg.content,
} as APIMessage;
}
// extra content first, then user text message in the end
// this allow re-using the same cache prefix for long context
const contentArr: APIMessageContentPart[] = [];
for (const extra of msg.extra ?? []) {
if (extra.type === 'context') {
newContent += `${extra.content}\n\n`;
contentArr.push({
type: 'text',
text: extra.content,
});
} else if (extra.type === 'textFile') {
contentArr.push({
type: 'text',
text: `File: ${extra.name}\nContent:\n\n${extra.content}`,
});
} else if (extra.type === 'imageFile') {
contentArr.push({
type: 'image_url',
image_url: { url: extra.base64Url },
});
} else {
throw new Error('Unknown extra type');
}
}
newContent += msg.content;
// add user message to the end
contentArr.push({
type: 'text',
text: msg.content,
});
return {
role: msg.role,
content: newContent,
content: contentArr,
};
}) as APIMessage[];
}
@@ -78,13 +111,19 @@ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
* recommended for DeepsSeek-R1, filter out content between <think> and </think> tags
*/
export function filterThoughtFromMsgs(messages: APIMessage[]) {
console.debug({ messages });
return messages.map((msg) => {
if (msg.role !== 'assistant') {
return msg;
}
// assistant message is always a string
const contentStr = msg.content as string;
return {
role: msg.role,
content:
msg.role === 'assistant'
? msg.content.split('</think>').at(-1)!.trim()
: msg.content,
? contentStr.split('</think>').at(-1)!.trim()
: contentStr,
} as APIMessage;
});
}
@@ -126,3 +165,25 @@ export const cleanCurrentUrl = (removeQueryParams: string[]) => {
});
window.history.replaceState({}, '', url.toString());
};
export const getServerProps = async (
baseUrl: string,
apiKey?: string
): Promise<LlamaCppServerProps> => {
try {
const response = await fetch(`${baseUrl}/props`, {
headers: {
'Content-Type': 'application/json',
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {}),
},
});
if (!response.ok) {
throw new Error('Failed to fetch server props');
}
const data = await response.json();
return data as LlamaCppServerProps;
} catch (error) {
console.error('Error fetching server props:', error);
throw error;
}
};
+10
View File
@@ -116,6 +116,16 @@ const StorageUtils = {
});
return conv;
},
/**
* update the name of a conversation
*/
async updateConversationName(convId: string, name: string): Promise<void> {
await db.conversations.update(convId, {
name,
lastModified: Date.now(),
});
dispatchConversationChange(convId);
},
/**
* if convId does not exist, throw an error
*/
+36 -2
View File
@@ -48,7 +48,10 @@ export interface Message {
children: Message['id'][];
}
type MessageExtra = MessageExtraTextFile | MessageExtraContext; // TODO: will add more in the future
export type MessageExtra =
| MessageExtraTextFile
| MessageExtraImageFile
| MessageExtraContext;
export interface MessageExtraTextFile {
type: 'textFile';
@@ -56,12 +59,32 @@ export interface MessageExtraTextFile {
content: string;
}
export interface MessageExtraImageFile {
type: 'imageFile';
name: string;
base64Url: string;
}
export interface MessageExtraContext {
type: 'context';
name: string;
content: string;
}
export type APIMessage = Pick<Message, 'role' | 'content'>;
export type APIMessageContentPart =
| {
type: 'text';
text: string;
}
| {
type: 'image_url';
image_url: { url: string };
};
export type APIMessage = {
role: Message['role'];
content: string | APIMessageContentPart[];
};
export interface Conversation {
id: string; // format: `conv-{timestamp}`
@@ -89,3 +112,14 @@ export interface CanvasPyInterpreter {
}
export type CanvasData = CanvasPyInterpreter;
// a non-complete list of props, only contains the ones we need
export interface LlamaCppServerProps {
build_info: string;
model_path: string;
n_ctx: number;
modalities?: {
vision: boolean;
};
// TODO: support params
}
+1
View File
@@ -71,6 +71,7 @@ export default defineConfig({
server: {
proxy: {
'/v1': 'http://localhost:8080',
'/props': 'http://localhost:8080',
},
headers: {
'Cross-Origin-Embedder-Policy': 'require-corp',