Compare commits

...

2 Commits

Author SHA1 Message Date
ZihaoMu 85f99dca8b ggml: support concat for scalar types at cuda backend (#24011)
* cuda: support concat for scalar types

* Update concat.cu

* fix metal ci issue
2026-06-12 09:32:44 +03:00
Neo Zhang 099ea76fb4 [SYCL] Fix CI build & release for SYCL backend (#24387)
* restore SYCL build and release, remove github cache

* modify for test only

* verify the ccache is used

* remove debug code change

* rm duplicate action, update key in ccache

* add action ccache-clear after building in both ubuntu and windows

* set %NUMBER_OF_PROCESSORS% in widnows build
2026-06-12 09:30:24 +03:00
6 changed files with 404 additions and 389 deletions
+103 -124
View File
@@ -34,129 +34,108 @@ env:
LLAMA_ARG_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
# in order to enable it again, we have to provision dedicated runners to run it
# ubuntu-24-sycl:
# strategy:
# matrix:
# build: [fp32]
# include:
# - build: fp32
# fp16: OFF
#
# runs-on: ubuntu-24.04
#
# env:
# ONEAPI_ROOT: /opt/intel/oneapi/
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
# LEVEL_ZERO_VERSION: "1.28.2"
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
#
# continue-on-error: true
#
# steps:
# - uses: actions/checkout@v6
#
# - name: Use oneAPI Installation Cache
# uses: actions/cache@v5
# id: cache-sycl
# with:
# path: ${{ env.ONEAPI_ROOT }}
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
#
# - name: Download & Install oneAPI
# shell: bash
# if: steps.cache-sycl.outputs.cache-hit != 'true'
# run: |
# cd /tmp
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
#
# - name: Install Level Zero SDK
# shell: bash
# run: |
# cd /tmp
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
#
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: sycl-ubuntu-24-${{ matrix.build }}
# evict-old-files: 1d
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
#
# - name: Build
# id: cmake_build
# run: |
# source /opt/intel/oneapi/setvars.sh
# cmake -B build \
# -G "Ninja" \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_SYCL=ON \
# -DCMAKE_C_COMPILER=icx \
# -DCMAKE_CXX_COMPILER=icpx \
# -DLLAMA_OPENSSL=OFF \
# -DGGML_NATIVE=OFF \
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
# time cmake --build build --config Release -j $(nproc)
runs-on: ubuntu-24.04
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
# in order to enable it again, we have to provision dedicated runners to run it
# windows-latest-sycl:
# runs-on: windows-2022
#
# defaults:
# run:
# shell: bash
#
# env:
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Use oneAPI Installation Cache
# uses: actions/cache@v5
# id: cache-sycl
# with:
# path: ${{ env.ONEAPI_ROOT }}
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
#
# - name: Download & Install oneAPI
# shell: bash
# if: steps.cache-sycl.outputs.cache-hit != 'true'
# run: |
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
#
# - name: Install Level Zero SDK
# shell: pwsh
# run: |
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
#
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: sycl-windows-latest
# variant: ccache
# evict-old-files: 1d
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
#
# # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
#
# - name: Build
# id: cmake_build
# run: examples/sycl/win-build-sycl.bat
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
LEVEL_ZERO_VERSION: "1.28.2"
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
continue-on-error: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Download & Install oneAPI
shell: bash
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: Install Level Zero SDK
shell: bash
run: |
cd /tmp
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: sycl-ubuntu-24-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Download & Install oneAPI
shell: bash
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Install Level Zero SDK
shell: pwsh
run: |
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: sycl-windows-latest
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
+195 -203
View File
@@ -754,210 +754,202 @@ jobs:
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
# in order to enable it again, we have to provision dedicated runners to run it
# windows-sycl:
#
# runs-on: windows-2022
#
# defaults:
# run:
# shell: bash
#
# env:
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
#
# - name: Use oneAPI Installation Cache
# uses: actions/cache@v5
# id: cache-sycl
# with:
# path: ${{ env.ONEAPI_ROOT }}
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
#
# - name: Download & Install oneAPI
# shell: bash
# if: steps.cache-sycl.outputs.cache-hit != 'true'
# run: |
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
#
# - name: Install Level Zero SDK
# shell: pwsh
# run: |
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
#
# - name: Setup Node.js
# uses: actions/setup-node@v6
# with:
# node-version: "24"
# cache: "npm"
# cache-dependency-path: "tools/ui/package-lock.json"
#
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: release-windows-2022-x64-sycl
#
# - name: Build
# id: cmake_build
# shell: cmd
# run: |
# call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
# cmake -G "Ninja" -B build ^
# -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
# -DCMAKE_BUILD_TYPE=Release ^
# -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
# -DGGML_CPU=OFF -DGGML_SYCL=ON ^
# -DLLAMA_BUILD_BORINGSSL=ON
# cmake --build build --target ggml-sycl -j
#
# - name: Build the release package
# id: pack_artifacts
# run: |
# echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
#
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
#
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
# ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
# if [ -n "$ZE_LOADER_DLL" ]; then
# echo "Using Level Zero loader: $ZE_LOADER_DLL"
# cp "$ZE_LOADER_DLL" ./build/bin
# else
# echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
# fi
#
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
#
# cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
#
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
# cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
#
# echo "cp oneAPI running time dll files to ./build/bin done"
# 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
#
# - name: Upload the release package
# uses: actions/upload-artifact@v6
# with:
# path: llama-bin-win-sycl-x64.zip
# name: llama-bin-win-sycl-x64.zip
windows-sycl:
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
# in order to enable it again, we have to provision dedicated runners to run it
# ubuntu-24-sycl:
#
# strategy:
# matrix:
# build: [fp32]
# include:
# - build: fp32
# fp16: OFF
#
# runs-on: ubuntu-24.04
#
# env:
# ONEAPI_ROOT: /opt/intel/oneapi/
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
# LEVEL_ZERO_VERSION: "1.28.2"
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
# with:
# fetch-depth: 0
#
# - name: Use oneAPI Installation Cache
# uses: actions/cache@v5
# id: cache-sycl
# with:
# path: ${{ env.ONEAPI_ROOT }}
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
#
# - name: Download & Install oneAPI
# shell: bash
# if: steps.cache-sycl.outputs.cache-hit != 'true'
# run: |
# cd /tmp
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
#
# - name: Install Level Zero SDK
# shell: bash
# run: |
# cd /tmp
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
#
# - name: Setup Node.js
# uses: actions/setup-node@v6
# with:
# node-version: "24"
# cache: "npm"
# cache-dependency-path: "tools/ui/package-lock.json"
#
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: release-ubuntu-24.04-sycl
#
# - name: Build
# id: cmake_build
# run: |
# source /opt/intel/oneapi/setvars.sh
# cmake -B build \
# -G "Ninja" \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_SYCL=ON \
# -DCMAKE_C_COMPILER=icx \
# -DCMAKE_CXX_COMPILER=icpx \
# -DLLAMA_OPENSSL=OFF \
# -DGGML_NATIVE=OFF \
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
# time cmake --build build --config Release -j $(nproc)
#
# - name: Determine tag name
# id: tag
# uses: ./.github/actions/get-tag-name
#
# - name: Pack artifacts
# id: pack_artifacts
# run: |
# cp LICENSE ./build/bin/
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
#
# - name: Upload artifacts
# uses: actions/upload-artifact@v6
# with:
# path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Download & Install oneAPI
shell: bash
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Install Level Zero SDK
shell: pwsh
run: |
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: "24"
cache: "npm"
cache-dependency-path: "tools/ui/package-lock.json"
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: release-windows-2022-x64-sycl
- name: Build
id: cmake_build
shell: cmd
run: |
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
cmake -G "Ninja" -B build ^
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-sycl -j %NUMBER_OF_PROCESSORS%
- name: ccache-clear
uses: ./.github/actions/ccache-clear
with:
key: release-windows-2022-x64-sycl
- name: Build the release package
id: pack_artifacts
run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
if [ -n "$ZE_LOADER_DLL" ]; then
echo "Using Level Zero loader: $ZE_LOADER_DLL"
cp "$ZE_LOADER_DLL" ./build/bin
else
echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
fi
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
uses: actions/upload-artifact@v6
with:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
LEVEL_ZERO_VERSION: "1.28.2"
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Download & Install oneAPI
shell: bash
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: Install Level Zero SDK
shell: bash
run: |
cd /tmp
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: "24"
cache: "npm"
cache-dependency-path: "tools/ui/package-lock.json"
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: release-ubuntu-24.04-sycl-${{ matrix.build }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
- name: ccache-clear
uses: ./.github/actions/ccache-clear
with:
key: release-ubuntu-24.04-sycl-${{ matrix.build }}
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
ubuntu-22-rocm:
needs: [check-release]
+82 -60
View File
@@ -1,16 +1,18 @@
#include "concat.cuh"
#include <stdint.h>
// contiguous kernels
template <int dim>
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
const float * y,
float * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
template <typename T, int dim>
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_cont(const T * x,
const T * y,
T * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");
const int64_t n = ne0 * ne1 * ne2;
@@ -50,37 +52,37 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont
}
}
static void concat_f32_cuda(const float * x,
const float * y,
float * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int dim,
cudaStream_t stream) {
template <typename T>
static void concat_cont_cuda(const T * x,
const T * y,
T * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int dim,
cudaStream_t stream) {
const int64_t n = ne0 * ne1 * ne2;
const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
if (dim == 0) {
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream);
ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
ggml_cuda_kernel_launch(concat_cont<T, 0>, launch_params, x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
return;
}
if (dim == 1) {
concat_f32_cont<1>
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
concat_cont<T, 1><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
return;
}
concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
concat_cont<T, 2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
}
// non-contiguous kernel (slow)
template <int dim>
template <typename T, int dim>
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
concat_f32_non_cont(
concat_non_cont(
const char * src0,
const char * src1,
char * dst,
@@ -107,61 +109,49 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb0,
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
uint64_t nb3) {
static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
const int64_t i1 = blockIdx.x;
const float * x;
const T * x;
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
x = (const T *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
} else {
if constexpr (dim == 0) {
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
x = (const T *)(src1 + i3*nb13 + i2*nb12 + i1*nb11 + (i0 - ne00)*nb10);
} else if constexpr (dim == 1) {
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
x = (const T *)(src1 + i3*nb13 + i2*nb12 + (i1 - ne01)*nb11 + i0*nb10);
} else if constexpr (dim == 2) {
x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
x = (const T *)(src1 + i3*nb13 + (i2 - ne02)*nb12 + i1*nb11 + i0*nb10);
} else if constexpr (dim == 3) {
x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
x = (const T *)(src1 + (i3 - ne03)*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
}
}
float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
T * y = (T *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*y = *x;
}
}
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
cudaStream_t stream = ctx.stream();
const int32_t dim = ((int32_t *) dst->op_params)[0];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
template <typename T>
static void concat_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, int dim, cudaStream_t stream) {
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;
const T * src0_d = (const T *) src0->data;
const T * src1_d = (const T *) src1->data;
T * dst_d = (T *) dst->data;
if (dim != 3) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_cuda(
src0_d + i3 * (src0->nb[3] / 4),
src1_d + i3 * (src1->nb[3] / 4),
dst_d + i3 * ( dst->nb[3] / 4),
for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) {
concat_cont_cuda(
src0_d + i3*(src0->nb[3] / sizeof(T)),
src1_d + i3*(src1->nb[3] / sizeof(T)),
dst_d + i3*( dst->nb[3] / sizeof(T)),
src0->ne[0], src0->ne[1], src0->ne[2],
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
}
@@ -169,13 +159,13 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync((char *) dst->data, src0->data, size0, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync((char *) dst->data + size0, src1->data, size1, cudaMemcpyDeviceToDevice, stream));
}
} else {
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
auto launch_kernel = [&](auto dim) {
concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
concat_non_cont<T, dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
(const char *) src0->data, (const char *) src1->data, (char *) dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
@@ -203,3 +193,35 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
}
}
}
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
cudaStream_t stream = ctx.stream();
const int32_t dim = ((int32_t *) dst->op_params)[0];
GGML_ASSERT(src0->type == src1->type);
GGML_ASSERT(dst->type == src0->type);
GGML_ASSERT(!ggml_is_quantized(src0->type));
GGML_ASSERT(ggml_blck_size(src0->type) == 1);
switch (ggml_type_size(src0->type)) {
case 1:
concat_cuda<uint8_t>(src0, src1, dst, dim, stream);
break;
case 2:
concat_cuda<uint16_t>(src0, src1, dst, dim, stream);
break;
case 4:
concat_cuda<uint32_t>(src0, src1, dst, dim, stream);
break;
case 8:
concat_cuda<uint64_t>(src0, src1, dst, dim, stream);
break;
default:
GGML_ABORT("Unsupported type size: %zu", ggml_type_size(src0->type));
break;
}
}
+9 -1
View File
@@ -5345,7 +5345,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
ggml_type src1_type = op->src[1]->type;
return src0_type == src1_type &&
src0_type == op->type &&
!ggml_is_quantized(src0_type) &&
ggml_blck_size(src0_type) == 1 &&
(ggml_type_size(src0_type) == 1 ||
ggml_type_size(src0_type) == 2 ||
ggml_type_size(src0_type) == 4 ||
ggml_type_size(src0_type) == 8);
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
+10 -1
View File
@@ -1120,8 +1120,17 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_VIEW:
case GGML_OP_TRANSPOSE:
case GGML_OP_PERMUTE:
case GGML_OP_CONCAT:
return true;
case GGML_OP_CONCAT:
{
// kernel_concat copies one float-sized value per element.
// Other scalar types need a type-generic copy kernel first.
const enum ggml_type src0_type = op->src[0]->type;
const enum ggml_type src1_type = op->src[1]->type;
return src0_type == src1_type &&
src0_type == op->type &&
(src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_I32);
}
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL:
+5
View File
@@ -8849,7 +8849,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (int v : { 0, 1, 2, 3 }) {
for (int dim : { 0, 1, 2, 3, }) {
test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
test_cases.emplace_back(new test_concat(GGML_TYPE_F16, {11, 12, 13, 14}, 7, dim, v));
test_cases.emplace_back(new test_concat(GGML_TYPE_BF16, {11, 12, 13, 14}, 7, dim, v));
test_cases.emplace_back(new test_concat(GGML_TYPE_I8, {11, 12, 13, 14}, 7, dim, v));
test_cases.emplace_back(new test_concat(GGML_TYPE_I16, {11, 12, 13, 14}, 7, dim, v));
test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
test_cases.emplace_back(new test_concat(GGML_TYPE_I64, {11, 12, 13, 14}, 7, dim, v));
}
}