ggml: support concat for scalar types at cuda backend (#24011)

* cuda: support concat for scalar types
* Update concat.cu
* fix metal ci issue
[SYCL] Fix CI build & release for SYCL backend (#24387)
2026-06-12 08:46:44 +02:00 · 2026-06-12 09:32:44 +03:00 · 2026-06-12 09:30:24 +03:00
6 changed files with 404 additions and 389 deletions
@@ -34,129 +34,108 @@ env:
  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
+  ubuntu-24-sycl:
+    strategy:
+      matrix:
+        build: [fp32, fp16]
+        include:
+          - build: fp32
+            fp16: OFF
+          - build: fp16
+            fp16: ON

-# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
-#       in order to enable it again, we have to provision dedicated runners  to run it
-#  ubuntu-24-sycl:
-#    strategy:
-#      matrix:
-#        build: [fp32]
-#        include:
-#          - build: fp32
-#            fp16: OFF
-#
-#    runs-on: ubuntu-24.04
-#
-#    env:
-#      ONEAPI_ROOT: /opt/intel/oneapi/
-#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-#      LEVEL_ZERO_VERSION: "1.28.2"
-#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-#
-#    continue-on-error: true
-#
-#    steps:
-#      - uses: actions/checkout@v6
-#
-#      - name: Use oneAPI Installation Cache
-#        uses: actions/cache@v5
-#        id: cache-sycl
-#        with:
-#          path: ${{ env.ONEAPI_ROOT }}
-#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-#
-#      - name: Download & Install oneAPI
-#        shell: bash
-#        if: steps.cache-sycl.outputs.cache-hit != 'true'
-#        run: |
-#          cd /tmp
-#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-#
-#      - name: Install Level Zero SDK
-#        shell: bash
-#        run: |
-#          cd /tmp
-#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-#
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: sycl-ubuntu-24-${{ matrix.build }}
-#          evict-old-files: 1d
-#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-#
-#      - name: Build
-#        id: cmake_build
-#        run: |
-#          source /opt/intel/oneapi/setvars.sh
-#          cmake -B build \
-#            -G "Ninja" \
-#            -DCMAKE_BUILD_TYPE=Release \
-#            -DGGML_SYCL=ON \
-#            -DCMAKE_C_COMPILER=icx \
-#            -DCMAKE_CXX_COMPILER=icpx \
-#            -DLLAMA_OPENSSL=OFF \
-#            -DGGML_NATIVE=OFF \
-#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-#          time cmake --build build --config Release -j $(nproc)
+    runs-on: ubuntu-24.04

-# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
-#       in order to enable it again, we have to provision dedicated runners  to run it
-#  windows-latest-sycl:
-#    runs-on: windows-2022
-#
-#    defaults:
-#      run:
-#        shell: bash
-#
-#    env:
-#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: Use oneAPI Installation Cache
-#        uses: actions/cache@v5
-#        id: cache-sycl
-#        with:
-#          path: ${{ env.ONEAPI_ROOT }}
-#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-#
-#      - name: Download & Install oneAPI
-#        shell: bash
-#        if: steps.cache-sycl.outputs.cache-hit != 'true'
-#        run: |
-#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-#
-#      - name: Install Level Zero SDK
-#        shell: pwsh
-#        run: |
-#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: sycl-windows-latest
-#          variant: ccache
-#          evict-old-files: 1d
-#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-#
-#      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-#
-#      - name: Build
-#        id: cmake_build
-#        run:  examples/sycl/win-build-sycl.bat
+    env:
+      ONEAPI_ROOT: /opt/intel/oneapi/
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+      LEVEL_ZERO_VERSION: "1.28.2"
+      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+
+    continue-on-error: true
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Download & Install oneAPI
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+
+      - name: Install Level Zero SDK
+        shell: bash
+        run: |
+          cd /tmp
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: sycl-ubuntu-24-${{ matrix.build }}
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DLLAMA_OPENSSL=OFF \
+            -DGGML_NATIVE=OFF \
+            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+          time cmake --build build --config Release -j $(nproc)
+
+  windows-latest-sycl:
+    runs-on: windows-2022
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Download & Install oneAPI
+        shell: bash
+        run: |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      - name: Install Level Zero SDK
+        shell: pwsh
+        run: |
+          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: sycl-windows-latest
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+
+      - name: Build
+        id: cmake_build
+        run:  examples/sycl/win-build-sycl.bat
@@ -754,210 +754,202 @@ jobs:
          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

-# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
-#       in order to enable it again, we have to provision dedicated runners  to run it
-#  windows-sycl:
-#
-#    runs-on: windows-2022
-#
-#    defaults:
-#      run:
-#        shell: bash
-#
-#    env:
-#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: Use oneAPI Installation Cache
-#        uses: actions/cache@v5
-#        id: cache-sycl
-#        with:
-#          path: ${{ env.ONEAPI_ROOT }}
-#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-#
-#      - name: Download & Install oneAPI
-#        shell: bash
-#        if: steps.cache-sycl.outputs.cache-hit != 'true'
-#        run: |
-#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-#
-#      - name: Install Level Zero SDK
-#        shell: pwsh
-#        run: |
-#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-#
-#      - name: Setup Node.js
-#        uses: actions/setup-node@v6
-#        with:
-#          node-version: "24"
-#          cache: "npm"
-#          cache-dependency-path: "tools/ui/package-lock.json"
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: release-windows-2022-x64-sycl
-#
-#      - name: Build
-#        id: cmake_build
-#        shell: cmd
-#        run: |
-#          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-#          cmake -G "Ninja" -B build ^
-#            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-#            -DCMAKE_BUILD_TYPE=Release ^
-#            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-#            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
-#            -DLLAMA_BUILD_BORINGSSL=ON
-#          cmake --build build --target ggml-sycl -j
-#
-#      - name: Build the release package
-#        id: pack_artifacts
-#        run: |
-#          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-#
-#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-#
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-#          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
-#          if [ -n "$ZE_LOADER_DLL" ]; then
-#            echo "Using Level Zero loader: $ZE_LOADER_DLL"
-#            cp "$ZE_LOADER_DLL" ./build/bin
-#          else
-#            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
-#          fi
-#
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
-#
-#          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
-#
-#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
-#          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
-#
-#          echo "cp oneAPI running time dll files to ./build/bin done"
-#          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
-#
-#      - name: Upload the release package
-#        uses: actions/upload-artifact@v6
-#        with:
-#          path: llama-bin-win-sycl-x64.zip
-#          name: llama-bin-win-sycl-x64.zip
+  windows-sycl:

-# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
-#       in order to enable it again, we have to provision dedicated runners  to run it
-#  ubuntu-24-sycl:
-#
-#    strategy:
-#      matrix:
-#        build: [fp32]
-#        include:
-#          - build: fp32
-#            fp16: OFF
-#
-#    runs-on: ubuntu-24.04
-#
-#    env:
-#      ONEAPI_ROOT: /opt/intel/oneapi/
-#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-#      LEVEL_ZERO_VERSION: "1.28.2"
-#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#        with:
-#          fetch-depth: 0
-#
-#      - name: Use oneAPI Installation Cache
-#        uses: actions/cache@v5
-#        id: cache-sycl
-#        with:
-#          path: ${{ env.ONEAPI_ROOT }}
-#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-#
-#      - name: Download & Install oneAPI
-#        shell: bash
-#        if: steps.cache-sycl.outputs.cache-hit != 'true'
-#        run: |
-#          cd /tmp
-#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-#
-#      - name: Install Level Zero SDK
-#        shell: bash
-#        run: |
-#          cd /tmp
-#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-#
-#      - name: Setup Node.js
-#        uses: actions/setup-node@v6
-#        with:
-#          node-version: "24"
-#          cache: "npm"
-#          cache-dependency-path: "tools/ui/package-lock.json"
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: release-ubuntu-24.04-sycl
-#
-#      - name: Build
-#        id: cmake_build
-#        run: |
-#          source /opt/intel/oneapi/setvars.sh
-#          cmake -B build \
-#            -G "Ninja" \
-#            -DCMAKE_BUILD_TYPE=Release \
-#            -DGGML_SYCL=ON \
-#            -DCMAKE_C_COMPILER=icx \
-#            -DCMAKE_CXX_COMPILER=icpx \
-#            -DLLAMA_OPENSSL=OFF \
-#            -DGGML_NATIVE=OFF \
-#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-#          time cmake --build build --config Release -j $(nproc)
-#
-#      - name: Determine tag name
-#        id: tag
-#        uses: ./.github/actions/get-tag-name
-#
-#      - name: Pack artifacts
-#        id: pack_artifacts
-#        run: |
-#          cp LICENSE ./build/bin/
-#          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-#
-#      - name: Upload artifacts
-#        uses: actions/upload-artifact@v6
-#        with:
-#          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-#          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+    runs-on: windows-2022
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Download & Install oneAPI
+        shell: bash
+        run: |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      - name: Install Level Zero SDK
+        shell: pwsh
+        run: |
+          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-windows-2022-x64-sycl
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        run: |
+          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+          cmake -G "Ninja" -B build ^
+            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
+            -DCMAKE_BUILD_TYPE=Release ^
+            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
+            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
+            -DLLAMA_BUILD_BORINGSSL=ON
+          cmake --build build --target ggml-sycl -j %NUMBER_OF_PROCESSORS%
+
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-windows-2022-x64-sycl
+
+      - name: Build the release package
+        id: pack_artifacts
+        run: |
+          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
+          if [ -n "$ZE_LOADER_DLL" ]; then
+            echo "Using Level Zero loader: $ZE_LOADER_DLL"
+            cp "$ZE_LOADER_DLL" ./build/bin
+          else
+            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
+          fi
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
+
+          echo "cp oneAPI running time dll files to ./build/bin done"
+          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
+
+      - name: Upload the release package
+        uses: actions/upload-artifact@v6
+        with:
+          path: llama-bin-win-sycl-x64.zip
+          name: llama-bin-win-sycl-x64.zip
+
+  ubuntu-24-sycl:
+
+    strategy:
+      matrix:
+        build: [fp32, fp16]
+        include:
+          - build: fp32
+            fp16: OFF
+          - build: fp16
+            fp16: ON
+
+    runs-on: ubuntu-24.04
+
+    env:
+      ONEAPI_ROOT: /opt/intel/oneapi/
+      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+      LEVEL_ZERO_VERSION: "1.28.2"
+      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Download & Install oneAPI
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+
+      - name: Install Level Zero SDK
+        shell: bash
+        run: |
+          cd /tmp
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-ubuntu-24.04-sycl-${{ matrix.build }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DLLAMA_OPENSSL=OFF \
+            -DGGML_NATIVE=OFF \
+            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: ccache-clear
+        uses: ./.github/actions/ccache-clear
+        with:
+          key: release-ubuntu-24.04-sycl-${{ matrix.build }}
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v6
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
+          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz

  ubuntu-22-rocm:
    needs: [check-release]
@@ -1,16 +1,18 @@
 #include "concat.cuh"

+#include <stdint.h>
+
 // contiguous kernels
-template <int dim>
-static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
-                                                                                 const float * y,
-                                                                                 float *       dst,
-                                                                                 int64_t       ne00,
-                                                                                 int64_t       ne01,
-                                                                                 int64_t       ne02,
-                                                                                 int64_t       ne0,
-                                                                                 int64_t       ne1,
-                                                                                 int64_t       ne2) {
+template <typename T, int dim>
+static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_cont(const T * x,
+                                                                             const T * y,
+                                                                             T *       dst,
+                                                                             int64_t   ne00,
+                                                                             int64_t   ne01,
+                                                                             int64_t   ne02,
+                                                                             int64_t   ne0,
+                                                                             int64_t   ne1,
+                                                                             int64_t   ne2) {
    static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");

    const int64_t n = ne0 * ne1 * ne2;
@@ -50,37 +52,37 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont
    }
 }

-static void concat_f32_cuda(const float * x,
-                            const float * y,
-                            float *       dst,
-                            int64_t       ne00,
-                            int64_t       ne01,
-                            int64_t       ne02,
-                            int64_t       ne0,
-                            int64_t       ne1,
-                            int64_t       ne2,
-                            int           dim,
-                            cudaStream_t  stream) {
+template <typename T>
+static void concat_cont_cuda(const T * x,
+                             const T * y,
+                             T *       dst,
+                             int64_t   ne00,
+                             int64_t   ne01,
+                             int64_t   ne02,
+                             int64_t   ne0,
+                             int64_t   ne1,
+                             int64_t   ne2,
+                             int       dim,
+                             cudaStream_t stream) {
    const int64_t n          = ne0 * ne1 * ne2;
    const int     num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;

    if (dim == 0) {
        const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream);
-        ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
+        ggml_cuda_kernel_launch(concat_cont<T, 0>, launch_params, x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
        return;
    }
    if (dim == 1) {
-        concat_f32_cont<1>
-            <<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
+        concat_cont<T, 1><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
        return;
    }
-    concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
+    concat_cont<T, 2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
 }

 // non-contiguous kernel (slow)
-template <int dim>
+template <typename T, int dim>
 static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
-    concat_f32_non_cont(
+    concat_non_cont(
        const char * src0,
        const char * src1,
              char * dst,
@@ -107,61 +109,49 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
          uint64_t   nb0,
          uint64_t   nb1,
          uint64_t   nb2,
-          uint64_t   nb3){
+          uint64_t   nb3) {
    static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]");

    const int64_t i3 = blockIdx.z;
    const int64_t i2 = blockIdx.y;
    const int64_t i1 = blockIdx.x;

-    const float * x;
+    const T * x;

    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-            x = (const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
+            x = (const T *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
        } else {
            if constexpr (dim == 0) {
-                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
+                x = (const T *)(src1 + i3*nb13 + i2*nb12 + i1*nb11 + (i0 - ne00)*nb10);
            } else if constexpr (dim == 1) {
-                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
+                x = (const T *)(src1 + i3*nb13 + i2*nb12 + (i1 - ne01)*nb11 + i0*nb10);
            } else if constexpr (dim == 2) {
-                x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
+                x = (const T *)(src1 + i3*nb13 + (i2 - ne02)*nb12 + i1*nb11 + i0*nb10);
            } else if constexpr (dim == 3) {
-                x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
+                x = (const T *)(src1 + (i3 - ne03)*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
            }
        }

-        float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+        T * y = (T *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

        *y = *x;
    }
 }

-
-void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-
-    cudaStream_t stream = ctx.stream();
-
-    const int32_t dim = ((int32_t *) dst->op_params)[0];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
-
+template <typename T>
+static void concat_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, int dim, cudaStream_t stream) {
    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-        const float * src0_d = (const float *)src0->data;
-        const float * src1_d = (const float *)src1->data;
-
-        float * dst_d = (float *)dst->data;
+        const T * src0_d = (const T *) src0->data;
+        const T * src1_d = (const T *) src1->data;
+        T *       dst_d  = (T *) dst->data;

        if (dim != 3) {
-            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-                concat_f32_cuda(
-                        src0_d + i3 * (src0->nb[3] / 4),
-                        src1_d + i3 * (src1->nb[3] / 4),
-                        dst_d + i3 * ( dst->nb[3] / 4),
+            for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) {
+                concat_cont_cuda(
+                        src0_d + i3*(src0->nb[3] / sizeof(T)),
+                        src1_d + i3*(src1->nb[3] / sizeof(T)),
+                        dst_d  + i3*( dst->nb[3] / sizeof(T)),
                        src0->ne[0], src0->ne[1], src0->ne[2],
                        dst->ne[0],  dst->ne[1],  dst->ne[2], dim, stream);
            }
@@ -169,13 +159,13 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
            const size_t size0 = ggml_nbytes(src0);
            const size_t size1 = ggml_nbytes(src1);

-            CUDA_CHECK(cudaMemcpyAsync(dst_d,           src0_d, size0, cudaMemcpyDeviceToDevice, stream));
-            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaMemcpyAsync((char *) dst->data,         src0->data, size0, cudaMemcpyDeviceToDevice, stream));
+            CUDA_CHECK(cudaMemcpyAsync((char *) dst->data + size0, src1->data, size1, cudaMemcpyDeviceToDevice, stream));
        }
    } else {
        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
        auto launch_kernel = [&](auto dim) {
-            concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+            concat_non_cont<T, dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
                (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
                src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
@@ -203,3 +193,35 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
        }
    }
 }
+
+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {  // CONCAT entry point: validates the operand types, then forwards dst->src[0], dst->src[1] and dst to concat_cuda<T>
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    cudaStream_t stream = ctx.stream();  // all work is issued on the context's stream
+
+    const int32_t dim = ((int32_t *) dst->op_params)[0];  // concat axis is read from the first int32 of op_params
+
+    GGML_ASSERT(src0->type == src1->type);  // both inputs must share one type ...
+    GGML_ASSERT(dst->type  == src0->type);  // ... and the output must match it
+    GGML_ASSERT(!ggml_is_quantized(src0->type));  // quantized types are rejected
+    GGML_ASSERT(ggml_blck_size(src0->type) == 1);  // only types with block size 1 are accepted
+
+    switch (ggml_type_size(src0->type)) {  // dispatch on the type's byte size, not on the ggml type: each size maps to an unsigned integer of the same width
+        case 1:
+            concat_cuda<uint8_t>(src0, src1, dst, dim, stream);
+            break;
+        case 2:
+            concat_cuda<uint16_t>(src0, src1, dst, dim, stream);
+            break;
+        case 4:
+            concat_cuda<uint32_t>(src0, src1, dst, dim, stream);
+            break;
+        case 8:
+            concat_cuda<uint64_t>(src0, src1, dst, dim, stream);
+            break;
+        default:  // any size other than 1, 2, 4 or 8 bytes is a hard error
+            GGML_ABORT("Unsupported type size: %zu", ggml_type_size(src0->type));
+            break;  // presumably never reached (GGML_ABORT is expected not to return) - TODO confirm
+    }
+}
@@ -5345,7 +5345,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_CONCAT:
            {
                ggml_type src0_type = op->src[0]->type;
-                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+                ggml_type src1_type = op->src[1]->type;
+                return src0_type == src1_type &&
+                       src0_type == op->type &&
+                       !ggml_is_quantized(src0_type) &&
+                       ggml_blck_size(src0_type) == 1 &&
+                       (ggml_type_size(src0_type) == 1 ||
+                        ggml_type_size(src0_type) == 2 ||
+                        ggml_type_size(src0_type) == 4 ||
+                        ggml_type_size(src0_type) == 8);
            } break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            {
@@ -1120,8 +1120,17 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
        case GGML_OP_VIEW:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_PERMUTE:
-        case GGML_OP_CONCAT:
            return true;
+        case GGML_OP_CONCAT:
+            {
+                // kernel_concat copies one float-sized value per element.
+                // Other scalar types need a type-generic copy kernel first.
+                const enum ggml_type src0_type = op->src[0]->type;
+                const enum ggml_type src1_type = op->src[1]->type;
+                return src0_type == src1_type &&
+                       src0_type == op->type &&
+                       (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_I32);
+            }
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
@@ -8849,7 +8849,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    for (int v : { 0, 1, 2, 3 }) {
        for (int dim : { 0, 1, 2, 3, }) {
            test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_F16, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_BF16, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_I8, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_I16, {11, 12, 13, 14}, 7, dim, v));
            test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
+            test_cases.emplace_back(new test_concat(GGML_TYPE_I64, {11, 12, 13, 14}, 7, dim, v));
        }
    }