mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-17 02:57:39 +02:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0f648573dd | |||
| b764b8f1d0 | |||
| 9241c3a2ac | |||
| b2b2bf988c | |||
| af4980bfed |
@@ -143,6 +143,47 @@ jobs:
|
||||
cd build
|
||||
ctest -L main --verbose
|
||||
|
||||
ubuntu-22-cmake-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||
|
||||
- name: install oneAPI dpcpp compiler
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt install intel-oneapi-mkl-devel
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||
# how to debug it.
|
||||
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
||||
|
||||
+41
-5
@@ -1,5 +1,6 @@
|
||||
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||
project("llama.cpp" C CXX)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
@@ -103,6 +104,8 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
|
||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
@@ -121,8 +124,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
if (LLAMA_SYCL)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
else()
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||
@@ -454,6 +461,32 @@ if (LLAMA_HIPBLAS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
if (LLAMA_SYCL)
|
||||
if ( NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
|
||||
endif()
|
||||
#todo: AOT
|
||||
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
if (LLAMA_SYCL_F16)
|
||||
add_compile_definitions(GGML_SYCL_F16)
|
||||
endif()
|
||||
add_compile_definitions(GGML_USE_SYCL)
|
||||
|
||||
add_compile_options(-I./) #include DPCT
|
||||
add_compile_options(-I/${SYCL_INCLUDE_DIR})
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
|
||||
|
||||
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
|
||||
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
endif()
|
||||
|
||||
function(get_flags CCID CCVER)
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
@@ -479,10 +512,12 @@ function(get_flags CCID CCVER)
|
||||
list(APPEND CXX_FLAGS -Wextra-semi)
|
||||
endif()
|
||||
elseif (CCID MATCHES "Intel")
|
||||
# enable max optimization level when using Intel compiler
|
||||
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
|
||||
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
|
||||
add_link_options(-fuse-ld=lld -static-intel)
|
||||
if (NOT LLAMA_SYCL)
|
||||
# enable max optimization level when using Intel compiler
|
||||
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
|
||||
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
|
||||
add_link_options(-fuse-ld=lld -static-intel)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
|
||||
@@ -799,6 +834,7 @@ add_library(ggml OBJECT
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||
|
||||
@@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
|
||||
- AVX, AVX2 and AVX512 support for x86 architectures
|
||||
- Mixed F16 / F32 precision
|
||||
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
|
||||
- CUDA, Metal and OpenCL GPU backend support
|
||||
- CUDA, Metal, OpenCL, SYCL GPU backend support
|
||||
|
||||
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
|
||||
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
|
||||
@@ -122,7 +122,8 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
||||
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
||||
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
|
||||
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
|
||||
@@ -598,6 +599,15 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
You can get a list of platforms and devices from the `clinfo -l` command, etc.
|
||||
|
||||
- #### SYCL
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
||||
|
||||
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
|
||||
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
|
||||
|
||||
|
||||
### Prepare Data & Run
|
||||
|
||||
```bash
|
||||
|
||||
+252
@@ -0,0 +1,252 @@
|
||||
# llama.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
|
||||
|
||||
To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
|
||||
|
||||
The llama.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute llama.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_llama2.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|
||||
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
||||
@@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
# with CUDA support
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
# with SYCL support
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
```
|
||||
|
||||
@@ -10,6 +10,9 @@
|
||||
# # with CUDA support
|
||||
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with SYCL support
|
||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
@@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||
if [ -z ${ONEAPI_ROOT} ]; then
|
||||
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
|
||||
fi
|
||||
## helpers
|
||||
|
||||
# download a file if it does not exist or if it is outdated
|
||||
|
||||
+15
-11
@@ -42,6 +42,10 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
|
||||
#define GGML_USE_CUBLAS_SYCL
|
||||
#endif
|
||||
|
||||
int32_t get_num_physical_cores() {
|
||||
#ifdef __linux__
|
||||
// enumerate the set of thread siblings, num entries is num cores
|
||||
@@ -599,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
} else if (arg == "--split-mode" || arg == "-sm") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -618,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
|
||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -643,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
params.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--numa") {
|
||||
@@ -1007,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
#endif
|
||||
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||
printf(" -gan N, --grp-attn-n N\n");
|
||||
@@ -1514,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
|
||||
@@ -23,6 +23,9 @@ else()
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(llava)
|
||||
if (LLAMA_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(tokenize)
|
||||
add_subdirectory(parallel)
|
||||
|
||||
@@ -2099,7 +2099,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
@@ -2125,7 +2125,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
|
||||
{
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||
params.mul_mat_q = false;
|
||||
#else
|
||||
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
|
||||
@@ -2138,7 +2138,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#else
|
||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
# MIT license
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
set(TARGET ls-sycl-device)
|
||||
add_executable(${TARGET} ls-sycl-device.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -0,0 +1,47 @@
|
||||
# llama.cpp/example/sycl
|
||||
|
||||
This example program provide the tools for llama.cpp for SYCL on Intel GPU.
|
||||
|
||||
## Tool
|
||||
|
||||
|Tool Name| Function|Status|
|
||||
|-|-|-|
|
||||
|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
|
||||
|
||||
### ls-sycl-device
|
||||
|
||||
List all SYCL devices with ID, compute capability, max work group size, ect.
|
||||
|
||||
1. Build the llama.cpp for SYCL for all targets.
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. Execute
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
Executable
+20
@@ -0,0 +1,20 @@
|
||||
|
||||
# MIT license
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
@@ -0,0 +1,11 @@
|
||||
/*MIT license
|
||||
Copyright (C) 2024 Intel Corporation
|
||||
SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "ggml-sycl.h"
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_backend_sycl_print_sycl_devices();
|
||||
return 0;
|
||||
}
|
||||
Executable
+19
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
|
||||
|
||||
# MIT license
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
export GGML_SYCL_DEVICE=$1
|
||||
else
|
||||
export GGML_SYCL_DEVICE=0
|
||||
fi
|
||||
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
|
||||
|
||||
Generated
+3
-3
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1705677747,
|
||||
"narHash": "sha256-eyM3okYtMgYDgmYukoUzrmuoY4xl4FUujnsv/P6I/zI=",
|
||||
"lastModified": 1706191920,
|
||||
"narHash": "sha256-eLihrZAPZX0R6RyM5fYAWeKVNuQPYjAkCUBr+JNvtdE=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "bbe7d8f876fbbe7c959c90ba2ae2852220573261",
|
||||
"rev": "ae5c332cbb5827f6b1f02572496b141021de335f",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -339,6 +339,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
||||
ggml_backend_cuda_reg_devices();
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
extern void ggml_backend_sycl_reg_devices(void);
|
||||
ggml_backend_sycl_reg_devices();
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
||||
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
|
||||
+15197
File diff suppressed because it is too large
Load Diff
+27
@@ -0,0 +1,27 @@
|
||||
/*MIT license
|
||||
Copyright (C) 2024 Intel Corporation
|
||||
SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_SYCL_MAX_DEVICES 16
|
||||
#define GGML_SYCL_NAME "SYCL"
|
||||
|
||||
GGML_API void ggml_init_sycl(void);
|
||||
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -248,6 +248,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||
#include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#include "ggml-sycl.h"
|
||||
#endif
|
||||
|
||||
// floating point type used to accumulate sums
|
||||
@@ -2293,6 +2295,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||
ggml_init_cublas();
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
ggml_cl_init();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
ggml_init_sycl();
|
||||
#endif
|
||||
|
||||
ggml_setup_op_has_task_pass();
|
||||
@@ -14701,6 +14705,12 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
|
||||
if (skip_cpu) {
|
||||
return;
|
||||
}
|
||||
#endif // GGML_USE_SYCL
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_DUP:
|
||||
{
|
||||
@@ -20280,7 +20290,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
||||
}
|
||||
|
||||
int ggml_cpu_has_blas(void) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
@@ -20303,8 +20313,16 @@ int ggml_cpu_has_clblast(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sycl(void) {
|
||||
#if defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_gpublas(void) {
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_sycl();
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sse3(void) {
|
||||
|
||||
@@ -2266,6 +2266,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_API int ggml_cpu_has_sycl (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
||||
//
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
# include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
# include "ggml-opencl.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
# include "ggml-sycl.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
@@ -52,6 +54,7 @@
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <cfloat>
|
||||
#include <cinttypes>
|
||||
#include <climits>
|
||||
#include <cmath>
|
||||
@@ -1277,6 +1280,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
||||
if (host_buffer) {
|
||||
buft = ggml_backend_cuda_host_buffer_type();
|
||||
}
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_host_buffer_type();
|
||||
#elif defined(GGML_USE_CPU_HBM)
|
||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||
#endif
|
||||
@@ -1296,6 +1301,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
||||
buft = ggml_backend_metal_buffer_type();
|
||||
#elif defined(GGML_USE_CUBLAS)
|
||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#endif
|
||||
@@ -8133,6 +8140,11 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
|
||||
}
|
||||
|
||||
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
|
||||
// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
|
||||
// if (k >= (int32_t)candidates->size) {
|
||||
// return;
|
||||
// }
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
k = std::max(k, (int) min_keep);
|
||||
@@ -8241,21 +8253,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
|
||||
return;
|
||||
}
|
||||
|
||||
llama_sample_softmax(ctx, candidates);
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
float scale = candidates->data[0].p; // scale by max prob
|
||||
size_t i = 1; // first token always matches
|
||||
bool min_p_applied = false;
|
||||
|
||||
for (; i < candidates->size; ++i) {
|
||||
if (candidates->data[i].p < p * scale && i >= min_keep) {
|
||||
break; // prob too small
|
||||
// if the candidates aren't sorted, try the unsorted implementation first
|
||||
if (!candidates->sorted) {
|
||||
std::vector<llama_token_data> filtered_tokens;
|
||||
|
||||
float max_logit = -FLT_MAX;
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
max_logit = std::max(max_logit, candidates->data[i].logit);
|
||||
}
|
||||
const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
if (candidates->data[i].logit >= min_logit) {
|
||||
filtered_tokens.push_back(candidates->data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// if we have enough values the operation was a success
|
||||
if (filtered_tokens.size() >= min_keep) {
|
||||
memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
|
||||
candidates->size = filtered_tokens.size();
|
||||
min_p_applied = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Resize the output vector to keep only the matching tokens
|
||||
candidates->size = i;
|
||||
// if the candidates are sorted or the unsorted implementation failed, use this implementation
|
||||
if (!min_p_applied) {
|
||||
// Sort the logits in descending order
|
||||
if (!candidates->sorted) {
|
||||
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
});
|
||||
candidates->sorted = true;
|
||||
}
|
||||
|
||||
const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
|
||||
size_t i = 1; // first token always matches
|
||||
|
||||
for (; i < candidates->size; ++i) {
|
||||
if (candidates->data[i].logit < min_logit && i >= min_keep) {
|
||||
break; // prob too small
|
||||
}
|
||||
}
|
||||
|
||||
// Resize the output vector to keep only the matching tokens
|
||||
candidates->size = i;
|
||||
}
|
||||
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
@@ -10184,6 +10231,16 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
#endif
|
||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||
if (ctx->backend_cpu == nullptr) {
|
||||
|
||||
@@ -6,6 +6,9 @@
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#include "ggml-sycl.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
|
||||
#else
|
||||
#define LLAMA_MAX_DEVICES 1
|
||||
#endif // GGML_USE_CUBLAS
|
||||
@@ -46,7 +49,7 @@
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 4
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_SYCL)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
|
||||
@@ -239,10 +239,17 @@ static std::string var_to_str(ggml_type type) {
|
||||
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
|
||||
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
static bool inline _isinf(float f) {
|
||||
return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
|
||||
}
|
||||
#else
|
||||
static bool inline _isinf(float f) { return std::isinf(f); }
|
||||
#endif
|
||||
|
||||
// accept FLT_MAX as infinity
|
||||
static bool isinf_or_max(float f) {
|
||||
return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX;
|
||||
return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
|
||||
}
|
||||
|
||||
static bool ggml_is_view_op(enum ggml_op op) {
|
||||
|
||||
+154
-15
@@ -5,11 +5,10 @@
|
||||
#undef NDEBUG
|
||||
#endif
|
||||
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static void dump(const llama_token_data_array * candidates) {
|
||||
for (size_t i = 0; i < candidates->size; i++) {
|
||||
@@ -20,11 +19,11 @@ static void dump(const llama_token_data_array * candidates) {
|
||||
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
|
||||
|
||||
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
|
||||
size_t n_vocab = probs.size();
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
@@ -41,11 +40,11 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
|
||||
}
|
||||
|
||||
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
size_t n_vocab = probs.size();
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
@@ -62,11 +61,11 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
|
||||
}
|
||||
|
||||
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
|
||||
size_t n_vocab = probs.size();
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
@@ -81,12 +80,33 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
|
||||
}
|
||||
}
|
||||
|
||||
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
size_t n_vocab = probs.size();
|
||||
static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_min_p(nullptr, &candidates_p, p, 1);
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
|
||||
GGML_ASSERT(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
@@ -107,11 +127,11 @@ static void test_repetition_penalties(
|
||||
) {
|
||||
GGML_ASSERT(probs.size() == expected_probs.size());
|
||||
|
||||
size_t n_vocab = probs.size();
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
@@ -128,6 +148,88 @@ static void test_repetition_penalties(
|
||||
}
|
||||
}
|
||||
|
||||
static void test_sampler_queue(
|
||||
const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p
|
||||
) {
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
const float logit = logf(token_id);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
llama_token min_token_id = 0;
|
||||
const llama_token max_token_id = n_vocab-1;
|
||||
|
||||
for (auto s : samplers_sequence) {
|
||||
switch (s){
|
||||
case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
|
||||
case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
|
||||
case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
|
||||
case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
|
||||
case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
|
||||
case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
|
||||
default : GGML_ASSERT(false && "Unknown sampler"); break;
|
||||
}
|
||||
|
||||
llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
|
||||
|
||||
const int size = candidates_p.size;
|
||||
|
||||
if (s == 'k') {
|
||||
const int expected_size = std::min(size, top_k);
|
||||
min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
|
||||
|
||||
GGML_ASSERT(size == expected_size);
|
||||
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
|
||||
} else if (s == 'p') {
|
||||
const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
|
||||
const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
|
||||
|
||||
min_token_id = n_vocab;
|
||||
int expected_size = 0;
|
||||
int cumsum = 0;
|
||||
do { // do-while because always at least one token is sampled
|
||||
min_token_id--;
|
||||
expected_size++;
|
||||
|
||||
cumsum += min_token_id;
|
||||
} while (cumsum < softmax_numerator_target);
|
||||
|
||||
// token 0 has p == 0, need special consideration for cumsum because top_p immediately returns
|
||||
if (min_token_id == 1) {
|
||||
min_token_id--;
|
||||
expected_size += 1;
|
||||
}
|
||||
|
||||
GGML_ASSERT(size == expected_size);
|
||||
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
|
||||
} else if (s == 'm') {
|
||||
int expected_size = ceilf((1.0f-min_p) * n_vocab);
|
||||
expected_size = std::max(expected_size, 1);
|
||||
expected_size = std::min(expected_size, size);
|
||||
|
||||
min_token_id = floorf(min_p * n_vocab);
|
||||
min_token_id = std::max(min_token_id, 1);
|
||||
min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size));
|
||||
min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
|
||||
|
||||
GGML_ASSERT(size == expected_size);
|
||||
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
|
||||
samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
ggml_time_init();
|
||||
|
||||
@@ -139,6 +241,15 @@ int main(void) {
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
|
||||
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.26f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.49f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.51f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.74f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
|
||||
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
|
||||
@@ -154,6 +265,34 @@ int main(void) {
|
||||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
|
||||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
|
||||
|
||||
test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
|
||||
test_sampler_queue(10000, "k", 1, 1.0f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12);
|
||||
|
||||
test_sampler_queue(10000, "k", 100, 1.0000f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f);
|
||||
|
||||
test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "km", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f);
|
||||
test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f);
|
||||
|
||||
test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f);
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
return 0;
|
||||
|
||||
Reference in New Issue
Block a user