Compare commits

...

7 Commits

Author SHA1 Message Date
Georgi Gerganov 554c247caf ggml : remove OpenCL (#7735)
ggml-ci
2024-06-04 21:23:20 +03:00
Georgi Gerganov 0cd6bd3483 llama : remove beam search (#7736) 2024-06-04 21:23:05 +03:00
Georgi Gerganov 5ca0944a15 readme : remove obsolete Zig instructions (#7471) 2024-06-04 19:43:01 +03:00
slaren adc9ff3841 llama-bench : allow using a different printer for stderr with -oe (#7722)
compare-commits.sh : hide stdout, use -oe to print markdown
2024-06-04 14:32:42 +02:00
Daniele 987d743d6b Improve hipBLAS support in CMake (#7696)
* Improve hipBLAS support in CMake

This improves the detection of the correct CMAKE_PREFIX_PATH when using different distributions or a self-built ROCm SDK.

* Set ROCM_PATH correctly
2024-06-04 14:09:15 +02:00
zhouwg b226c1227b refine .gitignore (#7688)
This adds tags and android ndk into the git ignore list
2024-06-04 21:21:26 +10:00
jaime-m-p 3b38d48609 Per token attributes (#7685)
* Add per token attributes enum
* Using phi-3 for testing 'rstrip'
* Using jina-v2 for testing 'lstrip'
* Brute force test for 'lstrip' and 'rstrip'
* Implement 'rstrip' and 'lstrip'
* Update phi-3 GGUF file (obsolete since 917dc8c)
* Replace llama_token_type with llama_token_attribs
2024-06-04 09:17:17 +02:00
29 changed files with 297 additions and 3269 deletions
+2 -34
View File
@@ -688,8 +688,6 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.3.261.1
@@ -706,8 +704,6 @@ jobs:
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
@@ -732,27 +728,6 @@ jobs:
run: |
git submodule update --init kompute
- name: Download OpenCL SDK
id: get_opencl
if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@@ -786,13 +761,6 @@ jobs:
cmake -S . -B build ${{ matrix.defines }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add clblast.dll
id: add_clblast_dll
if: ${{ matrix.build == 'clblast-x64' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas-x64' }}
@@ -816,7 +784,7 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@@ -1071,7 +1039,7 @@ jobs:
# hypervisor: 'qemu'
# run: |
# sudo pkg update
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release:
+2
View File
@@ -34,9 +34,11 @@ ggml-metal-embed.metal
lcov-report/
gcovr-report/
tags
build*
!build.zig
cmake-build-*
android-ndk-*
out/
tmp/
+11 -23
View File
@@ -111,7 +111,6 @@ option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for Flas
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
@@ -502,22 +501,6 @@ if (LLAMA_RPC)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_HEADERS_OPENCL ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
add_compile_definitions(GGML_USE_CLBLAST)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
endif()
endif()
if (LLAMA_VULKAN)
find_package(Vulkan)
if (Vulkan_FOUND)
@@ -557,12 +540,17 @@ if (LLAMA_VULKAN)
endif()
if (LLAMA_HIPBLAS)
if ($ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
if (NOT EXISTS $ENV{ROCM_PATH})
if (NOT EXISTS /opt/rocm)
set(ROCM_PATH /usr)
else()
set(ROCM_PATH /opt/rocm)
endif()
else()
set(ROCM_PATH /opt/rocm)
set(ROCM_PATH $ENV{ROCM_PATH})
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
# CMake on Windows doesn't support the HIP language yet
if(WIN32)
@@ -1260,7 +1248,6 @@ add_library(ggml OBJECT
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
@@ -1348,8 +1335,9 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
"${GGML_HEADERS_CUDA}"
"${GGML_HEADERS_METAL}"
"${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
+1 -22
View File
@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@@ -547,23 +547,6 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h
$(NVCC_COMPILE)
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
MK_LDFLAGS += -lclblast -framework OpenCL
else
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
endif
OBJS += ggml-opencl.o
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST
ifdef LLAMA_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
@@ -914,10 +897,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+1 -1
View File
@@ -29,7 +29,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## News
+3 -124
View File
@@ -77,7 +77,7 @@ variety of hardware - locally and in the cloud.
- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan, SYCL, and (partial) OpenCL backend support
- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@@ -364,17 +364,6 @@ In order to build llama.cpp you have four different options.
cmake --build build --config Debug
```
- Using `Zig` (version 0.11 or later):
Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
it's also possible to cross compile for other operating systems and architectures:
```bash
zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
```
The `zig targets` command will give you valid options to use.
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -382,16 +371,11 @@ In order to build llama.cpp you have four different options.
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
opencl clblast openblas
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
**Notes:** With this packages you can build llama.cpp with OPENBLAS and
CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
the instructions for use and activate this options in this document below.
### Homebrew
On Mac and Linux, the homebrew package manager can be used via
@@ -410,7 +394,7 @@ argument.
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
- #### Accelerate Framework:
@@ -564,111 +548,6 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
- <details>
<summary>Installing the OpenCL SDK from source</summary>
```sh
git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
cd OpenCL-SDK
cmake -B build -DBUILD_DOCS=OFF \
-DBUILD_EXAMPLES=OFF \
-DBUILD_TESTING=OFF \
-DOPENCL_SDK_BUILD_SAMPLES=OFF \
-DOPENCL_SDK_TEST_SAMPLES=OFF
cmake --build build
cmake --install build --prefix /some/path
```
</details>
##### Installing CLBlast
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
Linux packaging:
Fedora Linux:
```bash
sudo dnf install clblast
```
Alternatively, they may be built from source.
- <details>
<summary>Windows:</summary>
```cmd
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/CLBlast
```
(note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
- <details>
<summary>Unix:</summary>
```sh
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake --build build --config Release
cmake --install build --prefix /some/path
```
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
</details>
##### Building Llama with CLBlast
- Build with make:
```sh
make LLAMA_CLBLAST=1
```
- CMake (Unix):
```sh
cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
cmake --build build --config Release
```
- CMake (Windows):
```cmd
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
##### Running Llama with CLBlast
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
The selection can be a number (starting from 0) or a text string to search:
```sh
GGML_OPENCL_PLATFORM=1 ./main ...
GGML_OPENCL_DEVICE=2 ./main ...
GGML_OPENCL_PLATFORM=Intel ./main ...
GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
```
The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
Using the variables it is possible to select a CPU-based driver as well, if so desired.
You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### Vulkan
**With docker**:
-1
View File
@@ -2844,7 +2844,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
-1
View File
@@ -15,7 +15,6 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
-5
View File
@@ -1,5 +0,0 @@
set(TARGET beam-search)
add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
-188
View File
@@ -1,188 +0,0 @@
#include "common.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%zu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;
//---------------------------------
// Print help :
//---------------------------------
if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
params.model = argv[1];
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
if ( argc > 3 )
{
params.prompt = argv[3];
}
if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
std::cout << llama_token_to_piece(ctx, id);
}
std::cout << std::flush;
int n_past = 0;
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();
beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_piece(ctx,token_id);
}
std::cout << std::endl;
llama_free( ctx );
llama_free_model( model );
llama_backend_free();
return 0;
}
+3 -6
View File
@@ -162,7 +162,7 @@ $ ./llama-bench -o csv
```
```csv
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
```
@@ -179,7 +179,6 @@ $ ./llama-bench -o json
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
@@ -210,7 +209,6 @@ $ ./llama-bench -o json
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
@@ -253,7 +251,6 @@ CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
opencl INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
@@ -279,6 +276,6 @@ CREATE TABLE IF NOT EXISTS test (
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
```
+95 -61
View File
@@ -140,10 +140,11 @@ static std::string get_gpu_info() {
}
// command line params
enum output_formats {CSV, JSON, MARKDOWN, SQL};
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) {
switch (format) {
case NONE: return "none";
case CSV: return "csv";
case JSON: return "json";
case MARKDOWN: return "md";
@@ -152,6 +153,23 @@ static const char * output_format_str(output_formats format) {
}
}
static bool output_format_from_str(const std::string & s, output_formats & format) {
if (s == "none") {
format = NONE;
} else if (s == "csv") {
format = CSV;
} else if (s == "json") {
format = JSON;
} else if (s == "md") {
format = MARKDOWN;
} else if (s == "sql") {
format = SQL;
} else {
return false;
}
return true;
}
static const char * split_mode_str(llama_split_mode mode) {
switch (mode) {
case LLAMA_SPLIT_MODE_NONE: return "none";
@@ -190,31 +208,33 @@ struct cmd_params {
int reps;
bool verbose;
output_formats output_format;
output_formats output_format_stderr;
};
static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_pg */ {},
/* n_batch */ {2048},
/* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* flash_attn */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_pg */ {},
/* n_batch */ {2048},
/* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
/* flash_attn */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
};
static void print_usage(int /* argc */, char ** argv) {
@@ -243,6 +263,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -284,6 +305,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.verbose = cmd_params_defaults.verbose;
params.output_format = cmd_params_defaults.output_format;
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
for (int i = 1; i < argc; i++) {
@@ -493,18 +515,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
if (argv[i] == std::string("csv")) {
params.output_format = CSV;
} else if (argv[i] == std::string("json")) {
params.output_format = JSON;
} else if (argv[i] == std::string("md")) {
params.output_format = MARKDOWN;
} else if (argv[i] == std::string("sql")) {
params.output_format = SQL;
} else {
invalid_param = !output_format_from_str(argv[i], params.output_format);
} else if (arg == "-oe" || arg == "--output-err") {
if (++i >= argc) {
invalid_param = true;
break;
}
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else {
@@ -706,7 +723,6 @@ struct test {
static const std::string build_commit;
static const int build_number;
static const bool cuda;
static const bool opencl;
static const bool vulkan;
static const bool kompute;
static const bool metal;
@@ -795,9 +811,6 @@ struct test {
if (cuda) {
return GGML_CUDA_NAME;
}
if (opencl) {
return "OpenCL";
}
if (vulkan) {
return "Vulkan";
}
@@ -826,7 +839,7 @@ struct test {
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
"cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
@@ -852,7 +865,7 @@ struct test {
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL;
@@ -881,7 +894,7 @@ struct test {
}
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
@@ -910,7 +923,6 @@ struct test {
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
const bool test::metal = !!ggml_cpu_has_metal();
@@ -1278,6 +1290,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
(void) user_data;
}
static std::unique_ptr<printer> create_printer(output_formats format) {
switch (format) {
case NONE:
return nullptr;
case CSV:
return std::unique_ptr<printer>(new csv_printer());
case JSON:
return std::unique_ptr<printer>(new json_printer());
case MARKDOWN:
return std::unique_ptr<printer>(new markdown_printer());
case SQL:
return std::unique_ptr<printer>(new sql_printer());
}
GGML_ASSERT(false);
}
int main(int argc, char ** argv) {
// try to set locale for unicode characters in markdown
setlocale(LC_CTYPE, ".UTF-8");
@@ -1304,26 +1332,18 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);
// initialize printer
std::unique_ptr<printer> p;
switch (params.output_format) {
case CSV:
p.reset(new csv_printer());
break;
case JSON:
p.reset(new json_printer());
break;
case MARKDOWN:
p.reset(new markdown_printer());
break;
case SQL:
p.reset(new sql_printer());
break;
default:
assert(false);
exit(1);
std::unique_ptr<printer> p = create_printer(params.output_format);
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
if (p) {
p->fout = stdout;
p->print_header(params);
}
if (p_err) {
p_err->fout = stderr;
p_err->print_header(params);
}
p->fout = stdout;
p->print_header(params);
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
@@ -1381,7 +1401,15 @@ int main(int argc, char ** argv) {
t.samples_ns.push_back(t_ns);
}
p->print_test(t);
if (p) {
p->print_test(t);
fflush(p->fout);
}
if (p_err) {
p_err->print_test(t);
fflush(p_err->fout);
}
llama_print_timings(ctx);
@@ -1390,7 +1418,13 @@ int main(int argc, char ** argv) {
llama_free_model(lmodel);
p->print_footer();
if (p) {
p->print_footer();
}
if (p_err) {
p_err->print_footer();
}
llama_backend_free();
+3 -5
View File
@@ -8,16 +8,14 @@ Because this example is "outside of the source tree", it is important to first b
### Considerations
When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
### Build llama.cpp and install to C:\LlamaCPP directory
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
@@ -27,7 +25,7 @@ cmake --install build --prefix C:/LlamaCPP
```cmd
cd ..\examples\main-cmake-pkg
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/MyLlamaApp
```
-1
View File
@@ -159,7 +159,6 @@
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
}
// lib.optionalAttrs pkgs.stdenv.isLinux {
opencl = config.packages.default.override { useOpenCL = true; };
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
mpi-cpu = config.packages.default.override { useMpi = true; };
+1 -1
View File
@@ -1,7 +1,7 @@
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
//
// How it works?
//
-2305
View File
File diff suppressed because it is too large Load Diff
-36
View File
@@ -1,36 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_API void ggml_cl_init(void);
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
// GGML_API void * ggml_cl_host_malloc(size_t size);
// GGML_API void ggml_cl_host_free(void * ptr);
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
// backend API
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
#ifdef __cplusplus
}
#endif
+3 -59
View File
@@ -297,17 +297,12 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
#include "ggml-opencl.h"
#endif
#elif defined(GGML_USE_OPENBLAS)
#if defined(GGML_BLAS_USE_MKL)
#include <mkl.h>
#else
#include <cblas.h>
#endif
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif
// floating point type used to accumulate sums
@@ -3380,10 +3375,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
#if defined(GGML_USE_CLBLAST)
ggml_cl_init();
#endif
ggml_setup_op_has_task_pass();
is_first_call = false;
@@ -9053,17 +9044,6 @@ static void ggml_compute_forward_add_f32(
const int ith = params->ith;
const int nth = params->nth;
#ifdef GGML_USE_CLBLAST
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
// TODO: OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
if (ith == 0) {
ggml_cl_add(src0, src1, dst);
}
return;
}
#endif
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
@@ -10171,17 +10151,6 @@ static void ggml_compute_forward_mul_f32(
const int ith = params->ith;
const int nth = params->nth;
#if defined(GGML_USE_CLBLAST)
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
// TODO: OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
if (ith == 0) {
ggml_cl_mul(src0, src1, dst);
}
return;
}
#endif
const int64_t nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
@@ -12417,15 +12386,6 @@ static void ggml_compute_forward_mul_mat(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
#if defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
return;
}
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
const int64_t ne_plane = ne01*ne00;
@@ -12873,8 +12833,6 @@ static void ggml_compute_forward_out_prod_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
// TODO: #if defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
bool use_blas = ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
@@ -13072,7 +13030,7 @@ static void ggml_compute_forward_out_prod_q_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
@@ -19546,11 +19504,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
{
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
#if defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
} else
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) {
@@ -22859,7 +22812,7 @@ int ggml_cpu_has_wasm_simd(void) {
}
int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
@@ -22874,14 +22827,6 @@ int ggml_cpu_has_cuda(void) {
#endif
}
int ggml_cpu_has_clblast(void) {
#if defined(GGML_USE_CLBLAST)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_vulkan(void) {
#if defined(GGML_USE_VULKAN)
return 1;
@@ -22915,8 +22860,7 @@ int ggml_cpu_has_rpc(void) {
}
int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
ggml_cpu_has_sycl();
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
}
int ggml_cpu_has_sse3(void) {
-1
View File
@@ -2425,7 +2425,6 @@ extern "C" {
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void);
GGML_API int ggml_cpu_has_gpublas (void);
+106 -307
View File
@@ -13,8 +13,6 @@
#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
# include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
@@ -2149,12 +2147,12 @@ struct llama_control_vector {
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
using tattr = llama_token_attr;
struct token_data {
token text;
float score;
ttype type;
tattr attr;
};
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -2406,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
@@ -2530,10 +2526,6 @@ static bool llama_kv_cache_init(
}
}
#ifdef GGML_USE_CLBLAST
offload = false;
#endif
// count used buffer types
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
if (offload) {
@@ -4750,7 +4742,20 @@ static void llm_load_vocab(
auto & token_data = vocab.id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores ? scores[i] : 0.0f;
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
switch(toktypes[i]) {
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
}
}
}
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
@@ -4841,7 +4846,7 @@ static void llm_load_vocab(
// build special tokens cache
{
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
vocab.cache_special_tokens.push_back(id);
}
}
@@ -4871,6 +4876,59 @@ static void llm_load_vocab(
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
}
// Handle per token attributes
//NOTE: Each model customizes per token attributes.
//NOTE: Per token attributes are missing from the GGUF file.
//TODO: Extract attributes from GGUF file.
{
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
for (auto substr : substrs) {
if (str.find(substr) < std::string::npos) {
return true;
}
}
return false;
};
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
uint32_t current = vocab.id_to_token.at(id).attr;
current = value ? (current | attr) : (current & ~attr);
vocab.id_to_token[id].attr = (llama_token_attr) current;
};
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
};
std::string model_name;
std::string tokenizer_pre;
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
// model name to lowercase
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
[] (const std::string::value_type x) {
return std::tolower(x);
}
);
// set attributes by model/tokenizer name
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
for (auto id : vocab.cache_special_tokens) {
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (auto token : {"</s>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
}
}
}
}
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@@ -12620,27 +12678,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
}
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
}
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
}
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
}
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
}
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
@@ -13258,7 +13316,8 @@ struct fragment_buffer_variant {
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
// for each special token
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
const auto & special_token = vocab.id_to_token[special_id].text;
const auto & data = vocab.id_to_token[special_id];
const auto & special_token = data.text;
// for each text fragment
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13295,13 +13354,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
if (match > raw_text_base_offset) {
// left
const int64_t left_reminder_offset = raw_text_base_offset + 0;
const int64_t left_reminder_length = match - raw_text_base_offset;
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
int64_t left_reminder_length = match - raw_text_base_offset;
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
left_reminder_length--;
}
}
if (left_reminder_length > 0) {
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
it++;
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
#endif
it++;
}
// special token
@@ -13310,16 +13378,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
// right
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
const int64_t right_reminder_offset = match + special_token.length();
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
int64_t right_reminder_offset = match + special_token.length();
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
right_reminder_offset++;
right_reminder_length--;
}
}
if (right_reminder_length > 0) {
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
it++;
}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
#endif
it++;
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
@@ -13365,9 +13442,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
// tokenizer.encode('', add_special_tokens=True) returns [1]
// tokenizer.encode('', add_special_tokens=False) returns []
static const bool rtrim = true; //TODO: as param
bool is_prev_special = false;
bool special_token_rtrim = false;
if (add_special && vocab.special_add_bos != 0) {
GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13377,25 +13452,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
// without adding this leading whitespace, we do not get the same results as the original tokenizer
// TODO: It's likely possible to get rid of this string copy entirely
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
// and passing 'add space prefix' as bool argument
//
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
if (special_token_rtrim) {
size_t num_whitespaces = 0;
while (isspace(raw_text[num_whitespaces])) {
num_whitespaces++;
}
if (num_whitespaces == raw_text.size()) {
continue; // skip if all whitespaces
}
raw_text = raw_text.substr(num_whitespaces);
}
if (vocab.add_space_prefix) {
if (!output.size() || is_prev_special) { // prefix with space if first token
raw_text = " " + raw_text;
@@ -13411,11 +13469,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
is_prev_special = true;
// phi-3 special tokens without rtrim, works fine for llama-spm too
special_token_rtrim = rtrim
&& fragment.token != vocab.special_bos_id
&& fragment.token != vocab.special_unk_id
&& fragment.token != vocab.special_eos_id;
}
}
@@ -14650,260 +14703,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
//
// Beam search
//
struct llama_beam {
std::vector<llama_token> tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
// Sort beams by probability. In case of ties, prefer beams at eob.
bool operator<(const llama_beam & rhs) const {
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
}
// Shift off first n tokens and discard them.
void shift_tokens(const size_t n) {
if (n) {
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
tokens.resize(tokens.size() - n);
}
}
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
};
// A struct for calculating logit-related info.
struct llama_logit_info {
const float * const logits;
const int n_vocab;
const float max_l;
const float normalizer;
struct sum_exp {
float max_l;
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
};
llama_logit_info(llama_context * ctx)
: logits(llama_get_logits(ctx))
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
, max_l(*std::max_element(logits, logits + n_vocab))
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
{ }
llama_token_data get_token_data(const llama_token token_id) const {
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
return {token_id, logits[token_id], p};
}
// Return top k token_data by logit.
std::vector<llama_token_data> top_k(size_t k) {
std::vector<llama_token_data> min_heap; // min-heap by logit
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
min_heap.reserve(k_min);
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
min_heap.push_back(get_token_data(token_id));
}
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
std::make_heap(min_heap.begin(), min_heap.end(), comp);
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
if (min_heap.front().logit < logits[token_id]) {
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
min_heap.back().id = token_id;
min_heap.back().logit = logits[token_id];
std::push_heap(min_heap.begin(), min_heap.end(), comp);
}
}
return min_heap;
}
float probability_from_logit(float logit) const {
return normalizer * std::exp(logit - max_l);
}
};
struct llama_beam_search_data {
llama_context * ctx;
size_t n_beams;
int n_past;
int n_predict;
std::vector<llama_beam> beams;
std::vector<llama_beam> next_beams;
// Re-calculated on each loop iteration
size_t common_prefix_length;
// Used to communicate to/from callback on beams state.
std::vector<llama_beam_view> beam_views;
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
: ctx(ctx)
, n_beams(n_beams)
, n_past(n_past)
, n_predict(n_predict)
, beam_views(n_beams) {
beams.reserve(n_beams);
next_beams.reserve(n_beams);
}
// Collapse beams to a single beam given by index.
void collapse_beams(const size_t beam_idx) {
if (0u < beam_idx) {
std::swap(beams[0], beams[beam_idx]);
}
beams.resize(1);
}
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
// The repetitive patterns below reflect the 2 stages of heaps:
// * Gather elements until the vector is full, then call std::make_heap() on it.
// * If the heap is full and a new element is found that should be included, pop the
// least element to the back(), replace it with the new, then push it into the heap.
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
// Min-heaps use a greater-than comparator.
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
if (beam.eob) {
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
if (next_beams.size() < n_beams) {
next_beams.push_back(std::move(beam));
if (next_beams.size() == n_beams) {
std::make_heap(next_beams.begin(), next_beams.end(), comp);
}
} else if (next_beams.front().p < beam.p) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = std::move(beam);
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
} else {
// beam is not at end-of-sentence, so branch with next top_k tokens.
if (!beam.tokens.empty()) {
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
}
llama_logit_info logit_info(ctx);
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
size_t i=0;
if (next_beams.size() < n_beams) {
for (; next_beams.size() < n_beams ; ++i) {
llama_beam next_beam = beam;
next_beam.tokens.push_back(next_tokens[i].id);
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
next_beams.push_back(std::move(next_beam));
}
std::make_heap(next_beams.begin(), next_beams.end(), comp);
} else {
for (; next_beams.front().p == 0.0f ; ++i) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = beam;
next_beams.back().tokens.push_back(next_tokens[i].id);
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
}
for (; i < n_beams ; ++i) {
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
if (next_beams.front().p < next_p) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = beam;
next_beams.back().tokens.push_back(next_tokens[i].id);
next_beams.back().p = next_p;
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
}
}
}
// Find common_prefix_length based on beams.
// Requires beams is not empty.
size_t find_common_prefix_length() {
size_t common_prefix_length = beams[0].tokens.size();
for (size_t i = 1 ; i < beams.size() ; ++i) {
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
if (beams[0].tokens[j] != beams[i].tokens[j]) {
common_prefix_length = j;
break;
}
}
}
return common_prefix_length;
}
// Construct beams_state to send back to caller via the callback function.
// Side effect: set common_prefix_length = find_common_prefix_length();
llama_beams_state get_beams_state(const bool last_call) {
for (size_t i = 0 ; i < beams.size() ; ++i) {
beam_views[i] = beams[i].view();
}
common_prefix_length = find_common_prefix_length();
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
}
// Loop:
// * while i < n_predict, AND
// * any of the beams have not yet reached end-of-beam (eob), AND
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
// (since all other beam probabilities can only decrease)
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
!beams[top_beam_index()].eob ; ++i) {
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
if (common_prefix_length) {
llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
n_past += common_prefix_length;
}
// Zero-out next_beam probabilities to place them last in following min-heap.
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
for (llama_beam & beam : beams) {
beam.shift_tokens(common_prefix_length);
fill_next_beams_by_top_probabilities(beam);
}
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
beams.swap(next_beams);
renormalize_beam_probabilities(beams);
}
collapse_beams(top_beam_index());
callback(callback_data, get_beams_state(true));
}
// As beams grow, the cumulative probabilities decrease.
// Renormalize them to avoid floating point underflow.
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
}
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
size_t top_beam_index() {
return std::max_element(beams.begin(), beams.end()) - beams.begin();
}
// Copy (p,eob) for each beam which may have been changed by the callback.
void update_beams_from_beam_views() {
for (size_t i = 0 ; i < beams.size() ; ++i) {
beams[i].p = beam_views[i].p;
beams[i].eob = beam_views[i].eob;
}
}
};
void llama_beam_search(llama_context * ctx,
llama_beam_search_callback_fn_t callback, void * callback_data,
size_t n_beams, int n_past, int n_predict) {
assert(ctx);
const int64_t t_start_sample_us = ggml_time_us();
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
beam_search_data.loop(callback, callback_data);
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
ctx->n_sample++;
}
//
// quantization
//
@@ -16114,7 +15913,7 @@ bool llama_supports_mlock(void) {
}
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
@@ -18221,9 +18020,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
return model->vocab.id_to_token[token].score;
}
llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
return model->vocab.id_to_token[token].type;
return model->vocab.id_to_token[token].attr;
}
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
+17 -43
View File
@@ -97,7 +97,7 @@ extern "C" {
LLAMA_ROPE_TYPE_GLM = 4,
};
enum llama_token_type {
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
LLAMA_TOKEN_TYPE_NORMAL = 1,
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -107,6 +107,20 @@ extern "C" {
LLAMA_TOKEN_TYPE_BYTE = 6,
};
enum llama_token_attr {
LLAMA_TOKEN_ATTR_UNDEFINED = 0,
LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 1,
LLAMA_TOKEN_ATTR_UNUSED = 1 << 2,
LLAMA_TOKEN_ATTR_NORMAL = 1 << 3,
LLAMA_TOKEN_ATTR_CONTROL = 1 << 4, // SPECIAL?
LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 5,
LLAMA_TOKEN_ATTR_BYTE = 1 << 6,
LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 7,
LLAMA_TOKEN_ATTR_LSTRIP = 1 << 8,
LLAMA_TOKEN_ATTR_RSTRIP = 1 << 9,
LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 10,
};
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
@@ -821,7 +835,7 @@ extern "C" {
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
@@ -1042,49 +1056,9 @@ extern "C" {
llama_token token);
//
// Beam search
// Model split
//
struct llama_beam_view {
const llama_token * tokens;
size_t n_tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Callback should set this to true when a beam is at end-of-beam.
};
// Passed to beam_search_callback function.
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
// These pointers are valid only during the synchronous callback, so should not be saved.
struct llama_beams_state {
struct llama_beam_view * beam_views;
size_t n_beams; // Number of elements in beam_views[].
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
bool last_call; // True iff this is the last callback invocation.
};
// Type of pointer to the beam_search_callback function.
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
/// @details Deterministically returns entire sentence constructed by a beam search.
/// @param ctx Pointer to the llama_context.
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
/// @param callback_data A pointer that is simply passed back to callback.
/// @param n_beams Number of beams to use.
/// @param n_past Number of tokens already evaluated.
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
LLAMA_API void llama_beam_search(
struct llama_context * ctx,
llama_beam_search_callback_fn_t callback,
void * callback_data,
size_t n_beams,
int32_t n_past,
int32_t n_predict);
/// @details Build a split GGUF final path for this chunk.
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
// Returns the split_path length.
Binary file not shown.
-5
View File
@@ -5,7 +5,6 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(LLAMA_BLAS @LLAMA_BLAS@)
set(LLAMA_CUDA @LLAMA_CUDA@)
set(LLAMA_METAL @LLAMA_METAL@)
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
@@ -36,10 +35,6 @@ if (LLAMA_METAL)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast REQUIRED)
endif()
if (LLAMA_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
+9 -7
View File
@@ -10,16 +10,18 @@ set -x
bench_args="${@:3}"
rm -f llama-bench.sqlite
rm -f llama-bench.sqlite > /dev/null
# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)
git checkout $1
make clean && make -j32 $make_opts llama-bench
./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
git checkout $1 > /dev/null
make clean > /dev/null
make -j$(nproc) $make_opts llama-bench > /dev/null
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
git checkout $2
make clean && make -j32 $make_opts llama-bench
./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
git checkout $2 > /dev/null
make clean > /dev/null
make -j$(nproc) $make_opts llama-bench > /dev/null
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
./scripts/compare-llama-bench.py -b $1 -c $2
+3 -3
View File
@@ -19,17 +19,17 @@ logger = logging.getLogger("compare-llama-bench")
# Properties by which to differentiate results per commit:
KEY_PROPERTIES = [
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
"blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
"type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
]
# Properties that are boolean and are converted to Yes/No for the table:
BOOL_PROPERTIES = ["cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
# Header names for the table:
PRETTY_NAMES = {
"cuda": "CUDA", "opencl": "OpenCL", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
"cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
"gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
"model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
"n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
+3 -8
View File
@@ -3,7 +3,7 @@
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Supports: CPU, CUDA, Metal
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
@@ -19,7 +19,7 @@
# --port: port number, default is 8888
# --repo: path to a repo containing GGUF model files
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
# --backend: cpu, cuda, metal, opencl, depends on the OS
# --backend: cpu, cuda, metal, depends on the OS
# --gpu-id: gpu id, default is 0
# --n-parallel: number of parallel requests, default is 8
# --n-kv: KV cache size, default is 4096
@@ -72,7 +72,7 @@ function print_usage {
printf " --port: port number, default is 8888\n"
printf " --repo: path to a repo containing GGUF model files\n"
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
printf " --backend: cpu, cuda, metal, depends on the OS\n"
printf " --gpu-id: gpu id, default is 0\n"
printf " --n-parallel: number of parallel requests, default is 8\n"
printf " --n-kv: KV cache size, default is 4096\n"
@@ -387,9 +387,6 @@ elif [[ "$backend" == "cpu" ]]; then
elif [[ "$backend" == "metal" ]]; then
printf "[+] Building with Metal backend\n"
make -j server $log
elif [[ "$backend" == "opencl" ]]; then
printf "[+] Building with OpenCL backend\n"
LLAMA_CLBLAST=1 make -j server $log
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
@@ -407,8 +404,6 @@ elif [[ "$backend" == "cpu" ]]; then
args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
args="-ngl 999"
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
-4
View File
@@ -106,8 +106,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-kompute.h -> ggml-kompute.h
# src/ggml-metal.h -> ggml-metal.h
# src/ggml-metal.m -> ggml-metal.m
# src/ggml-opencl.cpp -> ggml-opencl.cpp
# src/ggml-opencl.h -> ggml-opencl.h
# src/ggml-quants.c -> ggml-quants.c
# src/ggml-quants.h -> ggml-quants.h
# src/ggml-rpc.cpp -> ggml-rpc.cpp
@@ -143,8 +141,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
-e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
-2
View File
@@ -14,8 +14,6 @@ cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h
cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp
+34 -16
View File
@@ -156,17 +156,39 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
'<s>a', # Phi-3 fail
'<unk><|endoftext|><s>', # Phi-3 fail
'a\na', # TODO: Bert fail
'a </s> b', # rstrip phi-3
'a <mask> b', # lstrip jina-v2
]
def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
special_tokens = set(tokenizer.all_special_tokens)
special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
special_tokens = list(sorted(special_tokens))
def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
"""Brute force check all vocab words"""
yield from vocab
def generator_added_lr_strip(tokenizer) -> Iterator[str]:
WHITESPACES = ["", " ", " ", " "]
special_tokens = list(tokenizer.all_special_tokens)
added_tokens = list(tokenizer.added_tokens_encoder)
all_tokens = list(sorted(set(special_tokens + added_tokens)))
for token in all_tokens:
for lstrip in WHITESPACES:
for rstrip in WHITESPACES:
yield lstrip + token + rstrip
yield "a" + lstrip + token + rstrip
yield lstrip + token + rstrip + "z"
yield "a" + lstrip + token + rstrip + "z"
def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]:
special_tokens = list(tokenizer.all_special_tokens)
added_tokens = list(tokenizer.added_tokens_encoder)
separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
all_tokens = list(sorted(set(special_tokens + added_tokens + separations)))
rand = random.Random()
for m in range(iterations):
rand.seed(m)
words = rand.choices(special_tokens, k=500)
words = rand.choices(all_tokens, k=500)
if words[0] == tokenizer.bos_token: # skip spam warning of double BOS
while len(words) > 1 and words[1] == tokenizer.bos_token: # leave one starting BOS
words.pop(0)
@@ -175,11 +197,6 @@ def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
yield "".join(words)
def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
"""Brute force check all vocab words"""
yield from vocab
def generator_random_chars(iterations=100) -> Iterator[str]:
"""Brute force random text with simple characters"""
@@ -274,8 +291,8 @@ def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, g
ids2 = func_tokenize2(text)
if ids1 != ids2:
i = find_first_mismatch(ids1, ids2)
ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
logger.info(" TokenIDs: " + str(ids1))
logger.info(" Expected: " + str(ids2))
raise Exception()
@@ -309,8 +326,9 @@ def main(argv: list[str] = None):
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer, 10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
@@ -322,14 +340,14 @@ def main(argv: list[str] = None):
if __name__ == "__main__":
# main()
path_tokenizers = "./models/tokenizers/"
path_tokenizers = "./models/tokenizers/"
path_vocab_format = "./models/ggml-vocab-%s.gguf"
# import os
# tokenizers = os.listdir(path_tokenizers)
tokenizers = [
# "llama-spm", # SPM
# "phi-3", # SPM
"llama-spm", # SPM
"phi-3", # SPM
"jina-v2-en", # WPM
"bert-bge", # WPM
]