mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 10:46:43 +02:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c90dbe026b | |||
| b90dc566c1 | |||
| 1442677f92 | |||
| 554c247caf | |||
| 0cd6bd3483 | |||
| 5ca0944a15 | |||
| adc9ff3841 | |||
| 987d743d6b | |||
| b226c1227b | |||
| 3b38d48609 |
@@ -688,8 +688,6 @@ jobs:
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
OPENCL_VERSION: 2023.04.17
|
||||
CLBLAST_VERSION: 1.6.0
|
||||
SDE_VERSION: 9.33.0-2024-01-07
|
||||
VULKAN_VERSION: 1.3.261.1
|
||||
|
||||
@@ -706,8 +704,6 @@ jobs:
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute-x64'
|
||||
@@ -732,27 +728,6 @@ jobs:
|
||||
run: |
|
||||
git submodule update --init kompute
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
|
||||
mkdir $env:RUNNER_TEMP/opencl
|
||||
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
|
||||
|
||||
- name: Download CLBlast
|
||||
id: get_clblast
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
|
||||
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
|
||||
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
|
||||
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
|
||||
$txt = Get-Content -Path $f -Raw
|
||||
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
|
||||
}
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
@@ -786,13 +761,6 @@ jobs:
|
||||
cmake -S . -B build ${{ matrix.defines }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add clblast.dll
|
||||
id: add_clblast_dll
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
|
||||
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
@@ -816,7 +784,7 @@ jobs:
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
||||
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
@@ -1071,7 +1039,7 @@ jobs:
|
||||
# hypervisor: 'qemu'
|
||||
# run: |
|
||||
# sudo pkg update
|
||||
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
|
||||
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
|
||||
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
|
||||
|
||||
release:
|
||||
|
||||
@@ -34,9 +34,11 @@ ggml-metal-embed.metal
|
||||
lcov-report/
|
||||
gcovr-report/
|
||||
|
||||
tags
|
||||
build*
|
||||
!build.zig
|
||||
cmake-build-*
|
||||
android-ndk-*
|
||||
out/
|
||||
tmp/
|
||||
|
||||
|
||||
+11
-23
@@ -111,7 +111,6 @@ option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for Flas
|
||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
|
||||
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
|
||||
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
|
||||
@@ -502,22 +501,6 @@ if (LLAMA_RPC)
|
||||
set(GGML_SOURCES_RPC ggml-rpc.cpp)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast)
|
||||
if (CLBlast_FOUND)
|
||||
message(STATUS "CLBlast found")
|
||||
|
||||
set(GGML_HEADERS_OPENCL ggml-opencl.h)
|
||||
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
|
||||
|
||||
add_compile_definitions(GGML_USE_CLBLAST)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
|
||||
else()
|
||||
message(WARNING "CLBlast not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_VULKAN)
|
||||
find_package(Vulkan)
|
||||
if (Vulkan_FOUND)
|
||||
@@ -557,12 +540,17 @@ if (LLAMA_VULKAN)
|
||||
endif()
|
||||
|
||||
if (LLAMA_HIPBLAS)
|
||||
if ($ENV{ROCM_PATH})
|
||||
set(ROCM_PATH $ENV{ROCM_PATH})
|
||||
if (NOT EXISTS $ENV{ROCM_PATH})
|
||||
if (NOT EXISTS /opt/rocm)
|
||||
set(ROCM_PATH /usr)
|
||||
else()
|
||||
set(ROCM_PATH /opt/rocm)
|
||||
endif()
|
||||
else()
|
||||
set(ROCM_PATH /opt/rocm)
|
||||
set(ROCM_PATH $ENV{ROCM_PATH})
|
||||
endif()
|
||||
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
|
||||
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
|
||||
|
||||
# CMake on Windows doesn't support the HIP language yet
|
||||
if(WIN32)
|
||||
@@ -1260,7 +1248,6 @@ add_library(ggml OBJECT
|
||||
ggml-quants.c
|
||||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
@@ -1348,8 +1335,9 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
|
||||
"${GGML_HEADERS_CUDA}"
|
||||
"${GGML_HEADERS_METAL}"
|
||||
"${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
install(TARGETS ggml PUBLIC_HEADER)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
@@ -547,23 +547,6 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h
|
||||
$(NVCC_COMPILE)
|
||||
endif # LLAMA_CUDA
|
||||
|
||||
ifdef LLAMA_CLBLAST
|
||||
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
|
||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
||||
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
||||
|
||||
# Mac provides OpenCL as a framework
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
MK_LDFLAGS += -lclblast -framework OpenCL
|
||||
else
|
||||
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
|
||||
endif
|
||||
OBJS += ggml-opencl.o
|
||||
|
||||
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # LLAMA_CLBLAST
|
||||
|
||||
ifdef LLAMA_VULKAN
|
||||
MK_CPPFLAGS += -DGGML_USE_VULKAN
|
||||
MK_LDFLAGS += -lvulkan
|
||||
@@ -914,10 +897,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
+1
-1
@@ -29,7 +29,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o
|
||||
|
||||
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
|
||||
|
||||
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
|
||||
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
|
||||
|
||||
## News
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ variety of hardware - locally and in the cloud.
|
||||
- AVX, AVX2 and AVX512 support for x86 architectures
|
||||
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
|
||||
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
|
||||
- Vulkan, SYCL, and (partial) OpenCL backend support
|
||||
- Vulkan and SYCL backend support
|
||||
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
|
||||
|
||||
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
|
||||
@@ -364,17 +364,6 @@ In order to build llama.cpp you have four different options.
|
||||
cmake --build build --config Debug
|
||||
```
|
||||
|
||||
- Using `Zig` (version 0.11 or later):
|
||||
|
||||
Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
|
||||
it's also possible to cross compile for other operating systems and architectures:
|
||||
|
||||
```bash
|
||||
zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
|
||||
```
|
||||
|
||||
The `zig targets` command will give you valid options to use.
|
||||
|
||||
- Using `gmake` (FreeBSD):
|
||||
|
||||
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
|
||||
@@ -382,16 +371,11 @@ In order to build llama.cpp you have four different options.
|
||||
3. Install compilation dependencies.
|
||||
|
||||
```bash
|
||||
sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
|
||||
opencl clblast openblas
|
||||
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
|
||||
|
||||
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
|
||||
```
|
||||
|
||||
**Notes:** With this packages you can build llama.cpp with OPENBLAS and
|
||||
CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
|
||||
the instructions for use and activate this options in this document below.
|
||||
|
||||
### Homebrew
|
||||
|
||||
On Mac and Linux, the homebrew package manager can be used via
|
||||
@@ -410,7 +394,7 @@ argument.
|
||||
|
||||
### BLAS Build
|
||||
|
||||
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
|
||||
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
|
||||
|
||||
- #### Accelerate Framework:
|
||||
|
||||
@@ -564,111 +548,6 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
|
||||
- #### CLBlast
|
||||
|
||||
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
|
||||
|
||||
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
|
||||
- For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
|
||||
|
||||
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
|
||||
|
||||
- <details>
|
||||
<summary>Installing the OpenCL SDK from source</summary>
|
||||
|
||||
```sh
|
||||
git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
|
||||
cd OpenCL-SDK
|
||||
cmake -B build -DBUILD_DOCS=OFF \
|
||||
-DBUILD_EXAMPLES=OFF \
|
||||
-DBUILD_TESTING=OFF \
|
||||
-DOPENCL_SDK_BUILD_SAMPLES=OFF \
|
||||
-DOPENCL_SDK_TEST_SAMPLES=OFF
|
||||
cmake --build build
|
||||
cmake --install build --prefix /some/path
|
||||
```
|
||||
</details>
|
||||
|
||||
##### Installing CLBlast
|
||||
|
||||
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
|
||||
|
||||
Linux packaging:
|
||||
Fedora Linux:
|
||||
```bash
|
||||
sudo dnf install clblast
|
||||
```
|
||||
|
||||
Alternatively, they may be built from source.
|
||||
|
||||
- <details>
|
||||
<summary>Windows:</summary>
|
||||
|
||||
```cmd
|
||||
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
|
||||
git clone https://github.com/CNugteren/CLBlast.git
|
||||
cd CLBlast
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix C:/CLBlast
|
||||
```
|
||||
|
||||
(note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
|
||||
|
||||
- <details>
|
||||
<summary>Unix:</summary>
|
||||
|
||||
```sh
|
||||
git clone https://github.com/CNugteren/CLBlast.git
|
||||
cd CLBlast
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix /some/path
|
||||
```
|
||||
|
||||
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
|
||||
</details>
|
||||
|
||||
##### Building Llama with CLBlast
|
||||
|
||||
- Build with make:
|
||||
```sh
|
||||
make LLAMA_CLBLAST=1
|
||||
```
|
||||
- CMake (Unix):
|
||||
```sh
|
||||
cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
cmake --build build --config Release
|
||||
```
|
||||
- CMake (Windows):
|
||||
```cmd
|
||||
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix C:/LlamaCPP
|
||||
```
|
||||
|
||||
##### Running Llama with CLBlast
|
||||
|
||||
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
|
||||
|
||||
To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
|
||||
The selection can be a number (starting from 0) or a text string to search:
|
||||
|
||||
```sh
|
||||
GGML_OPENCL_PLATFORM=1 ./main ...
|
||||
GGML_OPENCL_DEVICE=2 ./main ...
|
||||
GGML_OPENCL_PLATFORM=Intel ./main ...
|
||||
GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
|
||||
```
|
||||
|
||||
The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
|
||||
Using the variables it is possible to select a CPU-based driver as well, if so desired.
|
||||
|
||||
You can get a list of platforms and devices from the `clinfo -l` command, etc.
|
||||
|
||||
- #### Vulkan
|
||||
|
||||
**With docker**:
|
||||
|
||||
+561
-259
File diff suppressed because it is too large
Load Diff
+82
-23
@@ -60,7 +60,7 @@ struct gpt_params {
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_threads_batch_draft = -1;
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_ctx = 0; // context size
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
@@ -99,23 +99,23 @@ struct gpt_params {
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
||||
std::string model = ""; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model = ""; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string model_url = ""; // model url to download
|
||||
std::string hf_repo = ""; // HF repo
|
||||
std::string hf_file = ""; // HF file
|
||||
std::string model_url = ""; // model url to download
|
||||
std::string hf_repo = ""; // HF repo
|
||||
std::string hf_file = ""; // HF file
|
||||
std::string prompt = "";
|
||||
std::string prompt_file = ""; // store the external prompt file name
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
std::string prompt_file = ""; // store the external prompt file name
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
|
||||
std::string logits_file = ""; // file for saving *all* logits
|
||||
std::string logits_file = ""; // file for saving *all* logits
|
||||
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
|
||||
// TODO: avoid tuple, use struct
|
||||
@@ -127,8 +127,8 @@ struct gpt_params {
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||
|
||||
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
// (which is more convenient to use for plotting)
|
||||
//
|
||||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||
@@ -142,19 +142,17 @@ struct gpt_params {
|
||||
|
||||
bool kl_divergence = false; // compute KL divergence
|
||||
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool usage = false; // print usage
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
|
||||
bool special = false; // enable special token output
|
||||
bool interactive = false; // interactive mode
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
||||
bool multiline_input = false; // reverse the usage of `\`
|
||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||
@@ -162,10 +160,10 @@ struct gpt_params {
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool ignore_eos = false; // ignore generated EOS tokens
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool verbose = false;
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
bool display_prompt = true; // print prompt before generation
|
||||
bool infill = false; // use infill mode
|
||||
@@ -180,6 +178,47 @@ struct gpt_params {
|
||||
// multimodal models (see examples/llava)
|
||||
std::string mmproj = ""; // path to multimodal projector
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// server params
|
||||
int32_t port = 8080;
|
||||
int32_t timeout_read = 600;
|
||||
int32_t timeout_write = timeout_read;
|
||||
int32_t n_threads_http = -1;
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = "";
|
||||
std::string chat_template = "";
|
||||
std::string system_prompt = "";
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
std::string ssl_file_key = "";
|
||||
std::string ssl_file_cert = "";
|
||||
|
||||
bool endpoint_slots = true;
|
||||
bool endpoint_metrics = false;
|
||||
|
||||
bool log_json = false;
|
||||
|
||||
std::string slot_save_path;
|
||||
|
||||
// batched-bench params
|
||||
bool is_pp_shared = false;
|
||||
|
||||
std::vector<int32_t> n_pp;
|
||||
std::vector<int32_t> n_tg;
|
||||
std::vector<int32_t> n_pl;
|
||||
|
||||
// retrieval params
|
||||
std::vector<std::string> context_files; // context files to embed
|
||||
|
||||
int32_t chunk_size = 64; // chunk size for context embedding
|
||||
|
||||
std::string chunk_separator = "\n"; // chunk separator for context embedding
|
||||
|
||||
// passkey params
|
||||
int32_t n_junk = 250; // number of times to repeat the junk text
|
||||
int32_t i_pos = -1; // position of the passkey in the junk text
|
||||
};
|
||||
|
||||
void gpt_params_handle_model_default(gpt_params & params);
|
||||
@@ -199,7 +238,20 @@ std::vector<std::string> string_split(std::string input, char separator);
|
||||
|
||||
std::string string_strip(const std::string & str);
|
||||
std::string string_get_sortable_timestamp();
|
||||
std::string string_random_prompt(std::mt19937 & rng);
|
||||
|
||||
template<class T>
|
||||
static std::vector<T> string_split(const std::string & str, char delim) {
|
||||
std::vector<T> values;
|
||||
std::istringstream str_stream(str);
|
||||
std::string token;
|
||||
while (std::getline(str_stream, token, delim)) {
|
||||
T value;
|
||||
std::istringstream token_stream(token);
|
||||
token_stream >> value;
|
||||
values.push_back(value);
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
void string_process_escapes(std::string & input);
|
||||
@@ -282,6 +334,13 @@ std::string llama_detokenize_bpe(
|
||||
// defaults to true when model type is SPM, otherwise false.
|
||||
bool llama_should_add_bos_token(const llama_model * model);
|
||||
|
||||
//
|
||||
// Chat template utils
|
||||
//
|
||||
|
||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
bool llama_chat_verify_template(const std::string & tmpl);
|
||||
|
||||
//
|
||||
// KV cache utils
|
||||
//
|
||||
|
||||
@@ -15,7 +15,6 @@ else()
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(beam-search)
|
||||
add_subdirectory(benchmark)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
|
||||
@@ -10,16 +10,16 @@ There are 2 modes of operation:
|
||||
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||
|
||||
```bash
|
||||
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
|
||||
./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
|
||||
|
||||
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
|
||||
./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
|
||||
|
||||
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
|
||||
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
|
||||
|
||||
# custom set of batches
|
||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
|
||||
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
|
||||
```
|
||||
|
||||
## Sample results
|
||||
|
||||
@@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
|
||||
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
|
||||
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
|
||||
return 1 ;
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int n_kv_max = 2048;
|
||||
int n_batch = 2048;
|
||||
int n_ubatch = 512;
|
||||
bool flash_attn = false;
|
||||
int is_pp_shared = 0;
|
||||
int n_gpu_layers = 0;
|
||||
int is_pp_shared = params.is_pp_shared;
|
||||
|
||||
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
|
||||
std::vector<int> n_tg = { 128, 256, };
|
||||
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
|
||||
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
||||
if (argc >= 3) {
|
||||
n_kv_max = std::atoi(argv[2]);
|
||||
}
|
||||
|
||||
if (argc >= 4) {
|
||||
n_batch = std::atoi(argv[3]);
|
||||
}
|
||||
|
||||
if (argc >= 5) {
|
||||
n_ubatch = std::atoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
flash_attn = std::atoi(argv[5]);
|
||||
}
|
||||
|
||||
if (argc >= 7) {
|
||||
is_pp_shared = std::atoi(argv[6]);
|
||||
}
|
||||
|
||||
if (argc >= 8) {
|
||||
n_gpu_layers = std::atoi(argv[7]);
|
||||
}
|
||||
|
||||
if (argc >= 9) {
|
||||
n_pp = parse_list(argv[8]);
|
||||
}
|
||||
|
||||
if (argc >= 10) {
|
||||
n_tg = parse_list(argv[9]);
|
||||
}
|
||||
|
||||
if (argc >= 11) {
|
||||
n_pl = parse_list(argv[10]);
|
||||
}
|
||||
std::vector<int> n_pp = params.n_pp;
|
||||
std::vector<int> n_tg = params.n_tg;
|
||||
std::vector<int> n_pl = params.n_pl;
|
||||
|
||||
// init LLM
|
||||
|
||||
@@ -97,12 +57,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
const std::vector<float> t_split(llama_max_devices(), 0.0f);
|
||||
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
model_params.tensor_split = t_split.data();
|
||||
llama_model_params model_params = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
@@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = n_kv_max;
|
||||
ctx_params.n_batch = n_batch;
|
||||
ctx_params.n_ubatch = n_ubatch;
|
||||
ctx_params.flash_attn = flash_attn;
|
||||
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
|
||||
|
||||
// ensure enough sequences are available
|
||||
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
|
||||
@@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int32_t n_kv_max = llama_n_ctx(ctx);
|
||||
|
||||
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
|
||||
|
||||
// decode in batches of ctx_params.n_batch tokens
|
||||
@@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG_TEE("\n");
|
||||
|
||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
The example demonstrates batched generation from a given prompt
|
||||
|
||||
```bash
|
||||
./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
|
||||
./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
|
||||
|
||||
...
|
||||
|
||||
|
||||
@@ -7,48 +7,31 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
|
||||
return 1 ;
|
||||
params.prompt = "Hello my name is";
|
||||
params.n_predict = 32;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
// number of parallel batches
|
||||
int n_parallel = 1;
|
||||
int n_parallel = params.n_parallel;
|
||||
|
||||
// total length of the sequences including the prompt
|
||||
int n_len = 32;
|
||||
|
||||
// number of layers to offload to the GPU
|
||||
int n_gpu_layers = 0;
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
||||
if (argc >= 3) {
|
||||
params.prompt = argv[2];
|
||||
}
|
||||
|
||||
if (argc >= 4) {
|
||||
n_parallel = std::atoi(argv[3]);
|
||||
}
|
||||
|
||||
if (argc >= 5) {
|
||||
n_len = std::atoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
n_gpu_layers = std::atoi(argv[5]);
|
||||
}
|
||||
|
||||
if (params.prompt.empty()) {
|
||||
params.prompt = "Hello my name is";
|
||||
}
|
||||
|
||||
string_process_escapes(params.prompt);
|
||||
int n_predict = 32;
|
||||
|
||||
// init LLM
|
||||
|
||||
@@ -57,9 +40,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
llama_model_params model_params = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
@@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> tokens_list;
|
||||
tokens_list = ::llama_tokenize(model, params.prompt, true);
|
||||
|
||||
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
|
||||
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
|
||||
|
||||
// initialize the context
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = n_kv_req;
|
||||
ctx_params.n_batch = std::max(n_len, n_parallel);
|
||||
ctx_params.n_seq_max = n_parallel;
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
ctx_params.n_batch = std::max(n_predict, n_parallel);
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
@@ -93,9 +70,9 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||
|
||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||
if (n_kv_req > n_ctx) {
|
||||
@@ -156,7 +133,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
while (n_cur <= n_len) {
|
||||
while (n_cur <= n_predict) {
|
||||
// prepare the next batch
|
||||
llama_batch_clear(batch);
|
||||
|
||||
@@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
|
||||
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of generation? -> mark the stream as finished
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||
i_batch[i] = -1;
|
||||
LOG_TEE("\n");
|
||||
if (n_parallel > 1) {
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET beam-search)
|
||||
add_executable(${TARGET} beam-search.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -1,188 +0,0 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
// Used for debugging to print out beam tokens.
|
||||
struct ostream_beam_view {
|
||||
llama_context * ctx;
|
||||
llama_beam_view beam_view;
|
||||
};
|
||||
|
||||
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
|
||||
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
||||
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
||||
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
||||
}
|
||||
return os << ')';
|
||||
}
|
||||
|
||||
// Put here anything you want back in beam_search_callback().
|
||||
struct beam_search_callback_data {
|
||||
llama_context * ctx;
|
||||
std::vector<llama_token> response;
|
||||
};
|
||||
|
||||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
||||
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
|
||||
}
|
||||
|
||||
// Function matching type llama_beam_search_callback_fn_t.
|
||||
// Custom callback example is called each time the beams lengths increase:
|
||||
// * Show progress by printing ',' following by number of convergent beam tokens if any.
|
||||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||
// This is also called when the stop condition is met.
|
||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
||||
// Mark beams as EOS as needed.
|
||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||
llama_beam_view& beam_view = beams_state.beam_views[i];
|
||||
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
|
||||
beam_view.eob = true;
|
||||
}
|
||||
}
|
||||
printf(","); // Show progress
|
||||
if (const size_t n = beams_state.common_prefix_length) {
|
||||
callback_data.response.resize(callback_data.response.size() + n);
|
||||
assert(0u < beams_state.n_beams);
|
||||
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
||||
printf("%zu", n);
|
||||
}
|
||||
fflush(stdout);
|
||||
#if 1 // DEBUG: print current beams for this iteration
|
||||
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
|
||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
gpt_params params;
|
||||
//params.n_gpu_layers = 200;
|
||||
|
||||
//---------------------------------
|
||||
// Print help :
|
||||
//---------------------------------
|
||||
|
||||
if ( argc < 2 || argv[1][0] == '-' )
|
||||
{
|
||||
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
//---------------------------------
|
||||
// Load parameters :
|
||||
//---------------------------------
|
||||
|
||||
params.model = argv[1];
|
||||
|
||||
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
|
||||
|
||||
if ( argc > 3 )
|
||||
{
|
||||
params.prompt = argv[3];
|
||||
}
|
||||
|
||||
if ( params.prompt.empty() )
|
||||
{
|
||||
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
|
||||
}
|
||||
|
||||
//---------------------------------
|
||||
// Init LLM :
|
||||
//---------------------------------
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
||||
|
||||
if ( model == NULL )
|
||||
{
|
||||
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
||||
return 1;
|
||||
}
|
||||
|
||||
//---------------------------------
|
||||
// Tokenize the prompt :
|
||||
//---------------------------------
|
||||
|
||||
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const size_t max_context_size = llama_n_ctx( ctx );
|
||||
const size_t max_tokens_list_size = max_context_size - 4 ;
|
||||
|
||||
if (tokens_list.size() > max_tokens_list_size)
|
||||
{
|
||||
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
|
||||
__func__ , tokens_list.size() , max_tokens_list_size );
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf( stderr, "\n\n" );
|
||||
|
||||
// Print the tokens from the prompt :
|
||||
|
||||
for( auto id : tokens_list )
|
||||
{
|
||||
std::cout << llama_token_to_piece(ctx, id);
|
||||
}
|
||||
std::cout << std::flush;
|
||||
|
||||
int n_past = 0;
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
|
||||
{
|
||||
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
||||
return 1;
|
||||
}
|
||||
n_past += tokens_list.size();
|
||||
|
||||
beam_search_callback_data callback_data{ctx, {}};
|
||||
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
||||
int const n_predict = 256;
|
||||
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
|
||||
|
||||
std::cout << "\n\n";
|
||||
for (llama_token const token_id : callback_data.response) {
|
||||
std::cout << llama_token_to_piece(ctx,token_id);
|
||||
}
|
||||
std::cout << std::endl;
|
||||
|
||||
llama_free( ctx );
|
||||
llama_free_model( model );
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -63,6 +63,7 @@ int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -79,9 +80,6 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
@@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
|
||||
callback_data cb_data;
|
||||
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
@@ -41,7 +41,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 2b. Test the sharded model is loading properly
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
@@ -51,7 +51,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 3b. Test the merged model is loading properly
|
||||
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
@@ -61,7 +61,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 4b. Test the sharded model is loading properly
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
@@ -71,7 +71,7 @@ echo
|
||||
#echo
|
||||
|
||||
# 5b. Test the merged model is loading properly
|
||||
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
|
||||
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
|
||||
#echo PASS
|
||||
#echo
|
||||
|
||||
@@ -81,7 +81,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 6b. Test the sharded model is loading properly
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
|
||||
@@ -153,7 +153,9 @@ static std::string gritlm_instruction(const std::string & instruction) {
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -533,7 +533,6 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
|
||||
StatParams sparams;
|
||||
std::string prev_result_file;
|
||||
std::string combine_files;
|
||||
@@ -581,7 +580,9 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
params.n_batch = 512;
|
||||
if (!gpt_params_parse(args.size(), args.data(), params)) {
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -597,9 +598,6 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
sparams.dataset = params.prompt_file;
|
||||
g_collector.set_parameters(std::move(sparams));
|
||||
|
||||
+9
-125
@@ -107,6 +107,7 @@ int main(int argc, char ** argv) {
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -139,27 +140,6 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
if (params.instruct) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'main' tool for instruct mode\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (params.chatml) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'main' tool for chatml mode\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (!params.antiprompt.empty()) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||
@@ -167,20 +147,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (params.random_prompt) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (!params.path_prompt_cache.empty()) {
|
||||
printf("\n************\n");
|
||||
printf("%s: infill does not support prompt caching\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
@@ -207,17 +173,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_context * ctx_guidance = NULL;
|
||||
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (sparams.cfg_scale > 1.f) {
|
||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||
ctx_guidance = llama_new_context_with_model(model, lparams);
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||
@@ -273,25 +235,6 @@ int main(int argc, char ** argv) {
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
std::vector<llama_token> guidance_inp;
|
||||
int guidance_offset = 0;
|
||||
int original_prompt_len = 0;
|
||||
if (ctx_guidance) {
|
||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
|
||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||
|
||||
original_prompt_len = original_inp.size();
|
||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
|
||||
LOG("guidance_offset: %s", log_tostr(guidance_offset));
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
@@ -319,15 +262,6 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (ctx_guidance) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
@@ -395,12 +329,11 @@ int main(int argc, char ** argv) {
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool input_echo = true;
|
||||
bool input_echo = true;
|
||||
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
int n_past_guidance = 0;
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
@@ -410,7 +343,6 @@ int main(int argc, char ** argv) {
|
||||
console::set_display(console::prompt);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
@@ -436,7 +368,7 @@ int main(int argc, char ** argv) {
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
@@ -453,11 +385,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
if (ctx_guidance) {
|
||||
n_past_guidance -= n_discard;
|
||||
}
|
||||
|
||||
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
||||
LOG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
|
||||
@@ -465,45 +393,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
|
||||
if (ctx_guidance) {
|
||||
int input_size = 0;
|
||||
llama_token * input_buf = NULL;
|
||||
|
||||
if (n_past_guidance < (int) guidance_inp.size()) {
|
||||
// Guidance context should have the same data with these modifications:
|
||||
//
|
||||
// * Replace the initial prompt
|
||||
// * Shift everything by guidance_offset
|
||||
embd_guidance = guidance_inp;
|
||||
if (embd.begin() + original_prompt_len < embd.end()) {
|
||||
embd_guidance.insert(
|
||||
embd_guidance.end(),
|
||||
embd.begin() + original_prompt_len,
|
||||
embd.end()
|
||||
);
|
||||
}
|
||||
|
||||
input_buf = embd_guidance.data();
|
||||
input_size = embd_guidance.size();
|
||||
|
||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
|
||||
} else {
|
||||
input_buf = embd.data();
|
||||
input_size = embd.size();
|
||||
}
|
||||
|
||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
||||
int n_eval = std::min(input_size - i, params.n_batch);
|
||||
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past_guidance += n_eval;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
@@ -525,11 +414,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
embd_guidance.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
@@ -583,7 +470,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// deal with eot token in infill mode
|
||||
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||
if (is_interacting && !params.interactive_first) {
|
||||
@@ -644,7 +530,6 @@ int main(int argc, char ** argv) {
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp.push_back(llama_token_middle(model));
|
||||
embd.clear();
|
||||
embd_guidance.clear();
|
||||
n_remain = params.n_predict;
|
||||
n_past = 0;
|
||||
n_consumed = 0;
|
||||
@@ -751,7 +636,6 @@ int main(int argc, char ** argv) {
|
||||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||
|
||||
if (ctx_guidance) { llama_free(ctx_guidance); }
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
|
||||
@@ -162,7 +162,7 @@ $ ./llama-bench -o csv
|
||||
```
|
||||
|
||||
```csv
|
||||
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
|
||||
```
|
||||
@@ -179,7 +179,6 @@ $ ./llama-bench -o json
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"opencl": false,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
@@ -210,7 +209,6 @@ $ ./llama-bench -o json
|
||||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"opencl": false,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
@@ -253,7 +251,6 @@ CREATE TABLE IF NOT EXISTS test (
|
||||
build_commit TEXT,
|
||||
build_number INTEGER,
|
||||
cuda INTEGER,
|
||||
opencl INTEGER,
|
||||
metal INTEGER,
|
||||
gpu_blas INTEGER,
|
||||
blas INTEGER,
|
||||
@@ -279,6 +276,6 @@ CREATE TABLE IF NOT EXISTS test (
|
||||
stddev_ts REAL
|
||||
);
|
||||
|
||||
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
|
||||
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
|
||||
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
|
||||
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
|
||||
```
|
||||
|
||||
@@ -41,20 +41,6 @@ static std::string join(const std::vector<T> & values, const std::string & delim
|
||||
return str.str();
|
||||
}
|
||||
|
||||
template<class T>
|
||||
static std::vector<T> split(const std::string & str, char delim) {
|
||||
std::vector<T> values;
|
||||
std::istringstream str_stream(str);
|
||||
std::string token;
|
||||
while (std::getline(str_stream, token, delim)) {
|
||||
T value;
|
||||
std::istringstream token_stream(token);
|
||||
token_stream >> value;
|
||||
values.push_back(value);
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
template<typename T, typename F>
|
||||
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
|
||||
std::vector<std::string> str_values;
|
||||
@@ -140,10 +126,11 @@ static std::string get_gpu_info() {
|
||||
}
|
||||
|
||||
// command line params
|
||||
enum output_formats {CSV, JSON, MARKDOWN, SQL};
|
||||
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
|
||||
|
||||
static const char * output_format_str(output_formats format) {
|
||||
switch (format) {
|
||||
case NONE: return "none";
|
||||
case CSV: return "csv";
|
||||
case JSON: return "json";
|
||||
case MARKDOWN: return "md";
|
||||
@@ -152,6 +139,23 @@ static const char * output_format_str(output_formats format) {
|
||||
}
|
||||
}
|
||||
|
||||
static bool output_format_from_str(const std::string & s, output_formats & format) {
|
||||
if (s == "none") {
|
||||
format = NONE;
|
||||
} else if (s == "csv") {
|
||||
format = CSV;
|
||||
} else if (s == "json") {
|
||||
format = JSON;
|
||||
} else if (s == "md") {
|
||||
format = MARKDOWN;
|
||||
} else if (s == "sql") {
|
||||
format = SQL;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char * split_mode_str(llama_split_mode mode) {
|
||||
switch (mode) {
|
||||
case LLAMA_SPLIT_MODE_NONE: return "none";
|
||||
@@ -190,31 +194,33 @@ struct cmd_params {
|
||||
int reps;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
output_formats output_format_stderr;
|
||||
};
|
||||
|
||||
static const cmd_params cmd_params_defaults = {
|
||||
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
||||
/* n_prompt */ {512},
|
||||
/* n_gen */ {128},
|
||||
/* n_pg */ {},
|
||||
/* n_batch */ {2048},
|
||||
/* n_ubatch */ {512},
|
||||
/* type_k */ {GGML_TYPE_F16},
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {cpu_get_num_math()},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* rpc_servers */ {""},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* flash_attn */ {false},
|
||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* use_mmap */ {true},
|
||||
/* embeddings */ {false},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN
|
||||
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
||||
/* n_prompt */ {512},
|
||||
/* n_gen */ {128},
|
||||
/* n_pg */ {},
|
||||
/* n_batch */ {2048},
|
||||
/* n_ubatch */ {512},
|
||||
/* type_k */ {GGML_TYPE_F16},
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {cpu_get_num_math()},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* rpc_servers */ {""},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* flash_attn */ {false},
|
||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* use_mmap */ {true},
|
||||
/* embeddings */ {false},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN,
|
||||
/* output_format_stderr */ NONE,
|
||||
};
|
||||
|
||||
static void print_usage(int /* argc */, char ** argv) {
|
||||
@@ -243,6 +249,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
printf("\n");
|
||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||
@@ -284,6 +291,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
|
||||
params.verbose = cmd_params_defaults.verbose;
|
||||
params.output_format = cmd_params_defaults.output_format;
|
||||
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
||||
params.reps = cmd_params_defaults.reps;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
@@ -300,28 +308,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<std::string>(argv[i], split_delim);
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
params.model.insert(params.model.end(), p.begin(), p.end());
|
||||
} else if (arg == "-p" || arg == "--n-prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<int>(argv[i], split_delim);
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
|
||||
} else if (arg == "-n" || arg == "--n-gen") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<int>(argv[i], split_delim);
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
|
||||
} else if (arg == "-pg") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<std::string>(argv[i], ',');
|
||||
auto p = string_split<std::string>(argv[i], ',');
|
||||
if (p.size() != 2) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
@@ -332,21 +340,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<int>(argv[i], split_delim);
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ub" || arg == "--ubatch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<int>(argv[i], split_delim);
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<std::string>(argv[i], split_delim);
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
std::vector<ggml_type> types;
|
||||
for (const auto & t : p) {
|
||||
ggml_type gt = ggml_type_from_name(t);
|
||||
@@ -362,7 +370,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<std::string>(argv[i], split_delim);
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
std::vector<ggml_type> types;
|
||||
for (const auto & t : p) {
|
||||
ggml_type gt = ggml_type_from_name(t);
|
||||
@@ -378,14 +386,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<int>(argv[i], split_delim);
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<int>(argv[i], split_delim);
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
||||
} else if (arg == "-rpc" || arg == "--rpc") {
|
||||
if (++i >= argc) {
|
||||
@@ -398,7 +406,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<std::string>(argv[i], split_delim);
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
std::vector<llama_split_mode> modes;
|
||||
for (const auto & m : p) {
|
||||
llama_split_mode mode;
|
||||
@@ -420,13 +428,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.main_gpu = split<int>(argv[i], split_delim);
|
||||
params.main_gpu = string_split<int>(argv[i], split_delim);
|
||||
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
||||
} else if (arg == "--numa") {
|
||||
if (++i >= argc) {
|
||||
@@ -444,28 +452,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
|
||||
} else if (arg == "-mmp" || arg == "--mmap") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
|
||||
} else if (arg == "-embd" || arg == "--embeddings") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ts" || arg == "--tensor-split") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
for (auto ts : split<std::string>(argv[i], split_delim)) {
|
||||
for (auto ts : string_split<std::string>(argv[i], split_delim)) {
|
||||
// split string by ; and /
|
||||
const std::regex regex{R"([;/]+)"};
|
||||
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
|
||||
@@ -493,18 +501,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (argv[i] == std::string("csv")) {
|
||||
params.output_format = CSV;
|
||||
} else if (argv[i] == std::string("json")) {
|
||||
params.output_format = JSON;
|
||||
} else if (argv[i] == std::string("md")) {
|
||||
params.output_format = MARKDOWN;
|
||||
} else if (argv[i] == std::string("sql")) {
|
||||
params.output_format = SQL;
|
||||
} else {
|
||||
invalid_param = !output_format_from_str(argv[i], params.output_format);
|
||||
} else if (arg == "-oe" || arg == "--output-err") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
|
||||
} else if (arg == "-v" || arg == "--verbose") {
|
||||
params.verbose = true;
|
||||
} else {
|
||||
@@ -706,7 +709,6 @@ struct test {
|
||||
static const std::string build_commit;
|
||||
static const int build_number;
|
||||
static const bool cuda;
|
||||
static const bool opencl;
|
||||
static const bool vulkan;
|
||||
static const bool kompute;
|
||||
static const bool metal;
|
||||
@@ -795,9 +797,6 @@ struct test {
|
||||
if (cuda) {
|
||||
return GGML_CUDA_NAME;
|
||||
}
|
||||
if (opencl) {
|
||||
return "OpenCL";
|
||||
}
|
||||
if (vulkan) {
|
||||
return "Vulkan";
|
||||
}
|
||||
@@ -826,7 +825,7 @@ struct test {
|
||||
static const std::vector<std::string> & get_fields() {
|
||||
static const std::vector<std::string> fields = {
|
||||
"build_commit", "build_number",
|
||||
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
|
||||
"cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_ubatch",
|
||||
@@ -852,7 +851,7 @@ struct test {
|
||||
field == "avg_ns" || field == "stddev_ns") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
||||
return BOOL;
|
||||
@@ -881,7 +880,7 @@ struct test {
|
||||
}
|
||||
std::vector<std::string> values = {
|
||||
build_commit, std::to_string(build_number),
|
||||
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
|
||||
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
|
||||
std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
@@ -910,7 +909,6 @@ struct test {
|
||||
const std::string test::build_commit = LLAMA_COMMIT;
|
||||
const int test::build_number = LLAMA_BUILD_NUMBER;
|
||||
const bool test::cuda = !!ggml_cpu_has_cuda();
|
||||
const bool test::opencl = !!ggml_cpu_has_clblast();
|
||||
const bool test::vulkan = !!ggml_cpu_has_vulkan();
|
||||
const bool test::kompute = !!ggml_cpu_has_kompute();
|
||||
const bool test::metal = !!ggml_cpu_has_metal();
|
||||
@@ -1278,6 +1276,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
|
||||
(void) user_data;
|
||||
}
|
||||
|
||||
static std::unique_ptr<printer> create_printer(output_formats format) {
|
||||
switch (format) {
|
||||
case NONE:
|
||||
return nullptr;
|
||||
case CSV:
|
||||
return std::unique_ptr<printer>(new csv_printer());
|
||||
case JSON:
|
||||
return std::unique_ptr<printer>(new json_printer());
|
||||
case MARKDOWN:
|
||||
return std::unique_ptr<printer>(new markdown_printer());
|
||||
case SQL:
|
||||
return std::unique_ptr<printer>(new sql_printer());
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// try to set locale for unicode characters in markdown
|
||||
setlocale(LC_CTYPE, ".UTF-8");
|
||||
@@ -1304,26 +1318,18 @@ int main(int argc, char ** argv) {
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// initialize printer
|
||||
std::unique_ptr<printer> p;
|
||||
switch (params.output_format) {
|
||||
case CSV:
|
||||
p.reset(new csv_printer());
|
||||
break;
|
||||
case JSON:
|
||||
p.reset(new json_printer());
|
||||
break;
|
||||
case MARKDOWN:
|
||||
p.reset(new markdown_printer());
|
||||
break;
|
||||
case SQL:
|
||||
p.reset(new sql_printer());
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
exit(1);
|
||||
std::unique_ptr<printer> p = create_printer(params.output_format);
|
||||
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
||||
|
||||
if (p) {
|
||||
p->fout = stdout;
|
||||
p->print_header(params);
|
||||
}
|
||||
|
||||
if (p_err) {
|
||||
p_err->fout = stderr;
|
||||
p_err->print_header(params);
|
||||
}
|
||||
p->fout = stdout;
|
||||
p->print_header(params);
|
||||
|
||||
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
|
||||
|
||||
@@ -1381,7 +1387,15 @@ int main(int argc, char ** argv) {
|
||||
t.samples_ns.push_back(t_ns);
|
||||
}
|
||||
|
||||
p->print_test(t);
|
||||
if (p) {
|
||||
p->print_test(t);
|
||||
fflush(p->fout);
|
||||
}
|
||||
|
||||
if (p_err) {
|
||||
p_err->print_test(t);
|
||||
fflush(p_err->fout);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
@@ -1390,7 +1404,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_free_model(lmodel);
|
||||
|
||||
p->print_footer();
|
||||
if (p) {
|
||||
p->print_footer();
|
||||
}
|
||||
|
||||
if (p_err) {
|
||||
p_err->print_footer();
|
||||
}
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -112,9 +112,12 @@ struct llava_context {
|
||||
struct llama_model * model = NULL;
|
||||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
LOG_TEE("\n example usage:\n");
|
||||
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
|
||||
@@ -278,7 +281,7 @@ int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
show_additional_info(argc, argv);
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -290,8 +293,7 @@ int main(int argc, char ** argv) {
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
show_additional_info(argc, argv);
|
||||
print_usage(argc, argv, {});
|
||||
return 1;
|
||||
}
|
||||
auto model = llava_init(¶ms);
|
||||
|
||||
@@ -37,7 +37,8 @@ struct ngram_container {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -14,8 +14,10 @@ int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
@@ -16,6 +16,7 @@ int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,16 +8,14 @@ Because this example is "outside of the source tree", it is important to first b
|
||||
|
||||
### Considerations
|
||||
|
||||
When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
|
||||
When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
|
||||
|
||||
### Build llama.cpp and install to C:\LlamaCPP directory
|
||||
|
||||
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
|
||||
|
||||
```cmd
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix C:/LlamaCPP
|
||||
```
|
||||
@@ -27,7 +25,7 @@ cmake --install build --prefix C:/LlamaCPP
|
||||
|
||||
```cmd
|
||||
cd ..\examples\main-cmake-pkg
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix C:/MyLlamaApp
|
||||
```
|
||||
|
||||
@@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can
|
||||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
|
||||
./main -m models/7B/ggml-model.bin --ignore-eos -n -1
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
||||
```powershell
|
||||
main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
|
||||
main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
|
||||
```
|
||||
|
||||
## Common Options
|
||||
@@ -80,7 +80,6 @@ The `main` program provides several ways to interact with the LLaMA models using
|
||||
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
|
||||
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
|
||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
||||
- `--random-prompt`: Start with a randomized prompt.
|
||||
|
||||
## Interaction
|
||||
|
||||
|
||||
+8
-61
@@ -122,8 +122,10 @@ int main(int argc, char ** argv) {
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
@@ -180,9 +182,6 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
@@ -250,11 +249,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG("tokenize the prompt\n");
|
||||
if (params.chatml) {
|
||||
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
|
||||
}
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
||||
} else {
|
||||
LOG("use session tokens\n");
|
||||
@@ -332,37 +328,13 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
|
||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
} else {
|
||||
params.n_keep += add_bos; // always keep the BOS token
|
||||
}
|
||||
|
||||
// prefix & suffix for instruct mode
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
|
||||
|
||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||
|
||||
// chatml prefix & suffix
|
||||
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
|
||||
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
|
||||
|
||||
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
|
||||
LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
|
||||
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.emplace_back("### Instruction:\n\n");
|
||||
}
|
||||
// similar for chatml mode
|
||||
else if (params.chatml) {
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.emplace_back("<|im_start|>user\n");
|
||||
}
|
||||
else if (params.conversation) {
|
||||
if (params.conversation) {
|
||||
params.interactive_first = true;
|
||||
}
|
||||
|
||||
@@ -823,15 +795,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
} else if (params.instruct || params.chatml) {
|
||||
is_interacting = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting) {
|
||||
LOG("waiting for user input\n");
|
||||
|
||||
if (params.conversation || params.instruct || params.chatml) {
|
||||
if (params.conversation) {
|
||||
printf("\n> ");
|
||||
}
|
||||
|
||||
@@ -874,24 +844,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
// instruct mode: insert instruction prefix
|
||||
if (params.instruct && !is_antiprompt) {
|
||||
LOG("inserting instruction prefix\n");
|
||||
n_consumed = embd_inp.size();
|
||||
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
||||
}
|
||||
// chatml mode: insert user chat prefix
|
||||
if (params.chatml && !is_antiprompt) {
|
||||
LOG("inserting chatml prefix\n");
|
||||
n_consumed = embd_inp.size();
|
||||
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
|
||||
}
|
||||
if (params.escape) {
|
||||
string_process_escapes(buffer);
|
||||
}
|
||||
|
||||
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
|
||||
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||
@@ -900,17 +858,6 @@ int main(int argc, char ** argv) {
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
|
||||
|
||||
// instruct mode: insert response suffix
|
||||
if (params.instruct) {
|
||||
LOG("inserting instruction suffix\n");
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
}
|
||||
// chatml mode: insert assistant chat suffix
|
||||
if (params.chatml) {
|
||||
LOG("inserting chatml suffix\n");
|
||||
embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
|
||||
}
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
@@ -935,7 +882,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
|
||||
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
|
||||
LOG_TEE(" [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -100,7 +100,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,5 +8,5 @@ See the following PRs for more info:
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
|
||||
make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
|
||||
```
|
||||
|
||||
@@ -6,46 +6,32 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
|
||||
return 1 ;
|
||||
params.n_junk = 250;
|
||||
params.n_keep = 32;
|
||||
params.i_pos = -1;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int seed = -1;
|
||||
srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
|
||||
|
||||
int n_junk = 250; // number of times to repeat the junk text
|
||||
int n_keep = 32; // number of tokens in the prompt prefix
|
||||
int n_grp = 1; // if more than 1 - perform LongLM SelfExtend
|
||||
int i_pos = -1; // position of the passkey in the junk text
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
||||
if (argc >= 3) {
|
||||
n_junk = std::stoi(argv[2]);
|
||||
}
|
||||
|
||||
if (argc >= 4) {
|
||||
n_grp = std::stoi(argv[3]);
|
||||
}
|
||||
|
||||
if (argc >= 5) {
|
||||
i_pos = std::stoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
seed = std::stoi(argv[5]);
|
||||
}
|
||||
|
||||
if (seed == -1) {
|
||||
seed = time(NULL);
|
||||
}
|
||||
|
||||
srand(seed);
|
||||
int n_junk = params.n_junk;
|
||||
int n_keep = params.n_keep;
|
||||
int n_grp = params.grp_attn_n;
|
||||
int i_pos = params.i_pos;
|
||||
|
||||
if (i_pos == -1) {
|
||||
i_pos = rand() % n_junk;
|
||||
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
model_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
llama_model_params model_params = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// initialize the context
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
|
||||
|
||||
ctx_params.seed = seed;
|
||||
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
|
||||
ctx_params.n_batch = 512;
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
|
||||
|
||||
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
|
||||
|
||||
@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("prompt tokens: %d\n", n_tokens_all);
|
||||
//LOG_TEE("prompt: %s\n", params.prompt.c_str());
|
||||
|
||||
llama_batch batch = llama_batch_init(512, 0, 1);
|
||||
llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
|
||||
|
||||
int n_past = 0;
|
||||
|
||||
|
||||
@@ -1032,7 +1032,7 @@ struct winogrande_entry {
|
||||
std::vector<llama_token> seq_tokens[2];
|
||||
};
|
||||
|
||||
static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
|
||||
static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
|
||||
std::vector<winogrande_entry> result;
|
||||
std::istringstream in(prompt);
|
||||
std::string line;
|
||||
@@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
params.n_ctx = 512;
|
||||
params.logits_all = true;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.logits_all = true;
|
||||
|
||||
const int32_t n_ctx = params.n_ctx;
|
||||
|
||||
if (n_ctx <= 0) {
|
||||
@@ -2006,9 +2008,6 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -2027,6 +2026,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
|
||||
@@ -47,7 +47,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 3a. Test the requanted model is loading properly
|
||||
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
@@ -57,7 +57,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 4b. Test the requanted model is loading properly
|
||||
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
|
||||
@@ -4,72 +4,12 @@
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
|
||||
struct retrieval_params {
|
||||
std::vector<std::string> context_files; // context files to embed
|
||||
int32_t chunk_size = 64; // chunk size for context embedding
|
||||
std::string chunk_separator = "\n"; // chunk separator for context embedding
|
||||
};
|
||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
|
||||
gpt_params_print_usage(argc, argv, gpt_params);
|
||||
printf("retrieval options:\n");
|
||||
printf(" --context-file FNAME file containing context to embed.\n");
|
||||
printf(" specify multiple files by providing --context-file option multiple times.\n");
|
||||
printf(" --chunk-size N minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
|
||||
printf(" --chunk-separator STRING\n");
|
||||
printf(" string to separate chunks (default: \"\\n\")\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
|
||||
int i = 1;
|
||||
std::string arg;
|
||||
while (i < argc) {
|
||||
arg = argv[i];
|
||||
bool invalid_gpt_param = false;
|
||||
if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
|
||||
if (invalid_gpt_param) {
|
||||
fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
|
||||
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
|
||||
exit(1);
|
||||
}
|
||||
// option was parsed by gpt_params_find_arg
|
||||
} else if (arg == "--context-file") {
|
||||
if (++i >= argc) {
|
||||
fprintf(stderr, "error: missing argument for --context-file\n");
|
||||
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
|
||||
exit(1);
|
||||
}
|
||||
std::ifstream file(argv[i]);
|
||||
if (!file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
|
||||
exit(1);
|
||||
}
|
||||
// store the external file name in params
|
||||
retrieval_params.context_files.push_back(argv[i]);
|
||||
} else if (arg == "--chunk-size") {
|
||||
if (++i >= argc) {
|
||||
fprintf(stderr, "error: missing argument for --chunk-size\n");
|
||||
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
|
||||
exit(1);
|
||||
}
|
||||
retrieval_params.chunk_size = std::stoi(argv[i]);
|
||||
} else if (arg == "--chunk-separator") {
|
||||
if (++i >= argc) {
|
||||
fprintf(stderr, "error: missing argument for --chunk-separator\n");
|
||||
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
|
||||
exit(1);
|
||||
}
|
||||
retrieval_params.chunk_separator = argv[i];
|
||||
} else {
|
||||
// unknown argument
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
|
||||
exit(1);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
struct chunk {
|
||||
@@ -171,33 +111,35 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
retrieval_params retrieval_params;
|
||||
|
||||
retrieval_params_parse(argc, argv, params, retrieval_params);
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// For BERT models, batch size must be equal to ubatch size
|
||||
params.n_ubatch = params.n_batch;
|
||||
params.embedding = true;
|
||||
|
||||
if (retrieval_params.chunk_size <= 0) {
|
||||
if (params.chunk_size <= 0) {
|
||||
fprintf(stderr, "chunk_size must be positive\n");
|
||||
return 1;
|
||||
}
|
||||
if (retrieval_params.context_files.empty()) {
|
||||
if (params.context_files.empty()) {
|
||||
fprintf(stderr, "context_files must be specified\n");
|
||||
return 1;
|
||||
}
|
||||
params.embedding = true;
|
||||
|
||||
print_build_info();
|
||||
|
||||
printf("processing files:\n");
|
||||
for (auto & context_file : retrieval_params.context_files) {
|
||||
for (auto & context_file : params.context_files) {
|
||||
printf("%s\n", context_file.c_str());
|
||||
}
|
||||
|
||||
std::vector<chunk> chunks;
|
||||
for (auto & context_file : retrieval_params.context_files) {
|
||||
std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator);
|
||||
for (auto & context_file : params.context_files) {
|
||||
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
|
||||
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
|
||||
}
|
||||
printf("Number of chunks: %ld\n", chunks.size());
|
||||
@@ -242,7 +184,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
// add eos if not present
|
||||
if (inp.empty() || inp.back() != llama_token_eos(model)) {
|
||||
if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) {
|
||||
inp.push_back(llama_token_eos(model));
|
||||
}
|
||||
chunk.tokens = inp;
|
||||
|
||||
@@ -11,6 +11,7 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = "The quick brown fox";
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
+64
-636
@@ -123,29 +123,6 @@ struct slot_params {
|
||||
json input_suffix;
|
||||
};
|
||||
|
||||
struct server_params {
|
||||
int32_t port = 8080;
|
||||
int32_t read_timeout = 600;
|
||||
int32_t write_timeout = 600;
|
||||
int32_t n_threads_http = -1;
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = "";
|
||||
std::string chat_template = "";
|
||||
std::string system_prompt = "";
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
std::string ssl_key_file = "";
|
||||
std::string ssl_cert_file = "";
|
||||
#endif
|
||||
|
||||
bool slots_endpoint = true;
|
||||
bool metrics_endpoint = false;
|
||||
std::string slot_save_path;
|
||||
};
|
||||
|
||||
struct server_slot {
|
||||
int id;
|
||||
int id_task = -1;
|
||||
@@ -1261,7 +1238,7 @@ struct server_context {
|
||||
}
|
||||
|
||||
json get_formated_generation(const server_slot & slot) const {
|
||||
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
||||
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
||||
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
||||
|
||||
std::vector<std::string> samplers_sequence;
|
||||
@@ -2334,561 +2311,6 @@ struct server_context {
|
||||
}
|
||||
};
|
||||
|
||||
static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
|
||||
printf("usage: %s [options]\n", argv0);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-scaling {none,linear,yarn}\n");
|
||||
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
|
||||
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
|
||||
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
|
||||
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
|
||||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||
printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
|
||||
printf(" -dt N, --defrag-thold N\n");
|
||||
printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
|
||||
printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
|
||||
printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch);
|
||||
if (llama_supports_mlock()) {
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_supports_mmap()) {
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
|
||||
printf(" - distribute: spread execution evenly over all nodes\n");
|
||||
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
|
||||
printf(" - numactl: use the CPU map provided my numactl\n");
|
||||
if (llama_supports_gpu_offload()) {
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||
printf(" how to split the model across multiple GPUs, one of:\n");
|
||||
printf(" - none: use one GPU only\n");
|
||||
printf(" - layer (default): split layers and KV across GPUs\n");
|
||||
printf(" - row: split rows across GPUs\n");
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row)\n");
|
||||
printf(" -nkvo, --no-kv-offload\n");
|
||||
printf(" disable KV offload\n");
|
||||
}
|
||||
printf(" -m FNAME, --model FNAME\n");
|
||||
printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
|
||||
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
|
||||
printf(" model download url (default: unused)\n");
|
||||
printf(" -hfr REPO, --hf-repo REPO\n");
|
||||
printf(" Hugging Face model repository (default: unused)\n");
|
||||
printf(" -hff FILE, --hf-file FILE\n");
|
||||
printf(" Hugging Face model file (default: unused)\n");
|
||||
printf(" -a ALIAS, --alias ALIAS\n");
|
||||
printf(" set an alias for the model, will be added as `model` field in completion response\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
printf(" --rpc SERVERS comma separated list of RPC servers\n");
|
||||
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
|
||||
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
|
||||
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
|
||||
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
printf(" --ssl-key-file FNAME path to file a PEM-encoded SSL private key\n");
|
||||
printf(" --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate\n");
|
||||
#endif
|
||||
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
|
||||
printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
|
||||
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
|
||||
printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
|
||||
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
||||
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" -ctk TYPE, --cache-type-k TYPE\n");
|
||||
printf(" KV cache data type for K (default: f16)\n");
|
||||
printf(" -ctv TYPE, --cache-type-v TYPE\n");
|
||||
printf(" KV cache data type for V (default: f16)\n");
|
||||
printf(" --log-format log output format: json or text (default: json)\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
||||
printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
|
||||
printf(" --slot-save-path PATH path to save slot kv cache (default: disabled)\n");
|
||||
printf("\n");
|
||||
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
|
||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
|
||||
printf(" --chat-template JINJA_TEMPLATE\n");
|
||||
printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
|
||||
printf(" only commonly used templates are accepted:\n");
|
||||
printf(" https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
|
||||
gpt_params default_params;
|
||||
server_params default_sparams;
|
||||
|
||||
std::string arg;
|
||||
bool invalid_param = false;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
if (arg == "--port") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.port = std::stoi(argv[i]);
|
||||
} else if (arg == "--rpc") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rpc_servers = argv[i];
|
||||
} else if (arg == "--host") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.hostname = argv[i];
|
||||
} else if (arg == "--path") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.public_path = argv[i];
|
||||
} else if (arg == "--api-key") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.api_keys.push_back(argv[i]);
|
||||
} else if (arg == "--api-key-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::ifstream key_file(argv[i]);
|
||||
if (!key_file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string key;
|
||||
while (std::getline(key_file, key)) {
|
||||
if (key.size() > 0) {
|
||||
sparams.api_keys.push_back(key);
|
||||
}
|
||||
}
|
||||
key_file.close();
|
||||
|
||||
}
|
||||
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
else if (arg == "--ssl-key-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.ssl_key_file = argv[i];
|
||||
} else if (arg == "--ssl-cert-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.ssl_cert_file = argv[i];
|
||||
}
|
||||
#endif
|
||||
else if (arg == "--timeout" || arg == "-to") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.read_timeout = std::stoi(argv[i]);
|
||||
sparams.write_timeout = std::stoi(argv[i]);
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
} else if (arg == "-mu" || arg == "--model-url") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model_url = argv[i];
|
||||
} else if (arg == "-hfr" || arg == "--hf-repo") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.hf_repo = argv[i];
|
||||
} else if (arg == "-hff" || arg == "--hf-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.hf_file = argv[i];
|
||||
} else if (arg == "-a" || arg == "--alias") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model_alias = argv[i];
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
server_print_usage(argv[0], default_params, default_sparams);
|
||||
exit(0);
|
||||
} else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "--rope-scaling") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
||||
else { invalid_param = true; break; }
|
||||
} else if (arg == "--rope-freq-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_base = std::stof(argv[i]);
|
||||
} else if (arg == "--rope-freq-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--yarn-ext-factor") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_ext_factor = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--yarn-attn-factor") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_attn_factor = std::stof(argv[i]);
|
||||
} else if (arg == "--yarn-beta-fast") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_beta_fast = std::stof(argv[i]);
|
||||
} else if (arg == "--yarn-beta-slow") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_beta_slow = std::stof(argv[i]);
|
||||
} else if (arg == "--pooling") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
||||
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
|
||||
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
|
||||
else { invalid_param = true; break; }
|
||||
} else if (arg == "--defrag-thold" || arg == "-dt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.defrag_thold = std::stof(argv[i]);
|
||||
} else if (arg == "--threads" || arg == "-t") {
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
} else if (arg == "--grp-attn-n" || arg == "-gan") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
|
||||
params.grp_attn_n = std::stoi(argv[i]);
|
||||
} else if (arg == "--grp-attn-w" || arg == "-gaw") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
|
||||
params.grp_attn_w = std::stoi(argv[i]);
|
||||
} else if (arg == "--threads-batch" || arg == "-tb") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_batch = std::stoi(argv[i]);
|
||||
} else if (arg == "--threads-http") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.n_threads_http = std::stoi(argv[i]);
|
||||
} else if (arg == "-b" || arg == "--batch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_batch = std::stoi(argv[i]);
|
||||
} else if (arg == "-ub" || arg == "--ubatch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_ubatch = std::stoi(argv[i]);
|
||||
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (llama_supports_gpu_offload()) {
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
} else {
|
||||
LOG_WARNING(
|
||||
"Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
|
||||
"See main README.md for information on enabling GPU BLAS support",
|
||||
{{"n_gpu_layers", params.n_gpu_layers}});
|
||||
}
|
||||
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
||||
params.no_kv_offload = true;
|
||||
} else if (arg == "--split-mode" || arg == "-sm") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string arg_next = argv[i];
|
||||
if (arg_next == "none") {
|
||||
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
||||
} else if (arg_next == "layer") {
|
||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (arg_next == "row") {
|
||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifndef GGML_USE_CUDA
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUDA
|
||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
|
||||
std::string arg_next = argv[i];
|
||||
|
||||
// split string by , and /
|
||||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= llama_max_devices());
|
||||
|
||||
for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
|
||||
if (i_device < split_arg.size()) {
|
||||
params.tensor_split[i_device] = std::stof(split_arg[i_device]);
|
||||
} else {
|
||||
params.tensor_split[i_device] = 0.0f;
|
||||
}
|
||||
}
|
||||
#else
|
||||
LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
|
||||
#endif // GGML_USE_CUDA
|
||||
} else if (arg == "--main-gpu" || arg == "-mg") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#else
|
||||
LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
|
||||
#endif
|
||||
} else if (arg == "--lora") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-scaled") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
const char * lora_adapter = argv[i];
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_base = argv[i];
|
||||
} else if (arg == "-v" || arg == "--verbose") {
|
||||
#if SERVER_VERBOSE != 1
|
||||
LOG_WARNING("server.cpp is not built with verbose logging.", {});
|
||||
#else
|
||||
server_verbose = true;
|
||||
#endif
|
||||
} else if (arg == "--mlock") {
|
||||
params.use_mlock = true;
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--numa") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
} else {
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
||||
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
||||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
} else if (arg == "--embedding" || arg == "--embeddings") {
|
||||
params.embedding = true;
|
||||
} else if (arg == "-cb" || arg == "--cont-batching") {
|
||||
params.cont_batching = true;
|
||||
} else if (arg == "-fa" || arg == "--flash-attn") {
|
||||
params.flash_attn = true;
|
||||
} else if (arg == "-np" || arg == "--parallel") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_parallel = std::stoi(argv[i]);
|
||||
} else if (arg == "-n" || arg == "--n-predict") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_predict = std::stoi(argv[i]);
|
||||
} else if (arg == "-spf" || arg == "--system-prompt-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::ifstream file(argv[i]);
|
||||
if (!file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string system_prompt;
|
||||
std::copy(
|
||||
std::istreambuf_iterator<char>(file),
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(system_prompt)
|
||||
);
|
||||
sparams.system_prompt = system_prompt;
|
||||
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
||||
params.cache_type_k = argv[++i];
|
||||
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
||||
params.cache_type_v = argv[++i];
|
||||
} else if (arg == "--log-format") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (std::strcmp(argv[i], "json") == 0) {
|
||||
server_log_json = true;
|
||||
} else if (std::strcmp(argv[i], "text") == 0) {
|
||||
server_log_json = false;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else if (arg == "--log-disable") {
|
||||
log_set_target(stdout);
|
||||
LOG_INFO("logging to file is disabled.", {});
|
||||
} else if (arg == "--slots-endpoint-disable") {
|
||||
sparams.slots_endpoint = false;
|
||||
} else if (arg == "--metrics") {
|
||||
sparams.metrics_endpoint = true;
|
||||
} else if (arg == "--slot-save-path") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.slot_save_path = argv[i];
|
||||
// if doesn't end with DIRECTORY_SEPARATOR, add it
|
||||
if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
|
||||
sparams.slot_save_path += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
} else if (arg == "--chat-template") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (!verify_custom_template(argv[i])) {
|
||||
fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
|
||||
fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.chat_template = argv[i];
|
||||
} else if (arg == "--override-kv") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
|
||||
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
server_print_usage(argv[0], default_params, default_sparams);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
gpt_params_handle_model_default(params);
|
||||
|
||||
if (!params.kv_overrides.empty()) {
|
||||
params.kv_overrides.emplace_back();
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
server_print_usage(argv[0], default_params, default_sparams);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
|
||||
// skip GH copilot requests when using default port
|
||||
if (req.path == "/v1/health" || req.path == "/v1/completions") {
|
||||
@@ -2929,16 +2351,22 @@ int main(int argc, char ** argv) {
|
||||
log_disable();
|
||||
#endif
|
||||
// own arguments required by this example
|
||||
gpt_params params;
|
||||
server_params sparams;
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// TODO: not great to use extern vars
|
||||
server_log_json = params.log_json;
|
||||
server_verbose = params.verbose;
|
||||
|
||||
// struct that contains llama context and inference
|
||||
server_context ctx_server;
|
||||
|
||||
server_params_parse(argc, argv, sparams, params);
|
||||
|
||||
if (!sparams.system_prompt.empty()) {
|
||||
ctx_server.system_prompt_set(sparams.system_prompt);
|
||||
if (!params.system_prompt.empty()) {
|
||||
ctx_server.system_prompt_set(params.system_prompt);
|
||||
}
|
||||
|
||||
if (params.model_alias == "unknown") {
|
||||
@@ -2962,10 +2390,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::unique_ptr<httplib::Server> svr;
|
||||
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
if (sparams.ssl_key_file != "" && sparams.ssl_cert_file != "") {
|
||||
LOG_INFO("Running with SSL", {{"key", sparams.ssl_key_file}, {"cert", sparams.ssl_cert_file}});
|
||||
if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
|
||||
LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}});
|
||||
svr.reset(
|
||||
new httplib::SSLServer(sparams.ssl_cert_file.c_str(), sparams.ssl_key_file.c_str())
|
||||
new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
|
||||
);
|
||||
} else {
|
||||
LOG_INFO("Running without SSL", {});
|
||||
@@ -3019,24 +2447,24 @@ int main(int argc, char ** argv) {
|
||||
});
|
||||
|
||||
// set timeouts and change hostname and port
|
||||
svr->set_read_timeout (sparams.read_timeout);
|
||||
svr->set_write_timeout(sparams.write_timeout);
|
||||
svr->set_read_timeout (params.timeout_read);
|
||||
svr->set_write_timeout(params.timeout_write);
|
||||
|
||||
if (!svr->bind_to_port(sparams.hostname, sparams.port)) {
|
||||
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||
if (!svr->bind_to_port(params.hostname, params.port)) {
|
||||
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port);
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::unordered_map<std::string, std::string> log_data;
|
||||
|
||||
log_data["hostname"] = sparams.hostname;
|
||||
log_data["port"] = std::to_string(sparams.port);
|
||||
log_data["hostname"] = params.hostname;
|
||||
log_data["port"] = std::to_string(params.port);
|
||||
|
||||
if (sparams.api_keys.size() == 1) {
|
||||
auto key = sparams.api_keys[0];
|
||||
if (params.api_keys.size() == 1) {
|
||||
auto key = params.api_keys[0];
|
||||
log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0));
|
||||
} else if (sparams.api_keys.size() > 1) {
|
||||
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
|
||||
} else if (params.api_keys.size() > 1) {
|
||||
log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
|
||||
}
|
||||
|
||||
// load the model
|
||||
@@ -3053,10 +2481,10 @@ int main(int argc, char ** argv) {
|
||||
const auto model_meta = ctx_server.model_meta();
|
||||
|
||||
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
|
||||
if (sparams.chat_template.empty()) {
|
||||
if (params.chat_template.empty()) {
|
||||
if (!ctx_server.validate_model_chat_template()) {
|
||||
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
|
||||
sparams.chat_template = "chatml";
|
||||
params.chat_template = "chatml";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3068,11 +2496,11 @@ int main(int argc, char ** argv) {
|
||||
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
|
||||
chat.push_back({{"role", "user"}, {"content", "How are you?"}});
|
||||
|
||||
const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);
|
||||
const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
|
||||
|
||||
LOG_INFO("chat template", {
|
||||
{"chat_example", chat_example},
|
||||
{"built_in", sparams.chat_template.empty()},
|
||||
{"built_in", params.chat_template.empty()},
|
||||
});
|
||||
}
|
||||
|
||||
@@ -3080,7 +2508,7 @@ int main(int argc, char ** argv) {
|
||||
// Middlewares
|
||||
//
|
||||
|
||||
auto middleware_validate_api_key = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
// TODO: should we apply API key to all endpoints, including "/health" and "/models"?
|
||||
static const std::set<std::string> protected_endpoints = {
|
||||
"/props",
|
||||
@@ -3098,7 +2526,7 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
// If API key is not set, skip validation
|
||||
if (sparams.api_keys.empty()) {
|
||||
if (params.api_keys.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -3113,7 +2541,7 @@ int main(int argc, char ** argv) {
|
||||
std::string prefix = "Bearer ";
|
||||
if (auth_header.substr(0, prefix.size()) == prefix) {
|
||||
std::string received_api_key = auth_header.substr(prefix.size());
|
||||
if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) {
|
||||
if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) {
|
||||
return true; // API key is valid
|
||||
}
|
||||
}
|
||||
@@ -3168,7 +2596,7 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
res.status = 200; // HTTP OK
|
||||
if (sparams.slots_endpoint && req.has_param("include_slots")) {
|
||||
if (params.endpoint_slots && req.has_param("include_slots")) {
|
||||
health["slots"] = result.data.at("slots");
|
||||
}
|
||||
|
||||
@@ -3194,7 +2622,7 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
|
||||
if (!sparams.slots_endpoint) {
|
||||
if (!params.endpoint_slots) {
|
||||
res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
@@ -3218,7 +2646,7 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
|
||||
if (!sparams.metrics_endpoint) {
|
||||
if (!params.endpoint_metrics) {
|
||||
res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
@@ -3318,14 +2746,14 @@ int main(int argc, char ** argv) {
|
||||
res.status = 200; // HTTP OK
|
||||
};
|
||||
|
||||
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||
const auto handle_slots_save = [&ctx_server, &res_error, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||
json request_data = json::parse(req.body);
|
||||
std::string filename = request_data.at("filename");
|
||||
if (!fs_validate_filename(filename)) {
|
||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
std::string filepath = sparams.slot_save_path + filename;
|
||||
std::string filepath = params.slot_save_path + filename;
|
||||
|
||||
server_task task;
|
||||
task.type = SERVER_TASK_TYPE_SLOT_SAVE;
|
||||
@@ -3348,14 +2776,14 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
};
|
||||
|
||||
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||
const auto handle_slots_restore = [&ctx_server, &res_error, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||
json request_data = json::parse(req.body);
|
||||
std::string filename = request_data.at("filename");
|
||||
if (!fs_validate_filename(filename)) {
|
||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
std::string filepath = sparams.slot_save_path + filename;
|
||||
std::string filepath = params.slot_save_path + filename;
|
||||
|
||||
server_task task;
|
||||
task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
|
||||
@@ -3530,9 +2958,9 @@ int main(int argc, char ** argv) {
|
||||
res.set_content(models.dump(), "application/json; charset=utf-8");
|
||||
};
|
||||
|
||||
const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template);
|
||||
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
|
||||
|
||||
const int id_task = ctx_server.queue_tasks.get_new_id();
|
||||
|
||||
@@ -3757,29 +3185,29 @@ int main(int argc, char ** argv) {
|
||||
//
|
||||
|
||||
// register static assets routes
|
||||
if (!sparams.public_path.empty()) {
|
||||
if (!params.public_path.empty()) {
|
||||
// Set the base directory for serving static files
|
||||
svr->set_base_dir(sparams.public_path);
|
||||
svr->set_base_dir(params.public_path);
|
||||
}
|
||||
|
||||
// using embedded static files
|
||||
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
|
||||
json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
|
||||
|
||||
// add new-ui files
|
||||
svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
|
||||
|
||||
// register API routes
|
||||
svr->Get ("/health", handle_health);
|
||||
@@ -3798,7 +3226,7 @@ int main(int argc, char ** argv) {
|
||||
svr->Post("/v1/embeddings", handle_embeddings);
|
||||
svr->Post("/tokenize", handle_tokenize);
|
||||
svr->Post("/detokenize", handle_detokenize);
|
||||
if (!sparams.slot_save_path.empty()) {
|
||||
if (!params.slot_save_path.empty()) {
|
||||
// only enable slot endpoints if slot_save_path is set
|
||||
svr->Post("/slots/:id_slot", handle_slots_action);
|
||||
}
|
||||
@@ -3806,12 +3234,12 @@ int main(int argc, char ** argv) {
|
||||
//
|
||||
// Start the server
|
||||
//
|
||||
if (sparams.n_threads_http < 1) {
|
||||
if (params.n_threads_http < 1) {
|
||||
// +2 threads for monitoring endpoints
|
||||
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
|
||||
params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
|
||||
}
|
||||
log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
|
||||
svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
|
||||
log_data["n_threads_http"] = std::to_string(params.n_threads_http);
|
||||
svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); };
|
||||
|
||||
LOG_INFO("HTTP server listening", log_data);
|
||||
|
||||
|
||||
@@ -116,13 +116,6 @@ static inline void server_log(const char * level, const char * function, int lin
|
||||
// chat template utils
|
||||
//
|
||||
|
||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
inline bool verify_custom_template(const std::string & tmpl) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
return res >= 0;
|
||||
}
|
||||
|
||||
// Format given chat. If tmpl is empty, we take the template from model metadata
|
||||
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
|
||||
size_t alloc_size = 0;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.
|
||||
|
||||
```bash
|
||||
./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
|
||||
./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
|
||||
|
||||
...
|
||||
|
||||
|
||||
+21
-29
@@ -6,28 +6,27 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
|
||||
return 1 ;
|
||||
}
|
||||
params.prompt = "Hello my name is";
|
||||
params.n_predict = 32;
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
||||
if (argc >= 3) {
|
||||
params.prompt = argv[2];
|
||||
}
|
||||
|
||||
if (params.prompt.empty()) {
|
||||
params.prompt = "Hello my name is";
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// total length of the sequence including the prompt
|
||||
const int n_len = 32;
|
||||
const int n_predict = params.n_predict;
|
||||
|
||||
// init LLM
|
||||
|
||||
@@ -36,9 +35,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
llama_model_params model_params = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
@@ -49,12 +46,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// initialize the context
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = 2048;
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
@@ -69,14 +61,14 @@ int main(int argc, char ** argv) {
|
||||
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
|
||||
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
|
||||
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
|
||||
|
||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||
if (n_kv_req > n_ctx) {
|
||||
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
|
||||
LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
|
||||
LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -115,7 +107,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
while (n_cur <= n_len) {
|
||||
while (n_cur <= n_predict) {
|
||||
// sample the next token
|
||||
{
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
@@ -134,7 +126,7 @@ int main(int argc, char ** argv) {
|
||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||
LOG_TEE("\n");
|
||||
|
||||
break;
|
||||
|
||||
@@ -27,7 +27,8 @@ struct seq_draft {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -159,7 +159,6 @@
|
||||
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
|
||||
}
|
||||
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
||||
opencl = config.packages.default.override { useOpenCL = true; };
|
||||
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
|
||||
|
||||
mpi-cpu = config.packages.default.override { useMpi = true; };
|
||||
|
||||
+2
-4
@@ -2702,10 +2702,8 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
|
||||
if (cuda_graph_update_required) {
|
||||
// Extract nodes from graph
|
||||
if (cuda_ctx->cuda_graph->num_nodes == 0) {
|
||||
// First call with null argument gets number of nodes in graph
|
||||
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
|
||||
}
|
||||
// First call with null argument gets number of nodes in graph
|
||||
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
|
||||
// Subsequent call with non-null argument gets nodes
|
||||
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
|
||||
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
|
||||
|
||||
+1
-1
@@ -1,7 +1,7 @@
|
||||
// An interface allowing to compute ggml_cgraph with Metal
|
||||
//
|
||||
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
|
||||
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
|
||||
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
|
||||
//
|
||||
// How it works?
|
||||
//
|
||||
|
||||
-2305
File diff suppressed because it is too large
Load Diff
@@ -1,36 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
GGML_API void ggml_cl_init(void);
|
||||
|
||||
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
|
||||
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
// GGML_API void * ggml_cl_host_malloc(size_t size);
|
||||
// GGML_API void ggml_cl_host_free(void * ptr);
|
||||
|
||||
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
||||
|
||||
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
|
||||
// backend API
|
||||
|
||||
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
|
||||
|
||||
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
|
||||
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -297,17 +297,12 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
#include <Accelerate/Accelerate.h>
|
||||
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
#elif defined(GGML_USE_OPENBLAS)
|
||||
#if defined(GGML_BLAS_USE_MKL)
|
||||
#include <mkl.h>
|
||||
#else
|
||||
#include <cblas.h>
|
||||
#endif
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
// floating point type used to accumulate sums
|
||||
@@ -3380,10 +3375,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
ggml_cl_init();
|
||||
#endif
|
||||
|
||||
ggml_setup_op_has_task_pass();
|
||||
|
||||
is_first_call = false;
|
||||
@@ -9053,17 +9044,6 @@ static void ggml_compute_forward_add_f32(
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
#ifdef GGML_USE_CLBLAST
|
||||
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
// TODO: OpenCL kernel support full broadcast
|
||||
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
||||
if (ith == 0) {
|
||||
ggml_cl_add(src0, src1, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
@@ -10171,17 +10151,6 @@ static void ggml_compute_forward_mul_f32(
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
// TODO: OpenCL kernel support full broadcast
|
||||
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
||||
if (ith == 0) {
|
||||
ggml_cl_mul(src0, src1, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const int64_t nr = ggml_nrows(src0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
@@ -12417,15 +12386,6 @@ static void ggml_compute_forward_mul_mat(
|
||||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
||||
const int64_t ne_plane = ne01*ne00;
|
||||
@@ -12873,8 +12833,6 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
// TODO: #if defined(GGML_USE_CLBLAST)
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
bool use_blas = ggml_is_matrix(src0) &&
|
||||
ggml_is_matrix(src1) &&
|
||||
@@ -13072,7 +13030,7 @@ static void ggml_compute_forward_out_prod_q_f32(
|
||||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
if (ith != 0) {
|
||||
@@ -19546,11 +19504,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
{
|
||||
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
||||
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
|
||||
} else
|
||||
#endif
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
||||
if (node->src[0]->type != GGML_TYPE_F32) {
|
||||
@@ -22859,7 +22812,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
||||
}
|
||||
|
||||
int ggml_cpu_has_blas(void) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
@@ -22874,14 +22827,6 @@ int ggml_cpu_has_cuda(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_clblast(void) {
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_vulkan(void) {
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
return 1;
|
||||
@@ -22915,8 +22860,7 @@ int ggml_cpu_has_rpc(void) {
|
||||
}
|
||||
|
||||
int ggml_cpu_has_gpublas(void) {
|
||||
return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
|
||||
ggml_cpu_has_sycl();
|
||||
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sse3(void) {
|
||||
|
||||
@@ -2425,7 +2425,6 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
||||
GGML_API int ggml_cpu_has_blas (void);
|
||||
GGML_API int ggml_cpu_has_cuda (void);
|
||||
GGML_API int ggml_cpu_has_clblast (void);
|
||||
GGML_API int ggml_cpu_has_vulkan (void);
|
||||
GGML_API int ggml_cpu_has_kompute (void);
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
|
||||
@@ -13,8 +13,6 @@
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
# include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
# include "ggml-opencl.h"
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
# include "ggml-vulkan.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
@@ -110,7 +108,7 @@
|
||||
//
|
||||
|
||||
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
||||
static void llama_log_internal (ggml_log_level level, const char* format, ...);
|
||||
static void llama_log_internal (ggml_log_level level, const char * format, ...);
|
||||
static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||
@@ -2149,12 +2147,12 @@ struct llama_control_vector {
|
||||
struct llama_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
using ttype = llama_token_type;
|
||||
using tattr = llama_token_attr;
|
||||
|
||||
struct token_data {
|
||||
token text;
|
||||
float score;
|
||||
ttype type;
|
||||
tattr attr;
|
||||
};
|
||||
|
||||
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
||||
@@ -2406,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
||||
buft = ggml_backend_vk_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(gpu);
|
||||
if (buft == nullptr) {
|
||||
@@ -2530,10 +2526,6 @@ static bool llama_kv_cache_init(
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CLBLAST
|
||||
offload = false;
|
||||
#endif
|
||||
|
||||
// count used buffer types
|
||||
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
||||
if (offload) {
|
||||
@@ -4750,7 +4742,20 @@ static void llm_load_vocab(
|
||||
auto & token_data = vocab.id_to_token[i];
|
||||
token_data.text = std::move(word);
|
||||
token_data.score = scores ? scores[i] : 0.0f;
|
||||
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
|
||||
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
|
||||
|
||||
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
|
||||
switch(toktypes[i]) {
|
||||
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
|
||||
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
|
||||
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
|
||||
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
|
||||
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
|
||||
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
|
||||
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
||||
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
|
||||
|
||||
@@ -4841,7 +4846,7 @@ static void llm_load_vocab(
|
||||
// build special tokens cache
|
||||
{
|
||||
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
||||
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
|
||||
if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
|
||||
vocab.cache_special_tokens.push_back(id);
|
||||
}
|
||||
}
|
||||
@@ -4871,6 +4876,59 @@ static void llm_load_vocab(
|
||||
|
||||
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
// Handle per token attributes
|
||||
//NOTE: Each model customizes per token attributes.
|
||||
//NOTE: Per token attributes are missing from the GGUF file.
|
||||
//TODO: Extract attributes from GGUF file.
|
||||
{
|
||||
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
|
||||
for (auto substr : substrs) {
|
||||
if (str.find(substr) < std::string::npos) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
|
||||
uint32_t current = vocab.id_to_token.at(id).attr;
|
||||
current = value ? (current | attr) : (current & ~attr);
|
||||
vocab.id_to_token[id].attr = (llama_token_attr) current;
|
||||
};
|
||||
|
||||
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
|
||||
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
|
||||
};
|
||||
|
||||
std::string model_name;
|
||||
std::string tokenizer_pre;
|
||||
|
||||
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
|
||||
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
||||
|
||||
// model name to lowercase
|
||||
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
|
||||
[] (const std::string::value_type x) {
|
||||
return std::tolower(x);
|
||||
}
|
||||
);
|
||||
|
||||
// set attributes by model/tokenizer name
|
||||
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
|
||||
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
||||
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
||||
for (auto id : vocab.cache_special_tokens) {
|
||||
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
||||
}
|
||||
for (auto token : {"</s>"}) {
|
||||
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
||||
}
|
||||
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
|
||||
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
@@ -12620,27 +12678,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
||||
|
||||
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
|
||||
}
|
||||
|
||||
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
|
||||
}
|
||||
|
||||
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
|
||||
}
|
||||
|
||||
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
|
||||
}
|
||||
|
||||
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
|
||||
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
|
||||
}
|
||||
|
||||
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
||||
@@ -13258,7 +13316,8 @@ struct fragment_buffer_variant {
|
||||
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
||||
// for each special token
|
||||
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
||||
const auto & special_token = vocab.id_to_token[special_id].text;
|
||||
const auto & data = vocab.id_to_token[special_id];
|
||||
const auto & special_token = data.text;
|
||||
|
||||
// for each text fragment
|
||||
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
||||
@@ -13295,13 +13354,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
||||
if (match > raw_text_base_offset) {
|
||||
// left
|
||||
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
||||
const int64_t left_reminder_length = match - raw_text_base_offset;
|
||||
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
||||
int64_t left_reminder_length = match - raw_text_base_offset;
|
||||
|
||||
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
|
||||
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
|
||||
left_reminder_length--;
|
||||
}
|
||||
}
|
||||
|
||||
if (left_reminder_length > 0) {
|
||||
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
||||
it++;
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
||||
#endif
|
||||
it++;
|
||||
}
|
||||
|
||||
// special token
|
||||
@@ -13310,16 +13378,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
||||
|
||||
// right
|
||||
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
||||
const int64_t right_reminder_offset = match + special_token.length();
|
||||
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
||||
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
||||
int64_t right_reminder_offset = match + special_token.length();
|
||||
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
||||
|
||||
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
|
||||
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
|
||||
right_reminder_offset++;
|
||||
right_reminder_length--;
|
||||
}
|
||||
}
|
||||
|
||||
if (right_reminder_length > 0) {
|
||||
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
||||
it++;
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
||||
#endif
|
||||
|
||||
it++;
|
||||
|
||||
if (source == 0) {
|
||||
buffer.erase_after(buffer.before_begin());
|
||||
} else {
|
||||
@@ -13365,9 +13442,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
||||
// tokenizer.encode('', add_special_tokens=False) returns []
|
||||
|
||||
static const bool rtrim = true; //TODO: as param
|
||||
bool is_prev_special = false;
|
||||
bool special_token_rtrim = false;
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
@@ -13377,25 +13452,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
||||
|
||||
// TODO: It's likely possible to get rid of this string copy entirely
|
||||
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
||||
// and passing 'add space prefix' as bool argument
|
||||
//
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
|
||||
if (special_token_rtrim) {
|
||||
size_t num_whitespaces = 0;
|
||||
while (isspace(raw_text[num_whitespaces])) {
|
||||
num_whitespaces++;
|
||||
}
|
||||
if (num_whitespaces == raw_text.size()) {
|
||||
continue; // skip if all whitespaces
|
||||
}
|
||||
raw_text = raw_text.substr(num_whitespaces);
|
||||
}
|
||||
|
||||
if (vocab.add_space_prefix) {
|
||||
if (!output.size() || is_prev_special) { // prefix with space if first token
|
||||
raw_text = " " + raw_text;
|
||||
@@ -13411,11 +13469,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||
output.push_back(fragment.token);
|
||||
is_prev_special = true;
|
||||
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
||||
special_token_rtrim = rtrim
|
||||
&& fragment.token != vocab.special_bos_id
|
||||
&& fragment.token != vocab.special_unk_id
|
||||
&& fragment.token != vocab.special_eos_id;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14650,260 +14703,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
|
||||
//
|
||||
// Beam search
|
||||
//
|
||||
|
||||
struct llama_beam {
|
||||
std::vector<llama_token> tokens;
|
||||
float p; // Cumulative beam probability (renormalized relative to all beams)
|
||||
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
||||
// Sort beams by probability. In case of ties, prefer beams at eob.
|
||||
bool operator<(const llama_beam & rhs) const {
|
||||
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
||||
}
|
||||
// Shift off first n tokens and discard them.
|
||||
void shift_tokens(const size_t n) {
|
||||
if (n) {
|
||||
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
||||
tokens.resize(tokens.size() - n);
|
||||
}
|
||||
}
|
||||
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
||||
};
|
||||
|
||||
// A struct for calculating logit-related info.
|
||||
struct llama_logit_info {
|
||||
const float * const logits;
|
||||
const int n_vocab;
|
||||
const float max_l;
|
||||
const float normalizer;
|
||||
struct sum_exp {
|
||||
float max_l;
|
||||
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
||||
};
|
||||
llama_logit_info(llama_context * ctx)
|
||||
: logits(llama_get_logits(ctx))
|
||||
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
|
||||
, max_l(*std::max_element(logits, logits + n_vocab))
|
||||
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
||||
{ }
|
||||
llama_token_data get_token_data(const llama_token token_id) const {
|
||||
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
||||
return {token_id, logits[token_id], p};
|
||||
}
|
||||
// Return top k token_data by logit.
|
||||
std::vector<llama_token_data> top_k(size_t k) {
|
||||
std::vector<llama_token_data> min_heap; // min-heap by logit
|
||||
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
||||
min_heap.reserve(k_min);
|
||||
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
||||
min_heap.push_back(get_token_data(token_id));
|
||||
}
|
||||
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
||||
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
||||
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
||||
if (min_heap.front().logit < logits[token_id]) {
|
||||
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
||||
min_heap.back().id = token_id;
|
||||
min_heap.back().logit = logits[token_id];
|
||||
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
||||
}
|
||||
}
|
||||
return min_heap;
|
||||
}
|
||||
float probability_from_logit(float logit) const {
|
||||
return normalizer * std::exp(logit - max_l);
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_beam_search_data {
|
||||
llama_context * ctx;
|
||||
size_t n_beams;
|
||||
int n_past;
|
||||
int n_predict;
|
||||
std::vector<llama_beam> beams;
|
||||
std::vector<llama_beam> next_beams;
|
||||
|
||||
// Re-calculated on each loop iteration
|
||||
size_t common_prefix_length;
|
||||
|
||||
// Used to communicate to/from callback on beams state.
|
||||
std::vector<llama_beam_view> beam_views;
|
||||
|
||||
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
|
||||
: ctx(ctx)
|
||||
, n_beams(n_beams)
|
||||
, n_past(n_past)
|
||||
, n_predict(n_predict)
|
||||
, beam_views(n_beams) {
|
||||
beams.reserve(n_beams);
|
||||
next_beams.reserve(n_beams);
|
||||
}
|
||||
|
||||
// Collapse beams to a single beam given by index.
|
||||
void collapse_beams(const size_t beam_idx) {
|
||||
if (0u < beam_idx) {
|
||||
std::swap(beams[0], beams[beam_idx]);
|
||||
}
|
||||
beams.resize(1);
|
||||
}
|
||||
|
||||
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
||||
// The repetitive patterns below reflect the 2 stages of heaps:
|
||||
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
||||
// * If the heap is full and a new element is found that should be included, pop the
|
||||
// least element to the back(), replace it with the new, then push it into the heap.
|
||||
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
||||
// Min-heaps use a greater-than comparator.
|
||||
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
||||
if (beam.eob) {
|
||||
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
||||
if (next_beams.size() < n_beams) {
|
||||
next_beams.push_back(std::move(beam));
|
||||
if (next_beams.size() == n_beams) {
|
||||
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
||||
}
|
||||
} else if (next_beams.front().p < beam.p) {
|
||||
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
||||
next_beams.back() = std::move(beam);
|
||||
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
||||
}
|
||||
} else {
|
||||
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
||||
if (!beam.tokens.empty()) {
|
||||
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
|
||||
}
|
||||
llama_logit_info logit_info(ctx);
|
||||
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
||||
|
||||
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
||||
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
size_t i=0;
|
||||
if (next_beams.size() < n_beams) {
|
||||
for (; next_beams.size() < n_beams ; ++i) {
|
||||
llama_beam next_beam = beam;
|
||||
next_beam.tokens.push_back(next_tokens[i].id);
|
||||
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
||||
next_beams.push_back(std::move(next_beam));
|
||||
}
|
||||
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
||||
} else {
|
||||
for (; next_beams.front().p == 0.0f ; ++i) {
|
||||
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
||||
next_beams.back() = beam;
|
||||
next_beams.back().tokens.push_back(next_tokens[i].id);
|
||||
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
||||
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
||||
}
|
||||
}
|
||||
for (; i < n_beams ; ++i) {
|
||||
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
||||
if (next_beams.front().p < next_p) {
|
||||
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
||||
next_beams.back() = beam;
|
||||
next_beams.back().tokens.push_back(next_tokens[i].id);
|
||||
next_beams.back().p = next_p;
|
||||
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find common_prefix_length based on beams.
|
||||
// Requires beams is not empty.
|
||||
size_t find_common_prefix_length() {
|
||||
size_t common_prefix_length = beams[0].tokens.size();
|
||||
for (size_t i = 1 ; i < beams.size() ; ++i) {
|
||||
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
|
||||
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
|
||||
if (beams[0].tokens[j] != beams[i].tokens[j]) {
|
||||
common_prefix_length = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return common_prefix_length;
|
||||
}
|
||||
|
||||
// Construct beams_state to send back to caller via the callback function.
|
||||
// Side effect: set common_prefix_length = find_common_prefix_length();
|
||||
llama_beams_state get_beams_state(const bool last_call) {
|
||||
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
||||
beam_views[i] = beams[i].view();
|
||||
}
|
||||
common_prefix_length = find_common_prefix_length();
|
||||
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
|
||||
}
|
||||
|
||||
// Loop:
|
||||
// * while i < n_predict, AND
|
||||
// * any of the beams have not yet reached end-of-beam (eob), AND
|
||||
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
|
||||
// (since all other beam probabilities can only decrease)
|
||||
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
|
||||
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
|
||||
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
|
||||
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
|
||||
!beams[top_beam_index()].eob ; ++i) {
|
||||
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
||||
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
||||
if (common_prefix_length) {
|
||||
llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
|
||||
n_past += common_prefix_length;
|
||||
}
|
||||
// Zero-out next_beam probabilities to place them last in following min-heap.
|
||||
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
|
||||
for (llama_beam & beam : beams) {
|
||||
beam.shift_tokens(common_prefix_length);
|
||||
fill_next_beams_by_top_probabilities(beam);
|
||||
}
|
||||
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
|
||||
beams.swap(next_beams);
|
||||
renormalize_beam_probabilities(beams);
|
||||
}
|
||||
collapse_beams(top_beam_index());
|
||||
callback(callback_data, get_beams_state(true));
|
||||
}
|
||||
|
||||
// As beams grow, the cumulative probabilities decrease.
|
||||
// Renormalize them to avoid floating point underflow.
|
||||
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
|
||||
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
|
||||
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
|
||||
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
|
||||
}
|
||||
|
||||
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
|
||||
size_t top_beam_index() {
|
||||
return std::max_element(beams.begin(), beams.end()) - beams.begin();
|
||||
}
|
||||
|
||||
// Copy (p,eob) for each beam which may have been changed by the callback.
|
||||
void update_beams_from_beam_views() {
|
||||
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
||||
beams[i].p = beam_views[i].p;
|
||||
beams[i].eob = beam_views[i].eob;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void llama_beam_search(llama_context * ctx,
|
||||
llama_beam_search_callback_fn_t callback, void * callback_data,
|
||||
size_t n_beams, int n_past, int n_predict) {
|
||||
assert(ctx);
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
|
||||
|
||||
beam_search_data.loop(callback, callback_data);
|
||||
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
ctx->n_sample++;
|
||||
}
|
||||
|
||||
//
|
||||
// quantization
|
||||
//
|
||||
@@ -16114,7 +15913,7 @@ bool llama_supports_mlock(void) {
|
||||
}
|
||||
|
||||
bool llama_supports_gpu_offload(void) {
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
return true;
|
||||
@@ -18221,9 +18020,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
|
||||
return model->vocab.id_to_token[token].score;
|
||||
}
|
||||
|
||||
llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
|
||||
llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
|
||||
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return model->vocab.id_to_token[token].type;
|
||||
return model->vocab.id_to_token[token].attr;
|
||||
}
|
||||
|
||||
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
||||
|
||||
@@ -97,7 +97,7 @@ extern "C" {
|
||||
LLAMA_ROPE_TYPE_GLM = 4,
|
||||
};
|
||||
|
||||
enum llama_token_type {
|
||||
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
|
||||
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
||||
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
||||
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
|
||||
@@ -107,6 +107,20 @@ extern "C" {
|
||||
LLAMA_TOKEN_TYPE_BYTE = 6,
|
||||
};
|
||||
|
||||
enum llama_token_attr {
|
||||
LLAMA_TOKEN_ATTR_UNDEFINED = 0,
|
||||
LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0,
|
||||
LLAMA_TOKEN_ATTR_UNUSED = 1 << 1,
|
||||
LLAMA_TOKEN_ATTR_NORMAL = 1 << 2,
|
||||
LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL?
|
||||
LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
|
||||
LLAMA_TOKEN_ATTR_BYTE = 1 << 5,
|
||||
LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6,
|
||||
LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7,
|
||||
LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8,
|
||||
LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9,
|
||||
};
|
||||
|
||||
// model file types
|
||||
enum llama_ftype {
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
@@ -821,7 +835,7 @@ extern "C" {
|
||||
|
||||
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
|
||||
|
||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
|
||||
LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||
@@ -1042,49 +1056,9 @@ extern "C" {
|
||||
llama_token token);
|
||||
|
||||
//
|
||||
// Beam search
|
||||
// Model split
|
||||
//
|
||||
|
||||
struct llama_beam_view {
|
||||
const llama_token * tokens;
|
||||
|
||||
size_t n_tokens;
|
||||
float p; // Cumulative beam probability (renormalized relative to all beams)
|
||||
bool eob; // Callback should set this to true when a beam is at end-of-beam.
|
||||
};
|
||||
|
||||
// Passed to beam_search_callback function.
|
||||
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
|
||||
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
|
||||
// These pointers are valid only during the synchronous callback, so should not be saved.
|
||||
struct llama_beams_state {
|
||||
struct llama_beam_view * beam_views;
|
||||
|
||||
size_t n_beams; // Number of elements in beam_views[].
|
||||
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
|
||||
bool last_call; // True iff this is the last callback invocation.
|
||||
};
|
||||
|
||||
// Type of pointer to the beam_search_callback function.
|
||||
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
|
||||
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
|
||||
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
|
||||
|
||||
/// @details Deterministically returns entire sentence constructed by a beam search.
|
||||
/// @param ctx Pointer to the llama_context.
|
||||
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
|
||||
/// @param callback_data A pointer that is simply passed back to callback.
|
||||
/// @param n_beams Number of beams to use.
|
||||
/// @param n_past Number of tokens already evaluated.
|
||||
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
|
||||
LLAMA_API void llama_beam_search(
|
||||
struct llama_context * ctx,
|
||||
llama_beam_search_callback_fn_t callback,
|
||||
void * callback_data,
|
||||
size_t n_beams,
|
||||
int32_t n_past,
|
||||
int32_t n_predict);
|
||||
|
||||
/// @details Build a split GGUF final path for this chunk.
|
||||
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
|
||||
// Returns the split_path length.
|
||||
|
||||
Binary file not shown.
@@ -5,7 +5,6 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||
set(LLAMA_BLAS @LLAMA_BLAS@)
|
||||
set(LLAMA_CUDA @LLAMA_CUDA@)
|
||||
set(LLAMA_METAL @LLAMA_METAL@)
|
||||
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
|
||||
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
|
||||
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
|
||||
|
||||
@@ -36,10 +35,6 @@ if (LLAMA_METAL)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_HIPBLAS)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
|
||||
@@ -10,16 +10,18 @@ set -x
|
||||
|
||||
bench_args="${@:3}"
|
||||
|
||||
rm -f llama-bench.sqlite
|
||||
rm -f llama-bench.sqlite > /dev/null
|
||||
|
||||
# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...)
|
||||
|
||||
git checkout $1
|
||||
make clean && make -j32 $make_opts llama-bench
|
||||
./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
|
||||
git checkout $1 > /dev/null
|
||||
make clean > /dev/null
|
||||
make -j$(nproc) $make_opts llama-bench > /dev/null
|
||||
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
|
||||
|
||||
git checkout $2
|
||||
make clean && make -j32 $make_opts llama-bench
|
||||
./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite
|
||||
git checkout $2 > /dev/null
|
||||
make clean > /dev/null
|
||||
make -j$(nproc) $make_opts llama-bench > /dev/null
|
||||
./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
|
||||
|
||||
./scripts/compare-llama-bench.py -b $1 -c $2
|
||||
|
||||
@@ -19,17 +19,17 @@ logger = logging.getLogger("compare-llama-bench")
|
||||
|
||||
# Properties by which to differentiate results per commit:
|
||||
KEY_PROPERTIES = [
|
||||
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
|
||||
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
|
||||
"blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
|
||||
"type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
|
||||
]
|
||||
|
||||
# Properties that are boolean and are converted to Yes/No for the table:
|
||||
BOOL_PROPERTIES = ["cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
|
||||
BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
|
||||
|
||||
# Header names for the table:
|
||||
PRETTY_NAMES = {
|
||||
"cuda": "CUDA", "opencl": "OpenCL", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
|
||||
"cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
|
||||
"gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
|
||||
"model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
|
||||
"n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
|
||||
|
||||
@@ -13,12 +13,12 @@ logger = logging.getLogger("run-with-preset")
|
||||
CLI_ARGS_MAIN_PERPLEXITY = [
|
||||
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
|
||||
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
|
||||
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
|
||||
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
|
||||
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
|
||||
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
|
||||
"model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
|
||||
"np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
|
||||
"prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
|
||||
"prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
|
||||
"repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
|
||||
"simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical",
|
||||
"verbose-prompt"
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
# Helper script for deploying llama.cpp server with a single Bash command
|
||||
#
|
||||
# - Works on Linux and macOS
|
||||
# - Supports: CPU, CUDA, Metal, OpenCL
|
||||
# - Supports: CPU, CUDA, Metal
|
||||
# - Can run all GGUF models from HuggingFace
|
||||
# - Can serve requests in parallel
|
||||
# - Always builds latest llama.cpp from GitHub
|
||||
@@ -19,7 +19,7 @@
|
||||
# --port: port number, default is 8888
|
||||
# --repo: path to a repo containing GGUF model files
|
||||
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
|
||||
# --backend: cpu, cuda, metal, opencl, depends on the OS
|
||||
# --backend: cpu, cuda, metal, depends on the OS
|
||||
# --gpu-id: gpu id, default is 0
|
||||
# --n-parallel: number of parallel requests, default is 8
|
||||
# --n-kv: KV cache size, default is 4096
|
||||
@@ -72,7 +72,7 @@ function print_usage {
|
||||
printf " --port: port number, default is 8888\n"
|
||||
printf " --repo: path to a repo containing GGUF model files\n"
|
||||
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
|
||||
printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
|
||||
printf " --backend: cpu, cuda, metal, depends on the OS\n"
|
||||
printf " --gpu-id: gpu id, default is 0\n"
|
||||
printf " --n-parallel: number of parallel requests, default is 8\n"
|
||||
printf " --n-kv: KV cache size, default is 4096\n"
|
||||
@@ -387,9 +387,6 @@ elif [[ "$backend" == "cpu" ]]; then
|
||||
elif [[ "$backend" == "metal" ]]; then
|
||||
printf "[+] Building with Metal backend\n"
|
||||
make -j server $log
|
||||
elif [[ "$backend" == "opencl" ]]; then
|
||||
printf "[+] Building with OpenCL backend\n"
|
||||
LLAMA_CLBLAST=1 make -j server $log
|
||||
else
|
||||
printf "[-] Unknown backend: %s\n" "$backend"
|
||||
exit 1
|
||||
@@ -407,8 +404,6 @@ elif [[ "$backend" == "cpu" ]]; then
|
||||
args="-ngl 0"
|
||||
elif [[ "$backend" == "metal" ]]; then
|
||||
args="-ngl 999"
|
||||
elif [[ "$backend" == "opencl" ]]; then
|
||||
args="-ngl 999"
|
||||
else
|
||||
printf "[-] Unknown backend: %s\n" "$backend"
|
||||
exit 1
|
||||
|
||||
@@ -106,8 +106,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
# src/ggml-kompute.h -> ggml-kompute.h
|
||||
# src/ggml-metal.h -> ggml-metal.h
|
||||
# src/ggml-metal.m -> ggml-metal.m
|
||||
# src/ggml-opencl.cpp -> ggml-opencl.cpp
|
||||
# src/ggml-opencl.h -> ggml-opencl.h
|
||||
# src/ggml-quants.c -> ggml-quants.c
|
||||
# src/ggml-quants.h -> ggml-quants.h
|
||||
# src/ggml-rpc.cpp -> ggml-rpc.cpp
|
||||
@@ -143,8 +141,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
-e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
|
||||
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
|
||||
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
|
||||
-e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
|
||||
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
|
||||
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
|
||||
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
|
||||
-e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
|
||||
|
||||
@@ -14,8 +14,6 @@ cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h
|
||||
cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
|
||||
cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
|
||||
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
|
||||
cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
|
||||
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
||||
cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
|
||||
cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
|
||||
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp
|
||||
|
||||
@@ -156,17 +156,39 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
|
||||
'<s>a', # Phi-3 fail
|
||||
'<unk><|endoftext|><s>', # Phi-3 fail
|
||||
'a\na', # TODO: Bert fail
|
||||
'a </s> b', # rstrip phi-3
|
||||
'a <mask> b', # lstrip jina-v2
|
||||
]
|
||||
|
||||
|
||||
def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
|
||||
special_tokens = set(tokenizer.all_special_tokens)
|
||||
special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
|
||||
special_tokens = list(sorted(special_tokens))
|
||||
def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
|
||||
"""Brute force check all vocab words"""
|
||||
yield from vocab
|
||||
|
||||
|
||||
def generator_added_lr_strip(tokenizer) -> Iterator[str]:
|
||||
WHITESPACES = ["", " ", " ", " "]
|
||||
special_tokens = list(tokenizer.all_special_tokens)
|
||||
added_tokens = list(tokenizer.added_tokens_encoder)
|
||||
all_tokens = list(sorted(set(special_tokens + added_tokens)))
|
||||
for token in all_tokens:
|
||||
for lstrip in WHITESPACES:
|
||||
for rstrip in WHITESPACES:
|
||||
yield lstrip + token + rstrip
|
||||
yield "a" + lstrip + token + rstrip
|
||||
yield lstrip + token + rstrip + "z"
|
||||
yield "a" + lstrip + token + rstrip + "z"
|
||||
|
||||
|
||||
def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]:
|
||||
special_tokens = list(tokenizer.all_special_tokens)
|
||||
added_tokens = list(tokenizer.added_tokens_encoder)
|
||||
separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
|
||||
all_tokens = list(sorted(set(special_tokens + added_tokens + separations)))
|
||||
rand = random.Random()
|
||||
for m in range(iterations):
|
||||
rand.seed(m)
|
||||
words = rand.choices(special_tokens, k=500)
|
||||
words = rand.choices(all_tokens, k=500)
|
||||
if words[0] == tokenizer.bos_token: # skip spam warning of double BOS
|
||||
while len(words) > 1 and words[1] == tokenizer.bos_token: # leave one starting BOS
|
||||
words.pop(0)
|
||||
@@ -175,11 +197,6 @@ def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
|
||||
yield "".join(words)
|
||||
|
||||
|
||||
def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
|
||||
"""Brute force check all vocab words"""
|
||||
yield from vocab
|
||||
|
||||
|
||||
def generator_random_chars(iterations=100) -> Iterator[str]:
|
||||
"""Brute force random text with simple characters"""
|
||||
|
||||
@@ -274,8 +291,8 @@ def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, g
|
||||
ids2 = func_tokenize2(text)
|
||||
if ids1 != ids2:
|
||||
i = find_first_mismatch(ids1, ids2)
|
||||
ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
|
||||
ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
|
||||
ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
|
||||
ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
|
||||
logger.info(" TokenIDs: " + str(ids1))
|
||||
logger.info(" Expected: " + str(ids2))
|
||||
raise Exception()
|
||||
@@ -309,8 +326,9 @@ def main(argv: list[str] = None):
|
||||
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer, 10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
|
||||
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
|
||||
@@ -322,14 +340,14 @@ def main(argv: list[str] = None):
|
||||
if __name__ == "__main__":
|
||||
# main()
|
||||
|
||||
path_tokenizers = "./models/tokenizers/"
|
||||
path_tokenizers = "./models/tokenizers/"
|
||||
path_vocab_format = "./models/ggml-vocab-%s.gguf"
|
||||
|
||||
# import os
|
||||
# tokenizers = os.listdir(path_tokenizers)
|
||||
tokenizers = [
|
||||
# "llama-spm", # SPM
|
||||
# "phi-3", # SPM
|
||||
"llama-spm", # SPM
|
||||
"phi-3", # SPM
|
||||
"jina-v2-en", # WPM
|
||||
"bert-bge", # WPM
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user