Compare commits

...

24 Commits

Author SHA1 Message Date
lhez 73e53dc834 opencl: ref count ggml_backend_opencl_context and refactor profiling (#14254)
* Move profiling info into `ggml_backend_opencl_context`
* Add `enqueue_ndrange_kernel` to launch kernel
2025-06-24 11:46:25 -07:00
Georgi Gerganov 62af464227 batch : fix check for empty sequences in memory (#14364)
* batch : fix check for empty sequences in memory

ggml-ci

* cont : reuse the var

ggml-ci
2025-06-24 18:26:30 +03:00
Mathieu Baudier c148cf1946 cmake : use LLAMA_BUILD_NUMBER when defining LLAMA_INSTALL_VERSION (#14362) 2025-06-24 15:05:31 +02:00
Nigel Bosch 1b809cee22 server : move no API key doc to /health (#14352) 2025-06-24 10:59:11 +02:00
Sigbjørn Skjæret abf241045d main : honor --verbose-prompt on interactive prompts (#14350) 2025-06-24 09:31:00 +02:00
Bartowski 901e20bbe5 jinja : Add Mistral-Small-3.2-24B-Instruct-2506.jinja (#14349)
This will allow the use of tools on the llama-server
2025-06-24 09:17:58 +03:00
uvos 0142961a2e CUDA/HIP: optimize mmv paths taken for HIP devices (#14324)
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-06-24 01:12:56 +02:00
bandoti ce82bd0117 ci: add workflow for relocatable cmake package (#14346) 2025-06-23 15:30:51 -03:00
Jeff Bolz bf2a99e3cb vulkan: update windows SDK in release.yml (#14344) 2025-06-23 15:44:48 +02:00
Molly Sophia 72c6bc3f3d llama : better rwkv chat template and add missing inputs.use_jinja setting (#14336)
* llama-cli : add missing `inputs.use_jinja` setting

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

* llama : better legacy chat template for rwkv

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

---------

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
2025-06-23 19:56:19 +08:00
Johannes Gäßler defe2158dd CUDA: mul_mat_v support for batch sizes > 1 (#14262)
* CUDA: mul_mat_v support for batch sizes > 1

* use 64 bit math for initial offset calculation
2025-06-23 13:11:31 +02:00
Georgi Gerganov 7b50d589a8 kv-cells : fix tracking of seq_pos (#14339)
* kv-cells : fix tracking of seq_pos during cache reuse

ggml-ci

* cont : improve error message

ggml-ci

* cont : add more comments
2025-06-23 12:27:35 +03:00
Jeff Bolz 3a9457df96 vulkan: update windows SDK in CI (#14334) 2025-06-23 10:19:24 +02:00
Ed Addario fa4a9f2a1c quantize : handle user-defined pruning of whole layers (blocks) (#13037) 2025-06-22 23:16:26 +02:00
Sigbjørn Skjæret 238005c2dc gguf-py : fix SpecialVocab parsing when post_processor is null (#14330) 2025-06-22 19:46:17 +02:00
Ruikai Peng 66aba7aca9 run : avoid double tokenization (#14327)
* run : avoid double tokenization by adopting common_tokenize heuristic

* build : fix windows gcc and clang warnings

* lint : fixed trailing whitepace

* run : fix is_first flag
2025-06-23 01:28:06 +08:00
Georgi Gerganov f1f5e82df6 examples : fix is_first logic for tokenization (#14329)
ggml-ci
2025-06-22 20:10:07 +03:00
uvos af3373f1ad HIP: enable vec fattn on RDNA4 (#14323) 2025-06-22 16:51:23 +02:00
yuiseki 5d5c066de8 mtmd : fix Pixtral OOM with large images by capping image_size to 1024 (#14326)
Mistral Small 2506 models using Pixtral vision encoder were running out
of GPU memory when processing images larger than 1024x1024 pixels due to
exponential memory growth from unlimited image size.

This fix applies the same 1024x1024 limit used by Qwen2VL models to
prevent OOM issues while maintaining compatibility with existing models.
2025-06-22 14:44:57 +02:00
Sigbjørn Skjæret 40bfa04c95 common : use std::string_view now that we target c++17 (#14319) 2025-06-22 08:37:43 +03:00
Aman Gupta aa064b2eb7 CUDA: add mean operation (#14313)
* CUDA: add mean operation

* add back sum_rows_f32_cuda

* Review: early exit if col!=0
2025-06-22 12:39:54 +08:00
Sigbjørn Skjæret aa0ef5c578 gguf-py : fix Qwen3-Embedding eos token (#14314) 2025-06-21 18:12:05 +02:00
Markus Tavenrath bb16041cae Add support for VK_EXT_debug_utils to add labels to Vulkan objects. (#13792)
* Add support for VK_EXT_debug_utils to add labels to Vulkan objects. In step 1 compute pipelines are getting labeled.

* remove #ifdef for debug utils and add queue marker.
2025-06-21 08:17:12 +02:00
Sigbjørn Skjæret 58cba76a9a gguf-py : fix TemplateProcessing pair when bos/eos is missing (#14312) 2025-06-21 07:33:21 +02:00
31 changed files with 1171 additions and 828 deletions
+51
View File
@@ -0,0 +1,51 @@
name: Build relocatable cmake package
on:
workflow_dispatch:
workflow_call:
jobs:
linux:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y build-essential tcl
- name: Build
run: |
PREFIX="$(pwd)"/inst
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
cmake --install build --prefix "$PREFIX" --config Release
export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
tclsh <<'EOF'
set build(commit) [string trim [exec git rev-parse --short HEAD]]
set build(number) [string trim [exec git rev-list --count HEAD]]
set build(version) "0.0.$build(number)"
set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \
"set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
"set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]
puts -nonewline "Checking llama-config.cmake version... "
foreach check $checks {
if {![regexp -expanded -- $check $llamaconfig]} {
puts "\"$check\" failed!"
exit 1
}
}
puts "success."
EOF
cd examples/simple-cmake-pkg
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
cmake --build build
+40 -4
View File
@@ -5,10 +5,43 @@ on:
push:
branches:
- master
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: [
'.github/workflows/build.yml',
'.github/workflows/build-linux-cross.yml',
'.github/workflows/build-cmake-pkg.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: [
'.github/workflows/build.yml',
'.github/workflows/build-linux-cross.yml',
'.github/workflows/build-cmake-pkg.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -478,6 +511,9 @@ jobs:
build-linux-cross:
uses: ./.github/workflows/build-linux-cross.yml
build-cmake-pkg:
uses: ./.github/workflows/build-cmake-pkg.yml
macOS-latest-cmake-ios:
runs-on: macos-latest
@@ -683,7 +719,7 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.4.309.0
VULKAN_VERSION: 1.4.313.2
strategy:
matrix:
@@ -736,7 +772,7 @@ jobs:
id: get_vulkan
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+2 -2
View File
@@ -302,7 +302,7 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
VULKAN_VERSION: 1.4.309.0
VULKAN_VERSION: 1.4.313.2
strategy:
matrix:
@@ -332,7 +332,7 @@ jobs:
id: get_vulkan
if: ${{ matrix.backend == 'vulkan' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+1 -1
View File
@@ -95,7 +95,7 @@ endif()
if (NOT DEFINED LLAMA_BUILD_COMMIT)
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
endif()
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
# override ggml options
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
+3 -46
View File
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
return result;
}
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
class string_view {
const std::string & _str;
const size_t _start;
const size_t _end;
public:
string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
size_t size() const {
return _end - _start;
}
size_t length() const {
return size();
}
operator std::string() const {
return str();
}
std::string str() const {
return _str.substr(_start, _end - _start);
}
string_view substr(size_t pos, size_t len = std::string::npos) const {
return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
}
char operator[](size_t pos) const {
auto index = _start + pos;
if (index >= _end) {
throw std::out_of_range("string_view index out of range");
}
return _str[_start + pos];
}
bool operator==(const string_view & other) const {
std::string this_str = *this;
std::string other_str = other;
return this_str == other_str;
}
};
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
auto has_min = min_value != std::numeric_limits<int>::min();
auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
}
out << "}";
};
std::function<void(const string_view &, const string_view &)> uniform_range =
[&](const string_view & from, const string_view & to) {
std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
[&](const std::string_view & from, const std::string_view & to) {
size_t i = 0;
while (i < from.length() && i < to.length() && from[i] == to[i]) {
i++;
}
if (i > 0) {
out << "\"" << from.substr(0, i).str() << "\"";
out << "\"" << from.substr(0, i) << "\"";
}
if (i < from.length() && i < to.length()) {
if (i > 0) {
+1 -1
View File
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
auto generate = [&](const std::string & prompt) {
std::string response;
const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == 0;
const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == -1;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
+40 -2
View File
@@ -241,8 +241,18 @@ static bool fp16_mma_available(const int cc) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
return false;
#else
return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) {
return true;
} else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
return true;
#else
return false;
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
} else {
return false;
}
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
}
@@ -252,6 +262,14 @@ static bool fp16_mma_hardware_available(const int cc) {
GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
}
static bool bf16_mma_hardware_available(const int cc) {
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
}
static bool fp32_mma_hardware_available(const int cc) {
return GGML_CUDA_CC_IS_CDNA(cc);
}
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
static bool new_mma_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
@@ -362,6 +380,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#endif // FP16_AVAILABLE
}
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
template<bool norm>
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x;
const int col = threadIdx.x;
float sum = 0.0f;
for (int i = col; i < ncols; i += blockDim.x) {
sum += x[row * ncols + i];
}
sum = warp_reduce_sum(sum);
if (col != 0) {
return;
}
dst[row] = norm ? sum / ncols : sum;
}
template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
+18 -18
View File
@@ -37,6 +37,7 @@
#include "ggml-cuda/ssm-scan.cuh"
#include "ggml-cuda/sum.cuh"
#include "ggml-cuda/sumrows.cuh"
#include "ggml-cuda/mean.cuh"
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh"
@@ -99,8 +100,7 @@ int ggml_cuda_get_device() {
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
cudaError_t err;
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
{
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
err = cudaMallocManaged(ptr, size);
#if defined(GGML_USE_HIP)
if (err == hipSuccess) {
@@ -118,9 +118,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
err = cudaMalloc(ptr, size);
}
#endif // defined(GGML_USE_HIP)
}
else
{
} else {
err = cudaMalloc(ptr, size);
}
return err;
@@ -1945,16 +1943,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool any_gpus_with_slow_fp16 = false;
bool any_gpus_without_fp16_mma = false;
bool any_gpus_with_slow_fp16 = false;
if (split) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1965,16 +1961,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
continue;
}
const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}
} else {
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
use_mul_mat_vec = use_mul_mat_vec && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}
// debug helpers
@@ -1985,7 +1981,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
if (!split && use_mul_mat_vec) {
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
@@ -2357,6 +2353,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SUM_ROWS:
ggml_cuda_op_sum_rows(ctx, dst);
break;
case GGML_OP_MEAN:
ggml_cuda_op_mean(ctx, dst);
break;
case GGML_OP_SSM_CONV:
ggml_cuda_op_ssm_conv(ctx, dst);
break;
@@ -3260,6 +3259,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
return true;
+19
View File
@@ -0,0 +1,19 @@
#include "mean.cuh"
void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *) src0->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
}
+3
View File
@@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+257 -87
View File
@@ -2,25 +2,26 @@
#include "common.cuh"
#include "mmv.cuh"
template <typename T, typename type_acc, int block_size>
template <typename T, typename type_acc, int ncols_dst, int block_size>
static __global__ void mul_mat_vec(
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) {
const int64_t row = blockIdx.x;
const int64_t channel_dst = blockIdx.y;
const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio;
const int64_t channel_y = ids ? channel_dst % nchannels_y : channel_dst;
const int64_t sample_dst = blockIdx.z;
const int64_t sample_x = sample_dst / sample_ratio;
const int64_t sample_y = sample_dst;
const int tid = threadIdx.x;
const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
const int row = blockIdx.x;
const int channel_dst = blockIdx.y;
const int channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio;
const int channel_y = ids ? channel_dst % nchannels_y : channel_dst;
const int sample_dst = blockIdx.z;
const int sample_x = sample_dst / sample_ratio;
const int sample_y = sample_dst;
const int tid = threadIdx.x;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
y += sample_y *stride_sample_y + channel_y *stride_channel_y;
dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst;
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
const float2 * y2 = (const float2 *) y;
@@ -34,81 +35,108 @@ static __global__ void mul_mat_vec(
__syncthreads();
}
float sumf = 0.0f;
float sumf[ncols_dst] = {0.0f};
if constexpr (std::is_same<T, float>::value) {
const float2 * x2 = (const float2 *) x;
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmpx = x2[col2];
const float2 tmpy = y2[col2];
sumf += tmpx.x*tmpy.x;
sumf += tmpx.y*tmpy.y;
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2];
sumf[j] += tmpx.x*tmpy.x;
sumf[j] += tmpx.y*tmpy.y;
}
}
} else if constexpr (std::is_same<T, half>::value) {
const half2 * x2 = (const half2 *) x;
if (std::is_same<type_acc, float>::value) {
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmpx = __half22float2(x2[col2]);
const float2 tmpy = y2[col2];
sumf += tmpx.x * tmpy.x;
sumf += tmpx.y * tmpy.y;
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2];
sumf[j] += tmpx.x * tmpy.x;
sumf[j] += tmpx.y * tmpy.y;
}
}
} else {
#ifdef FP16_AVAILABLE
half2 sumh2 = make_half2(0.0f, 0.0f);
half2 sumh2[ncols_dst] = {{0.0f, 0.0f}};
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmp = y2[col2];
sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
const half2 tmpx = x2[col2];
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2];
sumh2[j] += tmpx * make_half2(tmpy.x, tmpy.y);
}
}
sumf = __low2float(sumh2) + __high2float(sumh2);
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
sumf[j] = __low2float(sumh2[j]) + __high2float(sumh2[j]);
}
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
} else if constexpr (std::is_same<T, nv_bfloat16>::value) {
const int * x2 = (const int *) x;
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
const int tmpx = x2[col2];
const float2 tmpy = y2[col2];
sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
const int tmpx = x2[col2];
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2];
sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
}
}
} else {
static_assert(std::is_same<T, void>::value, "unsupported type");
}
sumf = warp_reduce_sum<warp_size>(sumf);
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
sumf[j] = warp_reduce_sum<warp_size>(sumf[j]);
if (block_size > warp_size) {
buf_iw[tid/warp_size] = sumf;
__syncthreads();
if (tid >= warp_size) {
return;
if (block_size > warp_size) {
buf_iw[tid/warp_size] = sumf[j];
__syncthreads();
if (tid < warp_size) {
sumf[j] = buf_iw[tid];
sumf[j] = warp_reduce_sum<warp_size>(sumf[j]);
}
if (j < ncols_dst) {
__syncthreads();
}
}
sumf = buf_iw[tid];
sumf = warp_reduce_sum<warp_size>(sumf);
}
if (tid != 0) {
if (tid >= ncols_dst) {
return;
}
dst[row] = sumf;
dst[tid*stride_col_dst + row] = sumf[tid];
}
template <typename T, typename type_acc>
template <typename T, typename type_acc, int ncols_dst>
static void launch_mul_mat_vec_cuda(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t ncols, const int64_t nrows,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
GGML_ASSERT(ncols % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(ncols % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(stride_col_y % 2 == 0);
GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
const int64_t channel_ratio = nchannels_dst / nchannels_x;
@@ -138,44 +166,52 @@ static void launch_mul_mat_vec_cuda(
const dim3 block_dims(block_size_best, 1, 1);
switch (block_size_best) {
case 32: {
mul_mat_vec<T, type_acc, 32><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
mul_mat_vec<T, type_acc, ncols_dst, 32><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 64: {
mul_mat_vec<T, type_acc, 64><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
mul_mat_vec<T, type_acc, ncols_dst, 64><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 96: {
mul_mat_vec<T, type_acc, 96><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
mul_mat_vec<T, type_acc, ncols_dst, 96><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 128: {
mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
mul_mat_vec<T, type_acc, ncols_dst, 128><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 160: {
mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
mul_mat_vec<T, type_acc, ncols_dst, 160><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 192: {
mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
mul_mat_vec<T, type_acc, ncols_dst, 192><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 224: {
mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
mul_mat_vec<T, type_acc, ncols_dst, 224><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 256: {
mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y,
stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
mul_mat_vec<T, type_acc, ncols_dst, 256><<<block_nums, block_dims, smem, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
default: {
GGML_ABORT("fatal error");
@@ -183,23 +219,91 @@ static void launch_mul_mat_vec_cuda(
}
}
template <typename T, typename type_acc>
static void mul_mat_vec_cuda_switch_ncols_dst(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
switch (ncols_dst) {
case 1:
launch_mul_mat_vec_cuda<T, type_acc, 1>
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 2:
launch_mul_mat_vec_cuda<T, type_acc, 2>
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 3:
launch_mul_mat_vec_cuda<T, type_acc, 3>
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 4:
launch_mul_mat_vec_cuda<T, type_acc, 4>
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 5:
launch_mul_mat_vec_cuda<T, type_acc, 5>
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 6:
launch_mul_mat_vec_cuda<T, type_acc, 6>
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 7:
launch_mul_mat_vec_cuda<T, type_acc, 7>
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 8:
launch_mul_mat_vec_cuda<T, type_acc, 8>
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
default:
GGML_ABORT("fatal error");
break;
}
}
template<typename T>
static void mul_mat_vec_cuda(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
enum ggml_prec prec, cudaStream_t stream) {
if constexpr(std::is_same<T, half>::value) {
if (prec == GGML_PREC_DEFAULT) {
launch_mul_mat_vec_cuda<T, half>
(x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
mul_mat_vec_cuda_switch_ncols_dst<T, half>
(x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
return;
}
}
launch_mul_mat_vec_cuda<T, float>
(x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
mul_mat_vec_cuda_switch_ncols_dst<T, float>
(x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
}
@@ -246,24 +350,24 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
const int64_t stride_channel_dst = ids ? s1 : s2;
const int64_t stride_channel_y = ids ? s11 : s12;
GGML_ASSERT(ncols_dst == 1);
GGML_ASSERT(!ids || ncols_dst == 1);
switch (src0->type) {
case GGML_TYPE_F32: {
const float * src0_d = (const float *) src0->data;
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01,
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
} break;
case GGML_TYPE_F16: {
const half * src0_d = (const half *) src0->data;
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01,
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
} break;
case GGML_TYPE_BF16: {
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01,
mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
} break;
@@ -282,16 +386,19 @@ void ggml_cuda_op_mul_mat_vec(
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
GGML_ASSERT(src1_ncols == 1);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
// ggml_cuda_op provides single, contiguous matrices
const int64_t stride_row = ne00;
const int64_t stride_col_y = ne10;
const int64_t stride_col_dst = id == ctx.device ? ne0 : row_diff; // main device has larger memory buffer
const int64_t nchannels_x = 1;
const int64_t nchannels_y = 1;
const int64_t nchannels_dst = 1;
@@ -307,19 +414,19 @@ void ggml_cuda_op_mul_mat_vec(
switch (src0->type) {
case GGML_TYPE_F32: {
const float * src0_d = (const float *) src0_dd_i;
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
} break;
case GGML_TYPE_F16: {
const half * src0_d = (const half *) src0_dd_i;
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
} break;
case GGML_TYPE_BF16: {
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row,
mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
} break;
@@ -334,3 +441,66 @@ void ggml_cuda_op_mul_mat_vec(
GGML_UNUSED(src1_ncols);
GGML_UNUSED(src1_padded_row_size);
}
bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
if (src0_ne[0] % 2 != 0) {
return false;
}
switch (type) {
case GGML_TYPE_F32:
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
return ne11 <= 8;
}
if (cc >= GGML_CUDA_CC_TURING) {
return ne11 <= 4;
}
return ne11 <= 3;
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
if (fp32_mma_hardware_available(cc)) {
return ne11 <= 3;
}
return ne11 <= 8;
}
return ne11 <= 8;
case GGML_TYPE_F16:
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1);
if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
return src0_small && ne11 <= 4;
}
if (fp16_mma_hardware_available(cc)) {
return src0_small && ne11 <= 3;
}
return ne11 <= 8;
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
if (fp16_mma_hardware_available(cc)) {
if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
return ne11 <= 5;
}
return ne11 <= 2;
}
return ne11 <= 8;
}
return ne11 <= 8;
case GGML_TYPE_BF16:
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
const bool src0_small = (src0_ne[1] <= 512 || src0_ne[2]*src0_ne[3] == 1);
if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
return src0_small && ne11 <= 4;
}
if (bf16_mma_hardware_available(cc)) {
return src0_small && ne11 <= 3;
}
return ne11 <= 8;
} else if (GGML_CUDA_CC_IS_AMD(cc)) {
if (bf16_mma_hardware_available(cc)) {
return ne11 <= 3;
}
return ne11 <= 8;
}
return ne11 <= 8;
default:
return false;
}
}
+2 -3
View File
@@ -1,8 +1,5 @@
#include "common.cuh"
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
#define MMV_MAX_ROWS 512
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
void ggml_cuda_op_mul_mat_vec(
@@ -10,3 +7,5 @@ void ggml_cuda_op_mul_mat_vec(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);
bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);
+5 -18
View File
@@ -1,25 +1,9 @@
#include "sumrows.cuh"
static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x;
const int col = threadIdx.x;
float sum = 0.0f;
for (int i = col; i < ncols; i += blockDim.x) {
sum += x[row * ncols + i];
}
sum = warp_reduce_sum(sum);
if (col == 0) {
dst[row] = sum;
}
}
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
}
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -35,5 +19,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
}
-1
View File
@@ -1,5 +1,4 @@
#include "common.cuh"
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
File diff suppressed because it is too large Load Diff
+57 -1
View File
@@ -1041,6 +1041,14 @@ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
struct vk_instance_t {
vk::Instance instance;
bool debug_utils_support = false; // VK_EXT_debug_utils enabled
PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {};
PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {};
PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {};
PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {};
PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {};
PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {};
std::vector<size_t> device_indices;
vk_device devices[GGML_VK_MAX_DEVICES];
};
@@ -1180,6 +1188,14 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
}
pipeline->compiled = true;
if (vk_instance.debug_utils_support) {
vk::DebugUtilsObjectNameInfoEXT duoni;
duoni.objectType = vk::ObjectType::ePipeline;
duoni.pObjectName = pipeline->name.c_str();
duoni.objectHandle = reinterpret_cast<uint64_t>(static_cast<VkPipeline_T*>(pipeline->pipeline));
vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast<VkDebugUtilsObjectNameInfoEXT &>(duoni));
}
{
std::lock_guard<std::mutex> guard(device->mutex);
device->pipelines.insert({ pipeline->name, pipeline });
@@ -3561,6 +3577,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
static bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
static void ggml_vk_instance_init() {
if (vk_instance_initialized) {
return;
@@ -3581,7 +3599,7 @@ static void ggml_vk_instance_init() {
#ifdef __APPLE__
const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
#endif
const bool debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr;
std::vector<const char*> layers;
if (validation_ext) {
@@ -3596,6 +3614,9 @@ static void ggml_vk_instance_init() {
extensions.push_back("VK_KHR_portability_enumeration");
}
#endif
if (debug_utils_ext) {
extensions.push_back("VK_EXT_debug_utils");
}
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
#ifdef __APPLE__
if (portability_enumeration_ext) {
@@ -3619,6 +3640,18 @@ static void ggml_vk_instance_init() {
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;
if (debug_utils_ext) {
vk_instance.debug_utils_support = true;
vk_instance.pfn_vkSetDebugUtilsObjectNameEXT = (PFN_vkSetDebugUtilsObjectNameEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkSetDebugUtilsObjectNameEXT");
vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT = (PFN_vkQueueBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueBeginDebugUtilsLabelEXT");
vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT = (PFN_vkQueueEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueEndDebugUtilsLabelEXT");
vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT");
vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT");
vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT");
}
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -9656,6 +9689,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
if (vk_instance.debug_utils_support) {
vk::DebugUtilsLabelEXT dul = {};
dul.pLabelName = "ggml_backend_vk_graph_compute";
dul.color = std::array<float,4>{1.0f, 1.0f, 1.0f, 1.0f};
vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast<VkDebugUtilsLabelEXT*>(&dul));
}
uint64_t total_mat_mul_bytes = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
@@ -10345,6 +10385,22 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
UNUSED(instance_extensions);
}
// Extension availability
static bool ggml_vk_instance_debug_utils_ext_available(
const std::vector<vk::ExtensionProperties> & instance_extensions) {
// Check for portability enumeration extension for MoltenVK support
for (const auto & properties : instance_extensions) {
if (strcmp("VK_EXT_debug_utils", properties.extensionName) == 0) {
return true;
}
}
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_debug_utils not found." << std::endl;
return false;
UNUSED(instance_extensions);
}
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
switch (props.vendorID) {
case VK_VENDOR_ID_INTEL:
+72 -62
View File
@@ -167,71 +167,81 @@ class SpecialVocab:
tokenizer_config['bos_token'] = special_bos = special_cls
if not special_eos and special_sep and tokenizer_config:
tokenizer_config['eos_token'] = special_eos = special_sep
post_processor = tokenizer.get('post_processor', {})
for processor in post_processor.get('processors', [post_processor]):
if processor.get('type') == 'RobertaProcessing':
self.add_special_token['bos'] = True
self.add_special_token['eos'] = True
self.add_special_token['sep'] = True
if not special_cls and tokenizer_config:
special_cls = processor.get('cls', [special_bos])[0]
tokenizer_config['cls_token'] = special_cls
if not special_sep and tokenizer_config:
special_sep = processor.get('sep', [special_eos])[0]
tokenizer_config['sep_token'] = special_sep
continue
# Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added
# Only works with simple templates, **will** get it wrong on unusual sequences
if processor.get('type') == 'TemplateProcessing':
tmpl_single = processor.get('single', [])
tmpl_pair = processor.get('pair', [])
special_first = None
special_last = None
if len(tmpl_single) > 1:
if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
if not tokenizer_config:
special_bos = special_first
self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
if special_first not in (special_bos, special_cls):
logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
if not tokenizer_config:
special_eos = special_last
self.add_special_token['eos'] = True if special_last == special_eos else False
if special_last != special_eos:
logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
if tmpl_pair:
seq_start = 1 if tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
seq_stop = -1 if tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
if seq_start == 0 or seq_stop is None:
logger.warning('TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
if tmpl_a != 'A' or tmpl_b != 'B':
logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
# A [sep] [eos] B
if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
add_sep = False
if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
if special_entry in (special_sep, special_eos) and not special_last:
add_sep = True
if special_entry not in (special_sep, special_eos):
logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
else:
logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
if len(tmpl_pair) == 2:
if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
if special_entry in (special_sep, special_eos):
if post_processor := tokenizer.get('post_processor'):
for processor in post_processor.get('processors', [post_processor]):
if processor.get('type') == 'RobertaProcessing':
self.add_special_token['bos'] = True
self.add_special_token['eos'] = True
self.add_special_token['sep'] = True
if not special_cls and tokenizer_config:
special_cls = processor.get('cls', [special_bos])[0]
tokenizer_config['cls_token'] = special_cls
if not special_sep and tokenizer_config:
special_sep = processor.get('sep', [special_eos])[0]
tokenizer_config['sep_token'] = special_sep
continue
# Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added
# Only works with simple templates, **will** get it wrong on unusual sequences
if processor.get('type') == 'TemplateProcessing':
tmpl_single = processor.get('single', [])
tmpl_pair = processor.get('pair', [])
special_first = None
special_last = None
if len(tmpl_single) > 1:
if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
if not tokenizer_config:
special_bos = special_first
self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
if special_first not in (special_bos, special_cls):
logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
if not tokenizer_config:
special_eos = special_last
elif special_last != special_eos:
if 'eot' not in self.special_token_types:
self.special_token_types = tuple(self.special_token_types) + ('eot', )
tokenizer_config['eot_token'] = special_eos
elif 'eom' not in self.special_token_types:
self.special_token_types = tuple(self.special_token_types) + ('eom', )
tokenizer_config['eom_token'] = special_eos
else:
logger.warning(f'Overriding EOS token {special_eos!r} with {special_last!r} without EOT/EOM fallback!')
tokenizer_config['eos_token'] = special_eos = special_last
self.add_special_token['eos'] = True if special_last == special_eos else False
if special_last != special_eos:
logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
if tmpl_pair:
seq_start = 1 if special_first and tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
seq_stop = -1 if special_last and tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
if (special_first and seq_start == 0) or (special_last and seq_stop is None):
logger.warning('TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
if tmpl_a != 'A' or tmpl_b != 'B':
logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
# A [sep] [eos] B
if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
add_sep = False
if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
if special_entry in (special_sep, special_eos) and not special_last:
add_sep = True
if special_entry not in (special_sep, special_eos):
logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
else:
logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
self.add_special_token['sep'] = add_sep
if add_sep and not special_sep and tokenizer_config:
tokenizer_config['sep_token'] = special_eos
continue
logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
if len(tmpl_pair) == 2:
if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
if special_entry in (special_sep, special_eos):
add_sep = True
if special_entry not in (special_sep, special_eos):
logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
else:
logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
self.add_special_token['sep'] = add_sep
if add_sep and not special_sep and tokenizer_config:
tokenizer_config['sep_token'] = special_eos
continue
if not tokenizer_config:
return True
chat_template_alt = None
+6 -3
View File
@@ -390,6 +390,7 @@ extern "C" {
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
void * prune_layers; // pointer to vector containing layer indices to prune
} llama_model_quantize_params;
typedef struct llama_logit_bias {
@@ -943,12 +944,14 @@ extern "C" {
// Requires the context to have a memory.
// For encode-decoder contexts, processes the batch using the decoder.
// Positive return values does not mean a fatal error, but rather a warning.
// Upon non-zero return values, the memory state is restored to the state before this call
// Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context
// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
// Upon other return values, the memory state is restored to the state before this call
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// 2 - aborted
// 2 - aborted (processed ubatches will remain in the context's memory)
// -1 - invalid input batch
// < -1 - error
// < -1 - fatal error (processed ubatches will remain in the context's memory)
LLAMA_API int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch);
@@ -0,0 +1,124 @@
{%- set today = strftime_now("%Y-%m-%d") %}
{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information or when the user's request requires up-to-date or specific data, you must use the available tools to fetch the information. Do not hesitate to use tools whenever they can provide a more accurate or complete response. If no relevant tools are available, then clearly state that you don't have the information and avoid making up anything.
If the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\").
You are always very attentive to dates, and when asked about information at specific dates, you discard information that is at another date.
You follow these instructions in all languages, and always respond to the user in the language they use or request.
Next sections describe the capabilities that you have.
# WEB BROWSING INSTRUCTIONS
You cannot perform any web search or access internet to open URLs, links etc. If it seems like the user is expecting you to do so, you clarify the situation and ask the user to copy paste the text directly in the chat.
# MULTI-MODAL INSTRUCTIONS
You have the ability to read images, but you cannot generate images. You also cannot transcribe audio files or videos.
You cannot read nor transcribe audio files or videos.
# TOOL CALLING INSTRUCTIONS
You may have access to tools that you can use to fetch information or perform actions. You must use these tools in the following situations:
1. When the request requires up-to-date information.
2. When the request requires specific data that you do not have in your knowledge base.
3. When the request involves actions that you cannot perform without tools.
Always prioritize using tools to provide the most accurate and helpful response. If tools are not available, inform the user that you cannot perform the requested action at the moment." %}
{{- bos_token }}
{%- set system_prompt = default_system_message %}
{%- set loop_messages = messages %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{%- if messages|length > 0 and messages[0]['role'] == 'system' %}
{%- if messages[0]['content'] is string %}
{%- set system_prompt = messages[0]['content'] %}
{%- else %}
{%- set system_prompt = messages[0]['content'][0]['text'] %}
{%- endif %}
{%- set loop_messages = messages[1:] %}
{%- endif %}
{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}
{%- set ns = namespace(index=0) %}
{%- for message in loop_messages %}
{%- if not (message.role == "tool" or (message.get('tool_calls'))) %}
{%- if (message["role"] == "user") != (ns.index % 2 == 0) %}
{{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}
{%- endif %}
{%- set ns.index = ns.index + 1 %}
{%- endif %}
{%- endfor %}
{{- '[SYSTEM_PROMPT]' + system_prompt + '[/SYSTEM_PROMPT]' }}
{%- for message in loop_messages %}
{%- if message['role'] == 'system' %}
{%- if message['content'] is string %}
{{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}
{%- else %}
{{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}
{%- endif %}
{%- elif message['role'] == 'user' %}
{%- if tools is not none and (message == user_messages[-1]) %}
{{- '[AVAILABLE_TOOLS]' + tools|tojson + '[/AVAILABLE_TOOLS]' }}
{%- endif %}
{{- '[INST]' }}
{%- if message['content'] is string %}
{{- message['content'] }}
{%- else %}
{%- for block in message['content'] %}
{%- if block['type'] == 'text' %}
{{- block['text'] }}
{%- elif block['type'] in ['image', 'image_url'] %}
{{- '[IMG]' }}
{%- else %}
{{- raise_exception('Only text and image blocks are supported in message content!') }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- '[/INST]' }}
{%- elif message['role'] == 'assistant' %}
{%- if message.get('tool_calls') %}
{%- for tool_call in message.tool_calls %}
{{- '[TOOL_CALLS]' + tool_call.function.name }}
{%- if not tool_call.id is defined or tool_call.id is not string or tool_call.id|length != 9 %}
{{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}
{%- endif %}
{{- '[CALL_ID]' + tool_call.id }}
{{- '[ARGS]' + tool_call['function']['arguments']|tojson }}
{%- endfor %}
{{- eos_token }}
{%- elif message['content'] is string %}
{{- message['content'] + eos_token }}
{%- else %}
{%- for block in message['content'] %}
{%- if block['type'] == 'text' %}
{{- block['text'] }}
{%- elif block['type'] in ['image', 'image_url'] %}
{{- '[IMG]' }}
{%- else %}
{{- raise_exception('Only text and image blocks are supported in assistant content!') }}
{%- endif %}
{%- endfor %}
{{- eos_token }}
{%- endif %}
{%- elif message['role'] == 'tool_results' or message['role'] == 'tool' %}
{%- if message.content is defined and message.content.content is defined %}
{%- set content = message.content.content %}
{%- else %}
{%- set content = message.content %}
{%- endif %}
{%- if not message.tool_call_id is defined or message.tool_call_id is not string or message['tool_call_id']|length != 9 %}
{{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}
{%- endif %}
{{- '[TOOL_RESULTS]' + message.tool_call_id + '[TOOL_CONTENT]' + content|string + '[/TOOL_RESULTS]' }}
{%- else %}
{{- raise_exception('Only system, user, assistant, and tool roles are supported!') }}
{%- endif %}
{%- endfor %}
+20 -7
View File
@@ -244,22 +244,35 @@ bool llama_batch_allocr::init(
continue;
}
if (memory) {
const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1;
if (p0 >= 0) {
bool ok = true;
if (batch.token) {
if (seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
return false;
if (seq_pos_min(s) != p0 + 1) {
ok = false;
}
} else {
assert(batch.embd);
// for embeddings (typically used as vision input), we allow them to have repeating positions
// ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
if (seq_pos_min(s) != memory->seq_pos_max(s) && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
return false;
if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) {
ok = false;
}
}
if (!ok) {
LLAMA_LOG_ERROR(
"%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
" - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
" - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
" it is required that the sequence positions remain consecutive: Y = X + 1\n",
__func__, s, s, p0, s, seq_pos_min(s));
return false;
}
}
if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
+11 -6
View File
@@ -528,12 +528,17 @@ int32_t llm_chat_apply_template(
}
} else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
// this template requires the model to have "\n\n" as EOT token
for (auto message : chat) {
std::string role(message->role);
if (role == "user") {
ss << "User: " << message->content << "\n\nAssistant:";
} else {
ss << message->content << "\n\n";
for (size_t i = 0; i < chat.size(); i++) {
std::string role(chat[i]->role);
if (role == "system") {
ss << "System: " << trim(chat[i]->content) << "\n\n";
} else if (role == "user") {
ss << "User: " << trim(chat[i]->content) << "\n\n";
if (i == chat.size() - 1) {
ss << "Assistant:";
}
} else if (role == "assistant") {
ss << "Assistant: " << trim(chat[i]->content) << "\n\n";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
-1
View File
@@ -1018,7 +1018,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
pos_min[s] = std::numeric_limits<llama_pos>::max();
}
// TODO: fix sequence indexing
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
const auto & seq_id = ubatch.seq_id[i][0];
+33 -9
View File
@@ -7,6 +7,7 @@
#include <cassert>
#include <vector>
#include <set>
#include <map>
// meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests
@@ -164,7 +165,7 @@ public:
assert(seq_id >= 0);
seq[i].reset(seq_id);
seq_pos[seq_id].erase(pos[i]);
seq_pos_dec(seq_id, pos[i]);
if (seq[i].none()) {
pos[i] = -1;
@@ -187,7 +188,7 @@ public:
seq[i].reset();
seq[i].set(seq_id);
seq_pos[seq_id].insert(pos[i]);
seq_pos_inc(seq_id, pos[i]);
return false;
}
@@ -232,7 +233,7 @@ public:
assert(!seq[i].test(seq_id));
seq[i].set(seq_id);
seq_pos[seq_id].insert(pos[i]);
seq_pos_inc(seq_id, pos[i]);
}
// return the sequence id of this cell
@@ -259,7 +260,9 @@ public:
return -1;
}
return *seq_pos[seq_id].begin();
assert(seq_pos[seq_id].begin()->second > 0);
return seq_pos[seq_id].begin()->first;
}
// the maximum position of sequence seq_id currently present in any of the cells
@@ -272,7 +275,9 @@ public:
return -1;
}
return *seq_pos[seq_id].rbegin();
assert(seq_pos[seq_id].rbegin()->second > 0);
return seq_pos[seq_id].rbegin()->first;
}
// note: call only if the cell is not empty
@@ -389,17 +394,36 @@ private:
// the bitset seq[i] tells us which sequences are currently occupying the i-th cell
std::vector<seq_set_t> seq;
// the set seq_pos[s] tells us which positions are currently present for sequence s
// the set seq_pos[s][p] tells us how many times the position p is currently present for sequence s
// if the position p is not present, seq_pos[s][p] is not set
// this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];
//
// note that we cannot a use an std::set because in some cases a position can occur more than once for the same seq:
// - during performing a cache reuse via (rm + add)
// - some vision models have input embeddings with repeating positions
//
std::map<llama_pos, int> seq_pos[LLAMA_MAX_SEQ];
// helper functions for updating `seq_pos`, once cell at a time:
void seq_pos_dec(llama_seq_id s, llama_pos p) {
auto it = seq_pos[s].find(p);
assert(it != seq_pos[s].end());
if (--it->second == 0) {
seq_pos[s].erase(it);
}
}
void seq_pos_inc(llama_seq_id s, llama_pos p) {
seq_pos[s][p]++;
}
// remove cell i
void seq_pos_rm(uint32_t i) {
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
if (seq[i].test(s)) {
seq_pos[s].erase(pos[i]);
seq_pos_dec(s, pos[i]);
}
}
}
@@ -408,7 +432,7 @@ private:
void seq_pos_add(uint32_t i) {
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
if (seq[i].test(s)) {
seq_pos[s].insert(pos[i]);
seq_pos_inc(s, pos[i]);
}
}
}
+79 -4
View File
@@ -1,5 +1,4 @@
#include "llama-quant.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"
@@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) {
}
}
static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
if (prune.empty()) {
return orig_name;
}
static const std::regex pattern(R"(blk\.(\d+)\.)");
if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
const int blk = std::stoi(match[1]);
std::string new_name = orig_name;
if (mapped.count(blk)) {
// Already mapped, do nothing
} else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
mapped[blk] = "";
} else if (blk < prune.front()) {
mapped[blk] = std::to_string(blk);
next_id = blk + 1;
} else {
mapped[blk] = std::to_string(next_id);
++next_id;
}
return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
}
return orig_name;
}
static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
if (mapped.empty()) {
return orig_name;
}
static const std::regex pattern(R"(blk\.(\d+)\.)");
if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
const std::string blk(match[1]);
std::string new_name = orig_name;
for (const auto & p : mapped) {
if (p.second == blk) {
LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
}
}
GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
}
return orig_name;
}
struct quantize_state_impl {
const llama_model & model;
const llama_model_quantize_params * params;
@@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
const size_t align = GGUF_DEFAULT_ALIGNMENT;
gguf_context_ptr ctx_out { gguf_init_empty() };
std::vector<int> prune_list = {};
if (params->prune_layers) {
prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
}
// copy the KV pairs from the input file
gguf_set_kv (ctx_out.get(), ml.meta.get());
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
@@ -597,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
}
std::map<int, std::string> mapped;
int blk_id = 0;
int pruned_attention_w = 0;
// make a list of weights
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
tensors.reserve(ml.weights_map.size());
for (const auto & it : ml.weights_map) {
const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
if (remapped_name.empty()) {
if (it.first.find("attn_v.weight") != std::string::npos ||
it.first.find("attn_qkv.weight") != std::string::npos ||
it.first.find("attn_kv_b.weight") != std::string::npos) {
pruned_attention_w++;
}
LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
continue;
} else if (remapped_name != it.first) {
ggml_set_name(it.second.tensor, remapped_name.c_str());
LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
}
tensors.push_back(&it.second);
}
if (!prune_list.empty()) {
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
}
// keep_split requires that the weights are sorted by split index
if (params->keep_split) {
@@ -640,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
}
size_t total_size_org = 0;
@@ -681,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
for (size_t i = 0; i < ctx_outs.size(); ++i) {
gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
}
}
@@ -832,7 +906,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
const float * imatrix = nullptr;
if (imatrix_data) {
auto it = imatrix_data->find(tensor->name);
auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
if (it == imatrix_data->end()) {
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
} else {
@@ -947,6 +1021,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.tensor_type =*/ nullptr,
/*.prune_layers =*/ nullptr
};
return result;
+2
View File
@@ -4652,6 +4652,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_conv_transpose_2d({256, 256, 256, 1}, {3, 3, 16, 256}, 1));
test_cases.emplace_back(new test_mean(GGML_TYPE_F32, {256, 256, 3, 1}));
return test_cases;
}
+11 -1
View File
@@ -292,6 +292,7 @@ int main(int argc, char ** argv) {
if (!params.system_prompt.empty() || !params.prompt.empty()) {
common_chat_templates_inputs inputs;
inputs.use_jinja = g_params->use_jinja;
inputs.messages = chat_msgs;
inputs.add_generation_prompt = !params.prompt.empty();
@@ -916,10 +917,19 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
if (params.verbose_prompt) {
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size() - original_size);
}
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
const std::string token_str = common_token_to_piece(ctx, token);
output_tokens.push_back(token);
output_ss << common_token_to_piece(ctx, token);
output_ss << token_str;
if (params.verbose_prompt) {
LOG_INF("%6d -> '%s'\n", token, token_str.c_str());
}
}
// reset assistant message
+3
View File
@@ -2211,6 +2211,9 @@ struct clip_model_loader {
{
hparams.rope_theta = 10000.0f;
hparams.warmup_image_size = hparams.patch_size * 8;
// Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM
// ref: https://github.com/ggml-org/llama.cpp/issues/14310
hparams.image_size = 1024;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
} break;
case PROJECTOR_TYPE_GEMMA3:
+39 -5
View File
@@ -107,13 +107,11 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
return false;
}
// usage:
// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable);
printf(" [--token-embedding-type] [--tensor-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@@ -124,6 +122,8 @@ static void usage(const char * executable) {
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n");
printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
printf(" Advanced option to remove all tensors from the given layers\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -286,6 +286,32 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
return true;
}
static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers) {
if (!data) {
printf("\n%s: no layer pruning ids provided\n\n", __func__);
return false;
}
const auto block_ids = string_split<std::string>(data, ',');
for (const auto & block_id : block_ids) {
int id;
try {
id = std::stoi(block_id);
} catch (...) {
id = -1;
}
if (id < 0) {
printf("\n%s: invalid layer id '%s'\n\n", __func__, block_id.c_str());
return false;
}
prune_layers.emplace_back(id);
}
sort(prune_layers.begin(), prune_layers.end());
prune_layers.erase(std::unique(prune_layers.begin(), prune_layers.end()), prune_layers.end());
return true;
}
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -298,6 +324,7 @@ int main(int argc, char ** argv) {
std::vector<std::string> included_weights, excluded_weights;
std::vector<llama_model_kv_override> kv_overrides;
std::vector<tensor_quantization> tensor_types;
std::vector<int> prune_layers;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -324,6 +351,10 @@ int main(int argc, char ** argv) {
if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]);
@@ -411,6 +442,9 @@ int main(int argc, char ** argv) {
if (!tensor_types.empty()) {
params.tensor_types = &tensor_types;
}
if (!prune_layers.empty()) {
params.prune_layers = &prune_layers;
}
llama_backend_init();
+25 -9
View File
@@ -9,6 +9,9 @@
#include <nlohmann/json.hpp>
#if defined(_WIN32)
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <io.h>
#else
@@ -939,17 +942,30 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
// Function to tokenize the prompt
static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
const bool is_first = llama_memory_seq_pos_max(llama_get_memory(llama_data.context.get()), 0) == 0;
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
prompt_tokens.resize(n_prompt_tokens);
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first,
true) < 0) {
printe("failed to tokenize the prompt\n");
const bool is_first = llama_memory_seq_pos_max(llama_get_memory(llama_data.context.get()), 0) == -1;
int n_tokens = prompt.size() + 2 * is_first;
prompt_tokens.resize(n_tokens);
n_tokens = llama_tokenize(vocab, prompt.c_str(), prompt.size(),
prompt_tokens.data(), prompt_tokens.size(),
is_first, /*parse_special =*/true);
if (n_tokens == std::numeric_limits<int32_t>::min()) {
printe("tokenization failed: input too large\n");
return -1;
}
return n_prompt_tokens;
if (n_tokens < 0) {
prompt_tokens.resize(-n_tokens);
int check = llama_tokenize(vocab, prompt.c_str(), prompt.size(),
prompt_tokens.data(), prompt_tokens.size(),
is_first, /*parse_special =*/true);
if (check != -n_tokens) {
printe("failed to tokenize the prompt (size mismatch)\n");
return -1;
}
n_tokens = check;
} else {
prompt_tokens.resize(n_tokens);
}
return n_tokens;
}
// Check if we have enough space in the context to evaluate this batch
+3 -1
View File
@@ -370,6 +370,8 @@ node index.js
### GET `/health`: Returns heath check result
This endpoint is public (no API key check).
**Response format**
- HTTP status code 503
@@ -708,7 +710,7 @@ If the tokens are missing, then the extra context is simply prefixed at the star
### **GET** `/props`: Get server global properties.
This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`
By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`
**Response format**
+3
View File
@@ -3418,9 +3418,12 @@ struct server_context {
}
if (ret < -1) {
// TODO: update slot state based on llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
err = "Compute error.";
}
// TODO: handle ret == 2 (abort) when we start aborting
if (!err.empty()) {
SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret);
for (auto & slot : slots) {