Compare commits

...

5 Commits

Author SHA1 Message Date
slaren 62fead3ea0 cuda : fix tensor size calculation for non-split buffer (#5145) 2024-01-26 18:59:43 +01:00
slaren 15b4538ff2 ggml-alloc : add 10% margin to the buffer sizes (#5149) 2024-01-26 19:18:26 +02:00
snadampal 7032f4f634 ggml : update softmax n_task calculation (#5126)
updated the n_task calculation to use max number of
threads possible. This has improved the prompt eval
performance by around 5% for DOT kernels and by
around 10% for MMLA kernels on AWS Graviton3.
2024-01-26 19:17:59 +02:00
Georgi Gerganov 5f1925a8ce scripts : move run-with-preset.py from root to scripts folder 2024-01-26 17:09:44 +02:00
Georgi Gerganov 3b7c914de2 tests : gitignore test-c.o 2024-01-26 14:48:15 +02:00
6 changed files with 14 additions and 18 deletions
+3 -1
View File
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
}
size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
return alloc->max_size;
// FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
// to avoid this, we add a 10% margin to the buffer size
return alloc->max_size + alloc->max_size/10;
}
// graph allocator
+3 -1
View File
@@ -30,7 +30,9 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) {
return buft->iface.get_alloc_size(buft, tensor);
size_t size = buft->iface.get_alloc_size(buft, tensor);
assert(size >= ggml_nbytes(tensor));
return size;
}
return ggml_nbytes(tensor);
}
+5 -14
View File
@@ -9790,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
// TODO: mmq/mmv support
#endif
const int64_t nb11 = src1->nb[1];
const int64_t nb1 = dst->nb[1];
const size_t nb11 = src1->nb[1];
const size_t nb1 = dst->nb[1];
const struct ggml_tensor * ids = src0;
const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -10304,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
if (ggml_is_quantized(tensor->type)) {
// initialize padding to 0 to avoid possible NaN values
int64_t row_low = 0;
int64_t row_high = ggml_nrows(tensor);
int64_t nrows_split = row_high - row_low;
size_t original_size = ggml_nbytes_split(tensor, nrows_split);
size_t original_size = ggml_nbytes(tensor);
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
if (padded_size > original_size && tensor->view_src == nullptr) {
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
}
}
}
@@ -10415,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
}
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
int64_t row_low = 0;
int64_t row_high = ggml_nrows(tensor);
int64_t nrows_split = row_high - row_low;
size_t size = ggml_nbytes_split(tensor, nrows_split);
size_t size = ggml_nbytes(tensor);
int64_t ne0 = tensor->ne[0];
if (ggml_is_quantized(tensor->type)) {
+1 -1
View File
@@ -16597,7 +16597,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_SOFT_MAX:
{
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
@@ -46,7 +46,7 @@ Formatting considerations:
- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
- To define a tensor split, pass a list of floats.
"""
usage = "run_with_preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
usage = "run-with-preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
epilog = (" --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
"Unknown args will be ignored.")
+1
View File
@@ -1,2 +1,3 @@
*
!*.*
test-c.o