Compare commits

...

12 Commits

Author SHA1 Message Date
slaren 20e12112fd llama : suggest reduce ctx size when kv init fails 2024-11-02 00:55:19 +01:00
slaren bf60f27cda ggml : do not abort when ggml_aligned_malloc fails 2024-11-02 00:54:16 +01:00
Diego Devesa a6744e43e8 llama : add simple-chat example (#10124)
* llama : add simple-chat example

---------

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
2024-11-01 23:50:59 +01:00
Diego Devesa e991e3127f llama : use smart pointers for ggml resources (#10117) 2024-11-01 23:48:26 +01:00
Shupei Fan 418f5eef26 vulkan : improve ggml_vk_create_buffer error handling (#9898) 2024-11-01 19:33:14 +01:00
Georgi Gerganov ba6f62eb79 readme : update hot topics 2024-11-01 17:31:51 +02:00
sasha0552 d865d1478c server : fix smart selection of available slot (#10120)
* Fix smart selection of available slot

* minor fix

* replace vectors of tokens with shorthands
2024-11-01 14:33:14 +01:00
Georgi Gerganov 1804adb0cf ggml : remove ggml_scratch (#10121)
ggml-ci
2024-11-01 12:58:45 +02:00
Georgi Gerganov 815fe72adc sync : ggml 2024-11-01 10:28:24 +02:00
Georgi Gerganov f221d56220 ggml : alloc ggml_contexts on the heap (whisper/2525) 2024-11-01 10:24:50 +02:00
Zhenwei Jin e597e50794 build: fix build error in Windows env with OneAPI setup (#10107) 2024-11-01 11:09:59 +08:00
Diego Devesa 85679d37f3 llama : improve output buffer type selection (#10098) 2024-11-01 00:49:53 +01:00
17 changed files with 531 additions and 420 deletions
+6
View File
@@ -34,6 +34,7 @@ BUILD_TARGETS = \
llama-save-load-state \
llama-server \
llama-simple \
llama-simple-chat \
llama-speculative \
llama-tokenize \
llama-vdot \
@@ -1287,6 +1288,11 @@ llama-simple: examples/simple/simple.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-simple-chat: examples/simple-chat/simple-chat.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-tokenize: examples/tokenize/tokenize.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+2 -1
View File
@@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
----
+1
View File
@@ -49,6 +49,7 @@ else()
endif()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(tokenize)
endif()
+12 -23
View File
@@ -725,12 +725,12 @@ struct server_context {
return nullptr;
}
server_slot * get_available_slot(const std::string & prompt) {
server_slot * get_available_slot(const server_task & task) {
server_slot * ret = nullptr;
// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
int max_lcp_len = 0;
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
int max_lcs_len = 0;
float similarity = 0;
for (server_slot & slot : slots) {
@@ -740,25 +740,25 @@ struct server_context {
}
// skip the slot if it does not contains cached tokens
if (slot.prompt_tokens.empty()) {
if (slot.cache_tokens.empty()) {
continue;
}
// length of the Longest Common Prefix between the current slot's prompt and the input prompt
int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
// fraction of the common substring length compared to the current slot's prompt length
similarity = static_cast<float>(lcp_len) / static_cast<int>(slot.prompt_tokens.size());
// fraction of the common subsequence length compared to the current slot's prompt length
similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
// select the current slot if the criteria match
if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
max_lcp_len = lcp_len;
if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
max_lcs_len = lcs_len;
ret = &slot;
}
}
if (ret != nullptr) {
SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
}
}
@@ -1514,18 +1514,7 @@ struct server_context {
{
const int id_slot = json_value(task.data, "id_slot", -1);
server_slot * slot;
if (id_slot != -1) {
slot = get_slot_by_id(id_slot);
} else {
std::string prompt;
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
prompt = json_value(task.data, "prompt", std::string());
}
slot = get_available_slot(prompt);
}
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
+47 -5
View File
@@ -439,18 +439,60 @@ static std::string gen_chatcmplid() {
// other common utils
//
static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
return i;
}
static size_t longest_common_prefix(const std::string & a, const std::string & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
// check for empty sequences
if (a.empty() || b.empty()) {
return 0;
}
return i;
// get the lengths of the input sequences
int a_len = a.size();
int b_len = b.size();
// initialize the maximum length of the longest common subsequence (LCS)
int max_length = 0;
// use two rows instead of a 2D matrix to optimize space
std::vector<int> prev_row(b_len + 1, 0);
std::vector<int> curr_row(b_len + 1, 0);
// iterate through the elements of a
for (int i = 1; i <= a_len; i++) {
// iterate through the elements of b
for (int j = 1; j <= b_len; j++) {
// if elements at the current positions match
if (a[i - 1] == b[j - 1]) {
// if it's the first element of either sequences, set LCS length to 1
if (i == 1 || j == 1) {
curr_row[j] = 1;
} else {
// increment LCS length by 1 compared to the previous element
curr_row[j] = prev_row[j - 1] + 1;
}
// update max_length if necessary
if (curr_row[j] > max_length) {
max_length = curr_row[j];
}
} else {
// reset LCS length if elements don't match
curr_row[j] = 0;
}
}
// update the previous row for the next iteration
prev_row = curr_row;
}
// return the maximum length of the LCS
return max_length;
}
static bool ends_with(const std::string & str, const std::string & suffix) {
+5
View File
@@ -0,0 +1,5 @@
set(TARGET llama-simple-chat)
add_executable(${TARGET} simple-chat.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
+7
View File
@@ -0,0 +1,7 @@
# llama.cpp/example/simple-chat
The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file.
```bash
./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048
...
+197
View File
@@ -0,0 +1,197 @@
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
printf("\n");
}
int main(int argc, char ** argv) {
std::string model_path;
int ngl = 99;
int n_ctx = 2048;
// parse command line arguments
for (int i = 1; i < argc; i++) {
try {
if (strcmp(argv[i], "-m") == 0) {
if (i + 1 < argc) {
model_path = argv[++i];
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-c") == 0) {
if (i + 1 < argc) {
n_ctx = std::stoi(argv[++i]);
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-ngl") == 0) {
if (i + 1 < argc) {
ngl = std::stoi(argv[++i]);
} else {
print_usage(argc, argv);
return 1;
}
} else {
print_usage(argc, argv);
return 1;
}
} catch (std::exception & e) {
fprintf(stderr, "error: %s\n", e.what());
print_usage(argc, argv);
return 1;
}
}
if (model_path.empty()) {
print_usage(argc, argv);
return 1;
}
// only print errors
llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
if (level >= GGML_LOG_LEVEL_ERROR) {
fprintf(stderr, "%s", text);
}
}, nullptr);
// initialize the model
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;
llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
if (!model) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
// initialize the context
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = n_ctx;
ctx_params.n_batch = n_ctx;
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (!ctx) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
// initialize the sampler
llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
// helper function to evaluate a prompt and generate a response
auto generate = [&](const std::string & prompt) {
std::string response;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n");
}
// prepare a batch for the prompt
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
llama_token new_token_id;
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
exit(0);
}
if (llama_decode(ctx, batch)) {
GGML_ABORT("failed to decode\n");
}
// sample the next token
new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id)) {
break;
}
// convert the token to a string, print it and add it to the response
char buf[256];
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
if (n < 0) {
GGML_ABORT("failed to convert token to piece\n");
}
std::string piece(buf, n);
printf("%s", piece.c_str());
fflush(stdout);
response += piece;
// prepare the next batch with the sampled token
batch = llama_batch_get_one(&new_token_id, 1);
}
return response;
};
std::vector<llama_chat_message> messages;
std::vector<char> formatted(llama_n_ctx(ctx));
int prev_len = 0;
while (true) {
// get user input
printf("\033[32m> \033[0m");
std::string user;
std::getline(std::cin, user);
if (user.empty()) {
break;
}
// add the user input to the message list and format it
messages.push_back({"user", strdup(user.c_str())});
int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
if (new_len > (int)formatted.size()) {
formatted.resize(new_len);
new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
}
if (new_len < 0) {
fprintf(stderr, "failed to apply the chat template\n");
return 1;
}
// remove previous messages to obtain the prompt to generate the response
std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
// generate a response
printf("\033[33m");
std::string response = generate(prompt);
printf("\n\033[0m");
// add the response to the messages
messages.push_back({"assistant", strdup(response.c_str())});
prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
if (prev_len < 0) {
fprintf(stderr, "failed to apply the chat template\n");
return 1;
}
}
// free resources
for (auto & msg : messages) {
free(const_cast<char *>(msg.content));
}
llama_sampler_free(smpl);
llama_free(ctx);
llama_free_model(model);
return 0;
}
+38
View File
@@ -0,0 +1,38 @@
#pragma once
#ifndef __cplusplus
#error "This header is for C++ only"
#endif
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <memory>
// Smart pointers for ggml types
// ggml
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
// ggml-alloc
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
+7 -15
View File
@@ -217,7 +217,6 @@
#define GGML_MAX_DIMS 4
#define GGML_MAX_PARAMS 2048
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 10
#define GGML_MAX_N_THREADS 512
#define GGML_MAX_OP_PARAMS 64
@@ -559,10 +558,10 @@ extern "C" {
enum ggml_log_level {
GGML_LOG_LEVEL_NONE = 0,
GGML_LOG_LEVEL_INFO = 1,
GGML_LOG_LEVEL_WARN = 2,
GGML_LOG_LEVEL_ERROR = 3,
GGML_LOG_LEVEL_DEBUG = 4,
GGML_LOG_LEVEL_DEBUG = 1,
GGML_LOG_LEVEL_INFO = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_ERROR = 4,
GGML_LOG_LEVEL_CONT = 5, // continue previous log
};
@@ -656,13 +655,6 @@ extern "C" {
void * abort_callback_data;
};
// scratch buffer
struct ggml_scratch {
size_t offs;
size_t size;
void * data;
};
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
@@ -760,12 +752,12 @@ extern "C" {
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
GGML_API void ggml_reset(struct ggml_context * ctx);
GGML_API void ggml_free (struct ggml_context * ctx);
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+2 -1
View File
@@ -1368,6 +1368,7 @@ add_library(ggml
../include/ggml.h
../include/ggml-alloc.h
../include/ggml-backend.h
../include/ggml-cpp.h
ggml.c
ggml-alloc.c
ggml-backend.cpp
@@ -1402,7 +1403,7 @@ list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
find_library(MATH_LIBRARY m)
if (MATH_LIBRARY)
if (NOT WIN32 OR NOT GGML_SYCL)
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
endif()
endif()
+1 -1
View File
@@ -798,7 +798,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
void * data = ggml_aligned_malloc(size);
if (data == NULL) {
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
GGML_LOG_ERROR("%s: failed to allocate buffer of size %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
return NULL;
}
+1 -4
View File
@@ -1047,7 +1047,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
return buf;
}
buf->size = size;
vk::BufferCreateInfo buffer_create_info{
vk::BufferCreateFlags(),
size,
@@ -1075,7 +1074,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
if (memory_type_index == UINT32_MAX) {
device->device.destroyBuffer(buf->buffer);
buf->size = 0;
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
}
@@ -1092,13 +1090,11 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
}
catch (const vk::SystemError& e) {
device->device.destroyBuffer(buf->buffer);
buf->size = 0;
throw e;
}
} else {
// Out of Host/Device memory, clean up buffer
device->device.destroyBuffer(buf->buffer);
buf->size = 0;
throw e;
}
}
@@ -1111,6 +1107,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
buf->device = device;
buf->size = size;
#ifdef GGML_VULKAN_MEMORY_DEBUG
device->memory_logger->log_allocation(buf, size);
+19 -112
View File
@@ -306,6 +306,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
}
#define GGML_DEBUG 0
#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16
@@ -432,7 +433,6 @@ void * ggml_aligned_malloc(size_t size) {
break;
}
GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
GGML_ABORT("fatal error");
return NULL;
}
return aligned_memory;
@@ -2014,18 +2014,14 @@ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
struct ggml_context {
size_t mem_size;
void* mem_buffer;
void * mem_buffer;
bool mem_buffer_owned;
bool no_alloc;
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
int n_objects;
struct ggml_object * objects_begin;
struct ggml_object * objects_end;
struct ggml_scratch scratch;
struct ggml_scratch scratch_save;
};
struct ggml_context_container {
@@ -3263,7 +3259,6 @@ struct ggml_numa_nodes {
//
struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
};
@@ -3845,7 +3840,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
g_state = (struct ggml_state) {
/*.contexts =*/ { { 0 } },
/*.numa =*/ {
.n_nodes = 0,
.total_cpus = 0,
@@ -3864,26 +3858,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
is_first_call = false;
}
// find non-used context in g_state
struct ggml_context * ctx = NULL;
ggml_critical_section_end();
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (!g_state.contexts[i].used) {
g_state.contexts[i].used = true;
ctx = &g_state.contexts[i].context;
GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
break;
}
}
if (ctx == NULL) {
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
ggml_critical_section_end();
return NULL;
}
struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));
// allow to call ggml_init with 0 size
if (params.mem_size == 0) {
@@ -3897,12 +3874,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.no_alloc =*/ params.no_alloc,
/*.no_alloc_save =*/ params.no_alloc,
/*.n_objects =*/ 0,
/*.objects_begin =*/ NULL,
/*.objects_end =*/ NULL,
/*.scratch =*/ { 0, 0, NULL, },
/*.scratch_save =*/ { 0, 0, NULL, },
};
GGML_ASSERT(ctx->mem_buffer != NULL);
@@ -3911,56 +3885,35 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
ggml_critical_section_end();
return ctx;
}
void ggml_reset(struct ggml_context * ctx) {
if (ctx == NULL) {
return;
}
ctx->n_objects = 0;
ctx->objects_begin = NULL;
ctx->objects_end = NULL;
}
void ggml_free(struct ggml_context * ctx) {
if (ctx == NULL) {
return;
}
// make this function thread safe
ggml_critical_section_start();
bool found = false;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
__func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned) {
ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
}
found = true;
break;
}
if (ctx->mem_buffer_owned) {
ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
}
if (!found) {
GGML_PRINT_DEBUG("%s: context not found\n", __func__);
}
ggml_critical_section_end();
GGML_FREE(ctx);
}
size_t ggml_used_mem(const struct ggml_context * ctx) {
return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
}
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
ctx->scratch = scratch;
return result;
}
bool ggml_get_no_alloc(struct ggml_context * ctx) {
return ctx->no_alloc;
}
@@ -3988,27 +3941,6 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
return max_size;
}
// IMPORTANT:
// when creating "opt" tensors, always save and load the scratch buffer
// this is an error prone process, but it is necessary to support inplace
// operators when using scratch buffers
// TODO: implement a better way
static void ggml_scratch_save(struct ggml_context * ctx) {
// this is needed to allow opt tensors to store their data
// TODO: again, need to find a better way
ctx->no_alloc_save = ctx->no_alloc;
ctx->no_alloc = false;
ctx->scratch_save = ctx->scratch;
ctx->scratch.data = NULL;
}
static void ggml_scratch_load(struct ggml_context * ctx) {
ctx->no_alloc = ctx->no_alloc_save;
ctx->scratch = ctx->scratch_save;
}
////////////////////////////////////////////////////////////////////////////////
static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
@@ -4089,29 +4021,13 @@ static struct ggml_tensor * ggml_new_tensor_impl(
size_t obj_alloc_size = 0;
if (view_src == NULL && !ctx->no_alloc) {
if (ctx->scratch.data != NULL) {
// allocate tensor data in the scratch buffer
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
__func__, ctx->scratch.offs + data_size, ctx->scratch.size);
assert(false);
return NULL;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
ctx->scratch.offs += data_size;
} else {
// allocate tensor data in the context's memory pool
obj_alloc_size = data_size;
}
// allocate tensor data in the context's memory pool
obj_alloc_size = data_size;
}
struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
GGML_ASSERT(obj_new);
// TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
#ifdef __clang__
@@ -4207,24 +4123,16 @@ struct ggml_tensor * ggml_new_tensor_4d(
}
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
ggml_scratch_save(ctx);
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
ggml_scratch_load(ctx);
ggml_set_i32(result, value);
return result;
}
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
ggml_scratch_save(ctx);
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_scratch_load(ctx);
ggml_set_f32(result, value);
return result;
@@ -20292,7 +20200,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
uint64_t size_eval = 0;
// compute size of intermediate results
// TODO: does not take into account scratch buffers !!!!
for (int i = 0; i < cgraph->n_nodes; ++i) {
size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
}
+1 -1
View File
@@ -1 +1 @@
162e232411ee98ceb0cccfa84886118d917d2123
bb78a40dc60e04c626bac2b65840b509988e990d
+1
View File
@@ -0,0 +1 @@
../ggml/include/ggml-cpp.h
+184 -257
View File
File diff suppressed because it is too large Load Diff