mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
354ebac8cb
* server: real-time reasoning interruption via control endpoint Builds on the manual reasoning budget trigger from #23949. Adds a CONTROL task that mirrors the CANCEL path on the live slot and calls common_sampler_reasoning_budget_force to end thinking mid-generation. POST /v1/chat/completions/control with { id_slot, action }, opt-in reasoning_control arms the budget sampler on demand. Router and single model. Minimal WebUI button as a skeleton for further UI work. * ui: track reasoning phase via explicit streaming state Add isReasoning to the chat store, mirroring the isLoading pattern: per conversation map, private setter, public accessor and reactive export. Set from the stream callbacks, true on reasoning chunks, false on the first content chunk, reset on stream end and resynced on conversation switch. The skip button now keys off isReasoning so it shows only during the thinking phase, not the whole generation. * ui: extract control endpoint and action into constants Move the chat completion routes, the slots route and the reasoning control action out of chat.service into api-endpoints and a dedicated control-actions module. No behavior change, drops the magic strings so the control protocol has a single source of truth. * server: target reasoning control by completion id Address @ngxson review on the control endpoint. Switch from id_slot to the chat completion id to avoid a TOCTOU: the slot can be reassigned between the lookup and the control request, so matching the live completion (oaicompat_cmpl_id) is safe and a finished one simply matches nothing. Rename the action to reasoning_end, guard it on the reasoning_control flag of the target slot, and reduce the response to {success} with an optional message. * ui: target reasoning control by completion id Keep the streamed completion id on the message and post it back to the control endpoint instead of probing /slots. Drops the slot discovery and the TOCTOU that came with it. 
Action renamed to reasoning_end, response read as {success}. * server: address review from @ngxson Move the control fields into task_params and drop the redundant comments on the control path. * server: document the reasoning control endpoint * Update tools/ui/src/lib/types/database.d.ts Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com> * ui: rename cmplId to completionId Per @allozaur review, clearer name for the streamed completion id. * ui: wire completion id capture through the agentic flow The webui streams through the agentic flow, which relayed onModel but not onCompletionId, so the completion id never reached the message and the control request was never sent. Relay it through the flow and its callbacks type, declare id on the chunk type, and log an explicit error when the button fires without a usable id. * ui: target reasoning control model from the message The model is a property of the completion, so read it from the streaming message like the id, not from the model dropdown which is unrelated UI state. Makes the request self-consistent by construction instead of just unlikely to drift. --------- Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
652 lines
18 KiB
C++
652 lines
18 KiB
C++
#pragma once
|
|
|
|
#include "common.h"
|
|
#include "llama.h"
|
|
|
|
#include <string>
|
|
#include <unordered_set>
|
|
#include <list>
|
|
#include <map>
|
|
|
|
// TODO: prevent including the whole server-common.h as we only use server_tokens
|
|
#include "server-common.h"
|
|
|
|
using json = nlohmann::ordered_json;
|
|
|
|
// Kind of work carried by a server_task.
// The enumerators rely on implicit numbering (starting at 0), so their order is significant.
enum server_task_type {
    SERVER_TASK_TYPE_COMPLETION,
    SERVER_TASK_TYPE_EMBEDDING,
    SERVER_TASK_TYPE_RERANK,
    SERVER_TASK_TYPE_INFILL,
    SERVER_TASK_TYPE_CANCEL,
    SERVER_TASK_TYPE_CONTROL, // realtime control, see task_params::control_action / control_cmpl_id
    SERVER_TASK_TYPE_NEXT_RESPONSE,
    SERVER_TASK_TYPE_METRICS,
    SERVER_TASK_TYPE_SLOT_SAVE,
    SERVER_TASK_TYPE_SLOT_RESTORE,
    SERVER_TASK_TYPE_SLOT_ERASE,
    SERVER_TASK_TYPE_GET_LORA,
    SERVER_TASK_TYPE_SET_LORA,
};
|
|
|
|
// Selects the wire format used when a task result is serialized.
// TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common
enum task_response_type {
    TASK_RESPONSE_TYPE_NONE, // llama.cpp native format
    TASK_RESPONSE_TYPE_OAI_CHAT,
    TASK_RESPONSE_TYPE_OAI_CMPL,
    TASK_RESPONSE_TYPE_OAI_RESP,
    TASK_RESPONSE_TYPE_OAI_ASR, // transcriptions API
    TASK_RESPONSE_TYPE_OAI_EMBD,
    TASK_RESPONSE_TYPE_ANTHROPIC,
};
|
|
|
|
// Reason a generation stopped; STOP_TYPE_NONE is the default used by server_task_result_cmpl_final.
// NOTE(review): the meanings below are inferred from the enumerator names - confirm against the implementation.
enum stop_type {
    STOP_TYPE_NONE,
    STOP_TYPE_EOS,   // presumably: end-of-sequence reached
    STOP_TYPE_WORD,  // presumably: a stop word matched (see stopping_word in the final result)
    STOP_TYPE_LIMIT, // presumably: a generation limit was hit
};
|
|
|
|
struct task_params {
|
|
bool stream = false;
|
|
bool include_usage = false;
|
|
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
|
bool return_tokens = false;
|
|
bool return_progress = false;
|
|
|
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
|
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
|
int32_t n_predict = -1; // new tokens to predict
|
|
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
|
|
int32_t n_cmpl = 1; // number of completions to generate from this prompt
|
|
|
|
int32_t n_cache_reuse = 0; // min chunk size to attempt reusing from the cache via KV shifting (0 = disabled)
|
|
|
|
// number of prompt tokens before the latest user message
|
|
int32_t n_before_user = -1;
|
|
|
|
int64_t t_max_prompt_ms = -1; // TODO: implement
|
|
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
|
|
|
std::map<int, float> lora; // mapping adapter ID -> scale
|
|
|
|
std::vector<std::string> antiprompt;
|
|
std::vector<std::string> response_fields;
|
|
|
|
bool timings_per_token = false;
|
|
bool post_sampling_probs = false;
|
|
|
|
struct common_params_sampling sampling;
|
|
struct common_params_speculative speculative;
|
|
|
|
// response formatting
|
|
bool verbose = false;
|
|
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
|
std::string oaicompat_model;
|
|
std::string oaicompat_cmpl_id;
|
|
|
|
// realtime control (SERVER_TASK_TYPE_CONTROL)
|
|
std::string control_action;
|
|
std::string control_cmpl_id;
|
|
|
|
// per-request parameters for chat parsing
|
|
common_chat_parser_params chat_parser_params;
|
|
|
|
// Embeddings
|
|
int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
|
|
|
|
json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const;
|
|
json to_json(bool only_metrics = false) const;
|
|
};
|
|
|
|
// struct for tracking the state of a task (e.g., for streaming)
|
|
struct task_result_state {
|
|
// tracking diffs for partial tool calls
|
|
std::vector<common_chat_msg_diff> diffs;
|
|
common_chat_parser_params chat_parser_params;
|
|
common_chat_msg chat_msg;
|
|
std::string generated_text; // append new chunks of generated text here
|
|
std::vector<std::string> generated_tool_call_ids;
|
|
std::unordered_set<size_t> sent_tool_call_names;
|
|
|
|
// for OpenAI Responses and Anthropic streaming API:
|
|
// track output item / content block state across chunks
|
|
bool thinking_block_started = false;
|
|
bool text_block_started = false;
|
|
|
|
// for OpenAI Responses streaming API
|
|
const std::string oai_resp_id;
|
|
const std::string oai_resp_reasoning_id;
|
|
const std::string oai_resp_message_id;
|
|
std::string oai_resp_fc_id; // function call ID for current args delta
|
|
|
|
task_result_state(const common_chat_parser_params & chat_parser_params);
|
|
|
|
// parse partial tool calls and update the internal state
|
|
common_chat_msg update_chat_msg(
|
|
const std::string & text_added,
|
|
bool is_partial,
|
|
std::vector<common_chat_msg_diff> & diffs,
|
|
bool filter_tool_calls = false);
|
|
};
|
|
|
|
struct server_task {
|
|
int id = -1; // to be filled by server_queue
|
|
|
|
// TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
|
|
size_t index = 0; // used when there are multiple prompts (batch request)
|
|
|
|
// used by SERVER_TASK_TYPE_CANCEL
|
|
int id_target = -1;
|
|
int id_slot = -1;
|
|
|
|
// used by parallel sampling (multiple completions from same prompt)
|
|
int id_parent = -1;
|
|
// temporary store of child tasks for scheduling
|
|
// note: accessing to elements is invalid after the task is moved to server_slot
|
|
std::vector<server_task> child_tasks;
|
|
|
|
// used by SERVER_TASK_TYPE_INFERENCE
|
|
task_params params;
|
|
server_tokens tokens;
|
|
|
|
// only used by CLI, this allow tokenizing CLI inputs on server side
|
|
// we need this because mtmd_context and vocab are not accessible outside of server_context
|
|
bool cli = false;
|
|
std::string cli_prompt;
|
|
std::vector<raw_buffer> cli_files;
|
|
|
|
server_task_type type;
|
|
|
|
// used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
|
|
struct slot_action {
|
|
int id_slot;
|
|
std::string filename;
|
|
std::string filepath;
|
|
};
|
|
slot_action slot_action;
|
|
|
|
// used by SERVER_TASK_TYPE_METRICS
|
|
bool metrics_reset_bucket = false;
|
|
|
|
// used by SERVER_TASK_TYPE_SET_LORA
|
|
std::map<int, float> set_lora; // mapping adapter ID -> scale
|
|
|
|
server_task() = default;
|
|
|
|
server_task(server_task_type type) : type(type) {}
|
|
|
|
int32_t n_tokens() const {
|
|
return tokens.size();
|
|
}
|
|
|
|
bool need_embd() const {
|
|
switch (type) {
|
|
case SERVER_TASK_TYPE_EMBEDDING:
|
|
case SERVER_TASK_TYPE_RERANK:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool need_logits() const {
|
|
switch (type) {
|
|
case SERVER_TASK_TYPE_COMPLETION:
|
|
case SERVER_TASK_TYPE_INFILL:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool need_sampling() const {
|
|
switch (type) {
|
|
case SERVER_TASK_TYPE_COMPLETION:
|
|
case SERVER_TASK_TYPE_INFILL:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static task_params params_from_json_cmpl(
|
|
const llama_vocab * vocab,
|
|
const common_params & params_base,
|
|
const int n_ctx_slot,
|
|
const std::vector<llama_logit_bias> & logit_bias_eog,
|
|
const json & data);
|
|
|
|
// utility function
|
|
static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
|
|
std::unordered_set<int> ids(tasks.size());
|
|
for (size_t i = 0; i < tasks.size(); i++) {
|
|
ids.insert(tasks[i].id);
|
|
for (auto & child : tasks[i].child_tasks) {
|
|
ids.insert(child.id);
|
|
}
|
|
}
|
|
return ids;
|
|
}
|
|
|
|
void add_child(int id_parent, int id_child) {
|
|
server_task copy;
|
|
|
|
copy.id = id_child;
|
|
copy.id_parent = id_parent;
|
|
copy.params = params;
|
|
copy.type = type;
|
|
copy.tokens = tokens.clone();
|
|
copy.id_slot = -1; // child tasks cannot specify slot
|
|
|
|
// use different sampling seed for each child
|
|
// note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
|
|
if (copy.params.sampling.seed != LLAMA_DEFAULT_SEED) {
|
|
copy.params.sampling.seed += (uint32_t)child_tasks.size() + 1;
|
|
}
|
|
|
|
child_tasks.push_back(std::move(copy));
|
|
}
|
|
|
|
// the task will be moved into queue, then onto slots
|
|
// however, the state must be kept by caller (e.g., HTTP thread)
|
|
task_result_state create_state() const {
|
|
return task_result_state(params.chat_parser_params);
|
|
}
|
|
|
|
bool is_parent() const {
|
|
return child_tasks.size() > 0;
|
|
}
|
|
|
|
bool is_child() const {
|
|
return id_parent != -1;
|
|
}
|
|
};
|
|
|
|
struct result_timings {
|
|
int32_t cache_n = -1;
|
|
|
|
int32_t prompt_n = -1;
|
|
double prompt_ms = 0.0;
|
|
double prompt_per_token_ms = 0.0;
|
|
double prompt_per_second = 0.0;
|
|
|
|
int32_t predicted_n = -1;
|
|
double predicted_ms = 0.0;
|
|
double predicted_per_token_ms = 0.0;
|
|
double predicted_per_second = 0.0;
|
|
|
|
// Optional speculative metrics - only included when > 0
|
|
int32_t draft_n = 0;
|
|
int32_t draft_n_accepted = 0;
|
|
|
|
json to_json() const;
|
|
};
|
|
|
|
struct result_prompt_progress {
|
|
int32_t total = 0;
|
|
int32_t cache = 0;
|
|
int32_t processed = 0;
|
|
int64_t time_ms = 0;
|
|
|
|
json to_json() const;
|
|
};
|
|
|
|
struct server_task_result {
|
|
int id = -1;
|
|
int id_slot = -1;
|
|
|
|
// TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
|
|
size_t index = 0; // to be used for batched tasks
|
|
|
|
virtual bool is_error() {
|
|
// only used by server_task_result_error
|
|
return false;
|
|
}
|
|
virtual bool is_stop() {
|
|
// only used by server_task_result_cmpl_*
|
|
return true;
|
|
}
|
|
virtual void update(task_result_state &) {
|
|
// only used by server_task_result_cmpl_*
|
|
}
|
|
virtual json to_json() = 0;
|
|
virtual ~server_task_result() = default;
|
|
};
|
|
|
|
// using shared_ptr for polymorphism of server_task_result
|
|
using server_task_result_ptr = std::unique_ptr<server_task_result>;
|
|
|
|
struct completion_token_output {
|
|
llama_token tok;
|
|
float prob;
|
|
std::string text_to_send;
|
|
struct prob_info {
|
|
llama_token tok;
|
|
std::string txt;
|
|
float prob;
|
|
};
|
|
std::vector<prob_info> probs;
|
|
|
|
json to_json(bool post_sampling_probs) const;
|
|
|
|
static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs);
|
|
|
|
static float logarithm(float x);
|
|
|
|
static std::vector<unsigned char> str_to_bytes(const std::string & str);
|
|
|
|
};
|
|
|
|
struct server_task_result_cmpl_final : server_task_result {
|
|
std::string content;
|
|
llama_tokens tokens;
|
|
|
|
bool stream;
|
|
bool include_usage;
|
|
result_timings timings;
|
|
std::string prompt;
|
|
|
|
bool truncated;
|
|
int32_t n_decoded;
|
|
int32_t n_prompt_tokens;
|
|
int32_t n_prompt_tokens_cache;
|
|
int32_t n_tokens_cached;
|
|
bool has_new_line;
|
|
std::string stopping_word;
|
|
stop_type stop = STOP_TYPE_NONE;
|
|
|
|
bool post_sampling_probs;
|
|
std::vector<completion_token_output> probs_output;
|
|
std::vector<std::string> response_fields;
|
|
|
|
task_params generation_params;
|
|
|
|
// response formatting
|
|
bool verbose = false;
|
|
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
|
std::string oaicompat_model;
|
|
std::string oaicompat_cmpl_id;
|
|
common_chat_msg oaicompat_msg; // to be populated by update()
|
|
|
|
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
|
|
bool is_updated = false;
|
|
|
|
// for OpenAI Responses API
|
|
std::string oai_resp_id;
|
|
std::string oai_resp_reasoning_id;
|
|
std::string oai_resp_message_id;
|
|
|
|
virtual bool is_stop() override {
|
|
return true; // in stream mode, final responses are considered stop
|
|
}
|
|
|
|
virtual json to_json() override;
|
|
|
|
virtual void update(task_result_state & state) override {
|
|
is_updated = true;
|
|
oaicompat_msg = state.update_chat_msg(content, false, oaicompat_msg_diffs);
|
|
|
|
oai_resp_id = state.oai_resp_id;
|
|
oai_resp_reasoning_id = state.oai_resp_reasoning_id;
|
|
oai_resp_message_id = state.oai_resp_message_id;
|
|
}
|
|
|
|
json to_json_non_oaicompat();
|
|
|
|
json usage_json_oaicompat();
|
|
|
|
json to_json_oaicompat();
|
|
|
|
json to_json_oaicompat_chat();
|
|
|
|
json to_json_oaicompat_chat_stream();
|
|
|
|
json to_json_oaicompat_resp();
|
|
|
|
json to_json_oaicompat_resp_stream();
|
|
|
|
json to_json_oaicompat_asr();
|
|
|
|
json to_json_anthropic();
|
|
|
|
json to_json_anthropic_stream();
|
|
};
|
|
|
|
struct server_task_result_cmpl_partial : server_task_result {
|
|
std::string content;
|
|
llama_tokens tokens;
|
|
|
|
int32_t n_decoded;
|
|
int32_t n_prompt_tokens;
|
|
int32_t n_prompt_tokens_cache;
|
|
|
|
bool post_sampling_probs;
|
|
bool is_progress = false;
|
|
bool is_begin = false; // whether to send 200 status to HTTP client (begin of SSE stream)
|
|
// ref: https://github.com/ggml-org/llama.cpp/pull/23884
|
|
completion_token_output prob_output;
|
|
result_timings timings;
|
|
result_prompt_progress progress;
|
|
|
|
// response formatting
|
|
bool verbose = false;
|
|
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
|
std::string oaicompat_model;
|
|
std::string oaicompat_cmpl_id;
|
|
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
|
|
bool is_updated = false;
|
|
|
|
// Streaming state copied from task_result_state for this chunk
|
|
bool thinking_block_started = false;
|
|
bool text_block_started = false;
|
|
|
|
// for OpenAI Responses API
|
|
std::string oai_resp_id;
|
|
std::string oai_resp_reasoning_id;
|
|
std::string oai_resp_message_id;
|
|
std::string oai_resp_fc_id;
|
|
|
|
// for Anthropic API: track if any reasoning content has been generated
|
|
bool anthropic_has_reasoning = false;
|
|
|
|
virtual bool is_stop() override {
|
|
return false; // in stream mode, partial responses are not considered stop
|
|
}
|
|
|
|
virtual void update(task_result_state & state) override;
|
|
|
|
virtual json to_json() override;
|
|
|
|
json to_json_non_oaicompat();
|
|
|
|
json to_json_oaicompat();
|
|
|
|
json to_json_oaicompat_chat();
|
|
|
|
json to_json_oaicompat_resp();
|
|
|
|
json to_json_oaicompat_asr();
|
|
|
|
json to_json_anthropic();
|
|
};
|
|
|
|
struct server_task_result_embd : server_task_result {
|
|
std::vector<std::vector<float>> embedding;
|
|
|
|
int32_t n_tokens;
|
|
|
|
// response formatting
|
|
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
|
|
|
|
virtual json to_json() override;
|
|
|
|
json to_json_non_oaicompat();
|
|
|
|
json to_json_oaicompat();
|
|
};
|
|
|
|
struct server_task_result_rerank : server_task_result {
|
|
float score = -1e6;
|
|
|
|
int32_t n_tokens;
|
|
|
|
virtual json to_json() override;
|
|
};
|
|
|
|
struct server_task_result_error : server_task_result {
|
|
error_type err_type = ERROR_TYPE_SERVER;
|
|
std::string err_msg;
|
|
|
|
// for ERROR_TYPE_EXCEED_CONTEXT_SIZE
|
|
int32_t n_prompt_tokens = 0;
|
|
int32_t n_ctx = 0;
|
|
|
|
virtual bool is_error() override {
|
|
return true;
|
|
}
|
|
|
|
virtual json to_json() override;
|
|
};
|
|
|
|
struct server_task_result_metrics : server_task_result {
|
|
int n_idle_slots;
|
|
int n_processing_slots;
|
|
int n_tasks_deferred;
|
|
int64_t t_start;
|
|
|
|
// TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
|
|
uint64_t n_prompt_tokens_processed_total = 0;
|
|
uint64_t t_prompt_processing_total = 0;
|
|
uint64_t n_tokens_predicted_total = 0;
|
|
uint64_t t_tokens_generation_total = 0;
|
|
|
|
uint64_t n_tokens_max = 0;
|
|
|
|
uint64_t n_prompt_tokens_processed = 0;
|
|
uint64_t t_prompt_processing = 0;
|
|
|
|
uint64_t n_tokens_predicted = 0;
|
|
uint64_t t_tokens_generation = 0;
|
|
|
|
uint64_t n_decode_total = 0;
|
|
uint64_t n_busy_slots_total = 0;
|
|
|
|
// while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
|
|
// therefore, we use json to temporarily store the slot.to_json() result
|
|
json slots_data = json::array();
|
|
|
|
virtual json to_json() override;
|
|
};
|
|
|
|
struct server_task_result_slot_save_load : server_task_result {
|
|
std::string filename;
|
|
bool is_save; // true = save, false = load
|
|
|
|
size_t n_tokens;
|
|
size_t n_bytes;
|
|
double t_ms;
|
|
|
|
virtual json to_json() override;
|
|
};
|
|
|
|
struct server_task_result_slot_erase : server_task_result {
|
|
size_t n_erased;
|
|
|
|
virtual json to_json() override;
|
|
};
|
|
|
|
struct server_task_result_control : server_task_result {
|
|
bool success = false;
|
|
std::string message; // optional detail when success is false
|
|
|
|
virtual json to_json() override {
|
|
json out = json { { "success", success } };
|
|
if (!message.empty()) {
|
|
out["message"] = message;
|
|
}
|
|
return out;
|
|
}
|
|
};
|
|
|
|
struct server_task_result_get_lora : server_task_result {
|
|
struct lora {
|
|
common_adapter_lora_info info;
|
|
std::string alora_invocation_string;
|
|
llama_tokens alora_invocation_tokens;
|
|
};
|
|
std::vector<lora> loras;
|
|
|
|
virtual json to_json() override;
|
|
};
|
|
|
|
struct server_task_result_apply_lora : server_task_result {
|
|
virtual json to_json() override;
|
|
};
|
|
|
|
// Two byte buffers stored with a server_prompt.
// NOTE(review): `main` / `drft` presumably hold state for the main and draft contexts - confirm.
struct server_prompt_data {
    std::vector<uint8_t> main;
    std::vector<uint8_t> drft;

    // combined size of both buffers, in bytes
    size_t size() const {
        return main.size() + drft.size();
    }
};
|
|
|
|
struct server_prompt {
|
|
server_tokens tokens;
|
|
|
|
server_prompt_data data;
|
|
|
|
std::list<common_prompt_checkpoint> checkpoints;
|
|
|
|
size_t size() const {
|
|
size_t res = 0;
|
|
|
|
res += data.size();
|
|
|
|
for (const auto & ckpt : checkpoints) {
|
|
res += ckpt.size();
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
int n_tokens() const {
|
|
return tokens.size();
|
|
}
|
|
|
|
server_prompt clone() const {
|
|
return server_prompt {
|
|
tokens.clone(),
|
|
data,
|
|
checkpoints,
|
|
};
|
|
}
|
|
};
|
|
|
|
struct server_prompt_cache {
|
|
server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) {
|
|
this->limit_size = 1024ull*1024ull*(limit_size_mib < 0 ? 0 : limit_size_mib);
|
|
this->limit_tokens = limit_tokens;
|
|
}
|
|
|
|
std::list<server_prompt> states;
|
|
|
|
// in bytes, 0 = no limit
|
|
size_t limit_size = 0;
|
|
|
|
// in tokens, 0 = no limit
|
|
size_t limit_tokens = 0;
|
|
|
|
size_t size() const;
|
|
|
|
size_t n_tokens() const;
|
|
|
|
server_prompt * alloc(const server_prompt & prompt, size_t state_size_main, size_t state_size_drft);
|
|
|
|
bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx_main, llama_context * ctx_drft, int32_t id_slot);
|
|
|
|
void update();
|
|
};
|