mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
server: add SSE ping interval (#24013)
This commit is contained in:
@@ -3693,6 +3693,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
auto res = create_response();
|
||||
auto completion_id = gen_chatcmplid();
|
||||
auto & rd = res->rd;
|
||||
auto & params = this->params;
|
||||
|
||||
try {
|
||||
std::vector<server_task> tasks;
|
||||
@@ -3828,7 +3829,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
}
|
||||
res->status = 200;
|
||||
res->content_type = "text/event-stream";
|
||||
res->next = [res_this = res.get(), res_type, &req](std::string & output) -> bool {
|
||||
res->next = [res_this = res.get(), res_type, &req, &params](std::string & output) -> bool {
|
||||
static auto format_error = [](task_response_type res_type, const json & res_json) {
|
||||
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
|
||||
return format_anthropic_sse({
|
||||
@@ -3873,7 +3874,25 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||
}
|
||||
|
||||
// receive subsequent results
|
||||
auto result = rd.next(req.should_stop);
|
||||
bool timeout = false;
|
||||
int64_t start_time = ggml_time_ms();
|
||||
auto result = rd.next([&timeout, &req, &start_time, &params]() {
|
||||
if (req.should_stop()) {
|
||||
return true; // should_stop condition met
|
||||
} else if (params.sse_ping_interval > 0 && ggml_time_ms() - start_time > (int64_t)params.sse_ping_interval * 1000) {
|
||||
timeout = true;
|
||||
return true; // timeout
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
if (timeout) {
|
||||
// some clients (e.g. undici) will time out if no data is received for a while, so we need to send a ping to keep the connection alive
|
||||
SRV_DBG("%s", "sending SSE ping\n");
|
||||
output = ":\n\n";
|
||||
return true;
|
||||
}
|
||||
|
||||
if (result == nullptr) {
|
||||
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
|
||||
GGML_ASSERT(req.should_stop());
|
||||
|
||||
@@ -381,10 +381,6 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
|
||||
if (result == nullptr) {
|
||||
// timeout, check stop condition
|
||||
if (should_stop()) {
|
||||
const int64_t time_elapsed_ms = ggml_time_ms() - time_start_ms;
|
||||
if (time_elapsed_ms > 30000) {
|
||||
SRV_WRN("%s", "request cancelled after 30s, potentially a client-side timeout; please check your client's code\n");
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -169,8 +169,6 @@ struct server_response_reader {
|
||||
bool cancelled = false;
|
||||
int polling_interval_seconds;
|
||||
|
||||
const int64_t time_start_ms = ggml_time_ms();
|
||||
|
||||
// tracking generation state and partial tool calls
|
||||
// only used by streaming completions
|
||||
std::vector<task_result_state> states;
|
||||
|
||||
Reference in New Issue
Block a user