Compare commits

...

12 Commits

Author SHA1 Message Date
Mikolaj Kucharski fabde3bf51 arg: Add comment line support to --api-key-file (#23168) 2026-06-19 17:33:54 +02:00
Alessandro de Oliveira Faria (A.K.A.CABELO) 0d2d9ccbf6 vendor : update cpp-httplib to 0.48.0 (#24787) 2026-06-19 22:16:35 +08:00
Xuan-Son Nguyen 8c2d6f6475 server: add --agent arg, remove redundant webui naming compat (#24801)
* server: add --agent arg, remove redundant webui naming compat

* correct env

* fix the test

* llama-gen-docs

* nits: wordings
2026-06-19 16:06:13 +02:00
Aldehir Rojas 38724ab593 docker : build the UI (#24794)
* docker : build the UI

* cont : use existing APP_VERSION
2026-06-19 15:32:31 +02:00
Xuan-Son Nguyen e2e7a9b2d0 mtmd: several bug fixes (#24784)
* mtmd: several bug fixes

* fix build

* fix gemma4ua

* add sanity check in get_u32()

* fix build (2)

* area() avoid overflow
2026-06-19 12:18:36 +02:00
Ruixiang Wang b14e3fb90c spec: support eagle3 for qwen3.5 & 3.6 (#24593)
* spec: support qwen3.5 & 3.6 eagle3 draft

* eagle3: Add deferred boundary checkpoints restore support for hybrid models

* apply suggestions

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* spec: adapt to API change

* spec: fix naming

* cont : add TODO

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-06-19 13:08:50 +03:00
Xuan-Son Nguyen 159d093a43 server: fix non-bound n_discard value (ctx shifting) (#24786)
* server: fix non-bound n_discard value

* Update tools/server/server-context.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-06-19 10:53:44 +02:00
Georgi Gerganov 5fd2dc2c41 sync : ggml 2026-06-19 10:19:14 +03:00
Georgi Gerganov 1868af13ac ggml : bump version to 0.15.2 (ggml/1548) 2026-06-19 10:19:14 +03:00
Georgi Gerganov 5bd21b8555 pi : remove docs from system prompt (#24791) 2026-06-19 09:34:00 +03:00
Georgi Gerganov 80452d65b9 server : consolidate slot selection into get_available_slot (#24755)
Absorb get_slot_by_id logic into get_available_slot so slot selection
is handled by a single function call. When a specific slot id is
requested, the LCP similarity check still runs to enable proper
prompt cache updates.

Assisted-by: pi:llama.cpp/Qwen3.6-27B
2026-06-19 09:22:34 +03:00
shalinib-ibm 8141e730f1 ggml-cpu: support K tails in power10 Q8/Q4 MMA matmul (#24753)
* ggml-cpu: support K tails in Power10 MMA Q8/Q4 matmul

This patch removes the requirement that K be divisible by kc in the tinyBlas_Q0_PPC tiled matmul path. Process the final K panel using its actual depth and pass the reduced panel size through packing and kernel execution.  This allows more workloads to use the MMA kernel and reduces fallback to mnpack.

* Apply suggestion from @taronaeo

Co-authored-by: Aaron Teo <taronaeo@gmail.com>

---------

Co-authored-by: Aaron Teo <taronaeo@gmail.com>
2026-06-19 08:55:38 +03:00
36 changed files with 610 additions and 356 deletions
+16
View File
@@ -13,6 +13,20 @@ ARG APP_REVISION=N/A
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM ${CANN_BASE_IMAGE} AS build
# -- Install build dependencies --
@@ -26,6 +40,8 @@ WORKDIR /app
# -- Copy project files --
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+16
View File
@@ -3,6 +3,20 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
@@ -16,6 +30,8 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
else \
+16
View File
@@ -11,6 +11,20 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG GCC_VERSION
@@ -26,6 +40,8 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
+16
View File
@@ -5,6 +5,20 @@ ARG APP_REVISION=N/A
## Build Image
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=ON
@@ -22,6 +36,8 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
+16
View File
@@ -10,6 +10,20 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
@@ -29,6 +43,8 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
+16
View File
@@ -22,6 +22,20 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
## Build Image
FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build
@@ -69,6 +83,8 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
# Build Stage
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
cmake -B build/ReleaseOV -G Ninja \
+16
View File
@@ -11,6 +11,20 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
@@ -38,6 +52,8 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build \
-DGGML_HIP=ON \
+16
View File
@@ -4,6 +4,20 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
### Build Llama.cpp stage
FROM docker.io/gcc:${GCC_VERSION} AS build
@@ -20,6 +34,8 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN --mount=type=cache,target=/root/.ccache \
--mount=type=cache,target=/app/build \
cmake -S . -B build -G Ninja \
+16
View File
@@ -3,6 +3,20 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
# Install build tools
@@ -17,6 +31,8 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
cmake --build build --config Release -j$(nproc)
+16
View File
@@ -3,6 +3,20 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
@@ -14,6 +28,8 @@ WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
cmake --build build -j $(nproc)
+3
View File
@@ -10,6 +10,9 @@
build*/
tools/ui/node_modules/
tools/ui/dist/
models/*
/llama-cli
-10
View File
@@ -25,13 +25,3 @@ Commits:
- Do not explicitly set the git author in commits - rely on the default git config
- Always use `--no-gpg-sign` when committing
- Never `git push` without explicit confirmation from the user
Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)
+20 -54
View File
@@ -2830,62 +2830,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.api_prefix = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
// Deprecated: use --ui-config instead (kept for backward compat)
add_opt(common_arg(
{"--webui-config"}, "JSON",
"[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)",
[](common_params & params, const std::string & value) {
params.ui_config_json = value;
params.webui_config_json = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
add_opt(common_arg(
{"--ui-config"}, "JSON",
{"--ui-config", "--webui-config"}, "JSON",
"JSON that provides default UI settings (overrides UI defaults)",
[](common_params & params, const std::string & value) {
params.ui_config_json = value;
params.webui_config_json = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
// Deprecated: use --ui-config-file instead (kept for backward compat)
add_opt(common_arg(
{"--webui-config-file"}, "PATH",
"[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)",
[](common_params & params, const std::string & value) {
params.ui_config_json = read_file(value);
params.webui_config_json = params.ui_config_json;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
add_opt(common_arg(
{"--ui-config-file"}, "PATH",
{"--ui-config-file", "--webui-config-file"}, "PATH",
"JSON file that provides default UI settings (overrides UI defaults)",
[](common_params & params, const std::string & value) {
params.ui_config_json = read_file(value);
params.webui_config_json = params.ui_config_json;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
// Deprecated: use --ui-mcp-proxy instead (kept for backward compat)
add_opt(common_arg(
{"--webui-mcp-proxy"},
{"--no-webui-mcp-proxy"},
"[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy",
[](common_params & params, bool value) {
params.ui_mcp_proxy = value;
params.webui_mcp_proxy = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
add_opt(common_arg(
{"--ui-mcp-proxy"},
{"--no-ui-mcp-proxy"},
{"--ui-mcp-proxy", "--webui-mcp-proxy"},
{"--no-ui-mcp-proxy", "--no-webui-mcp-proxy"},
"experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
[](common_params & params, bool value) {
params.ui_mcp_proxy = value;
params.webui_mcp_proxy = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
add_opt(common_arg(
@@ -2897,24 +2861,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.server_tools = parse_csv_row(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
// Deprecated: use --ui/--no-ui instead (kept for backward compat)
add_opt(common_arg(
{"--webui"},
{"--no-webui"},
"[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI",
add_opt(common_arg(
{"-ag", "--agent"},
{"-no-ag", "--no-agent"},
"whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
[](common_params & params, bool value) {
params.ui = value;
params.webui = value;
if (value) {
params.server_tools = {"all"};
params.ui_mcp_proxy = true;
} else {
params.server_tools.clear();
params.ui_mcp_proxy = false;
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_AGENT"));
add_opt(common_arg(
{"--ui"},
{"--no-ui"},
{"--ui", "--webui"},
{"--no-ui", "--no-webui"},
string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.ui = value;
params.webui = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
add_opt(common_arg(
@@ -2945,7 +2911,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
add_opt(common_arg(
{"--api-key-file"}, "FNAME",
"path to file containing API keys (default: none)",
"path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)",
[](common_params & params, const std::string & value) {
std::ifstream key_file(value);
if (!key_file) {
@@ -2953,7 +2919,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
std::string key;
while (std::getline(key_file, key)) {
if (!key.empty()) {
if (!key.empty() && key[0] != '#') {
params.api_keys.push_back(key);
}
}
+3 -1
View File
@@ -2034,7 +2034,7 @@ bool common_prompt_batch_decode(
}
size_t common_prompt_checkpoint::size() const {
return data_tgt.size() + data_dft.size();
return data_tgt.size() + data_dft.size() + data_spec.size();
}
bool common_prompt_checkpoint::empty() const {
@@ -2049,6 +2049,7 @@ void common_prompt_checkpoint::clear() {
data_tgt.clear();
data_dft.clear();
data_spec.clear();
}
void common_prompt_checkpoint::update_pos(
@@ -2138,4 +2139,5 @@ void common_prompt_checkpoint::clear_tgt() {
// Discards the draft-side checkpoint bytes (data_dft) together with the
// stashed speculative-decoding state (data_spec). data_tgt is not touched here.
void common_prompt_checkpoint::clear_dft() {
data_dft.clear();
data_spec.clear();
}
+5 -7
View File
@@ -363,7 +363,7 @@ struct common_params_speculative {
uint32_t need_n_rs_seq() const {
bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3;
});
return needs_rs_seq ? draft.n_max : 0u;
@@ -624,12 +624,6 @@ struct common_params {
// UI configs
bool ui = true;
// Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
bool webui = ui;
bool webui_mcp_proxy = false;
std::string webui_config_json;
bool ui_mcp_proxy = false;
std::string ui_config_json;
@@ -1065,6 +1059,10 @@ struct common_prompt_checkpoint {
std::vector<uint8_t> data_tgt;
std::vector<uint8_t> data_dft;
// (optional) speculative-decoding implementation state stashed with the checkpoint
// (e.g. eagle3's deferred-boundary g_embd row)
std::vector<uint8_t> data_spec;
size_t size() const;
bool empty() const;
+72
View File
@@ -161,6 +161,10 @@ struct common_speculative_impl {
virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;
// (optional) serialize/restore per-seq internal state (e.g. eagle3's deferred boundary).
// default: nothing to export - `data` is left untouched and false is returned
virtual bool get_state(llama_seq_id /*seq_id*/, std::vector<uint8_t> & /*data*/) const { return false; }
// default: no-op - the supplied `data` is ignored
virtual void set_state(llama_seq_id /*seq_id*/, const std::vector<uint8_t> & /*data*/) {}
// true if this implementation requires the target context to extract post-norm embeddings
virtual bool need_embd() const = 0;
@@ -841,6 +845,49 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
(size_t) n_embd_dec * sizeof(float));
}
// we only need to stash the deferred boundary's g_embd row for recurrent/hybrid targets:
// their single-position checkpoints drop it on restore
bool need_boundary_stash() const {
const llama_model * model_tgt = llama_get_model(params.ctx_tgt);
return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt);
}
bool get_state(llama_seq_id seq_id, std::vector<uint8_t> & data) const override {
    // Serialized layout: [llama_pos pending position][pending g_embd row as raw floats].
    // `data` is only modified when true is returned.
    if (!need_boundary_stash()) {
        return false;
    }

    // nothing to export for an out-of-range sequence or when no position is pending (< 0)
    const bool seq_in_range = seq_id >= 0 && seq_id < (llama_seq_id) n_seq;
    if (!seq_in_range || pending_pos_last[seq_id] < 0) {
        return false;
    }

    const std::vector<float> & g_row     = pending_g_last[seq_id];
    const llama_pos            pos       = pending_pos_last[seq_id];
    const size_t               n_bytes_g = g_row.size() * sizeof(float);

    data.resize(sizeof(llama_pos) + n_bytes_g);

    uint8_t * dst = data.data();
    std::memcpy(dst, &pos, sizeof(llama_pos));
    std::memcpy(dst + sizeof(llama_pos), g_row.data(), n_bytes_g);

    return true;
}
void set_state(llama_seq_id seq_id, const std::vector<uint8_t> & data) override {
    // Expected layout: [llama_pos][n_embd_dec floats]. Anything else is silently ignored.
    if (!need_boundary_stash()) {
        return;
    }

    const bool seq_in_range = seq_id >= 0 && seq_id < (llama_seq_id) n_seq;
    if (!seq_in_range) {
        return;
    }

    // reject blobs whose size does not match exactly one position plus one embedding row
    const size_t n_bytes_g = (size_t) n_embd_dec * sizeof(float);
    if (data.size() != sizeof(llama_pos) + n_bytes_g) {
        return;
    }

    const uint8_t * src = data.data();

    llama_pos pos = -1;
    std::memcpy(&pos, src, sizeof(llama_pos));
    pending_pos_last[seq_id] = pos;

    auto & g_row = pending_g_last[seq_id];
    g_row.resize(n_embd_dec);
    std::memcpy(g_row.data(), src + sizeof(llama_pos), n_bytes_g);
}
bool need_embd() const override {
return false;
}
@@ -2118,6 +2165,31 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
}
}
// TODO: support the case of more than one speculative implementations having a state
bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data) {
    // The first implementation that reports having state wins; the rest are not queried.
    // A null `spec` yields false.
    if (spec != nullptr) {
        for (auto & impl : spec->impls) {
            const bool has_state = impl->get_state(seq_id, data);
            if (has_state) {
                return true;
            }
        }
    }

    return false;
}
void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data) {
    // Hands the same blob to every implementation; a null `spec` is a no-op.
    if (spec != nullptr) {
        for (auto & impl : spec->impls) {
            impl->set_state(seq_id, data);
        }
    }
}
void common_speculative_print_stats(const common_speculative * spec) {
if (spec == nullptr) {
return;
+4
View File
@@ -68,6 +68,10 @@ void common_speculative_draft(common_speculative * spec);
// informs the speculative context that n_accepted tokens were accepted by the target model
void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);
// (optional) get/set internal state
bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data);
void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data);
// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);
+1 -1
View File
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 15)
set(GGML_VERSION_PATCH 1)
set(GGML_VERSION_PATCH 2)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
+6 -5
View File
@@ -2345,7 +2345,7 @@ class tinyBLAS_Q0_PPC {
else if (n_aligned % 16 == 0) nc = 16;
else nc = 8;
}
bool can_use_tiled = n_aligned > 0 && (m % mc == 0) && (k % kc == 0);
bool can_use_tiled = n_aligned > 0 && (m % mc == 0);
if (can_use_tiled) {
matmul_tiled(m, n_aligned, mc, nc, kc);
if (n > n_aligned) {
@@ -3063,13 +3063,14 @@ class tinyBLAS_Q0_PPC {
int64_t ii = (job / xtiles) * mc;
int64_t jj = (job % xtiles) * nc;
for (int64_t kk = 0; kk < k; kk += kc) {
int64_t k_cur = MIN(kc, k - kk);
if constexpr(is_Ablock_q4) {
packNormal_q4_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
packNormal_q4_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
} else {
packNormal_q8_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack);
packNormal_q8_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack);
}
packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, kc, (uint8_t *)B_pack);
KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack);
packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, k_cur, (uint8_t *)B_pack);
KERNEL_Q0(ii, jj, mc, nc, k_cur, kk, A_pack, B_pack);
}
}
}
+1 -1
View File
@@ -1 +1 @@
3af5f5760e19a96427f5f7a93b79cbdf3d4b265b
707321c4cf6d21cb4bc831aa8b687dbf01a521ce
+1 -1
View File
@@ -5,7 +5,7 @@ import os
import sys
import subprocess
HTTPLIB_VERSION = "refs/tags/v0.47.0"
HTTPLIB_VERSION = "refs/tags/v0.48.0"
vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
+2
View File
@@ -156,6 +156,8 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
// MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;
ggml_tensor * inpSA = inpL;
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+2
View File
@@ -179,6 +179,8 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
// MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;
ggml_tensor * inpSA = inpL;
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+2 -1
View File
@@ -161,7 +161,7 @@
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
| `--image, --audio, --video FILE` | path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
@@ -174,6 +174,7 @@
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) |
| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
+29 -1
View File
@@ -1675,6 +1675,9 @@ struct clip_model_loader {
// note: some models having hparams.image_size == 0, which means the image size is dynamic
throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size));
}
if (hparams.image_size > 65536) {
throw std::runtime_error(string_format("%s: image_size (%d) is too large (max 65536)\n", __func__, hparams.image_size));
}
if (hparams.patch_size <= 0) {
throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
}
@@ -1723,6 +1726,19 @@ struct clip_model_loader {
LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
// GEMMA4UA is encoder-free: it uses n_mel_bins as a raw-waveform frame size (640) and has no FFT/filterbank, so the mel-range and FFT
// checks below do not apply to it.
const bool fft_based = model.proj_type != PROJECTOR_TYPE_GEMMA4UA;
// Validate audio hparams loaded from GGUF metadata
if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) {
throw std::runtime_error(string_format("%s: n_mel_bins (%d) must be in range [1, 256]\n", __func__, hparams.n_mel_bins));
}
if (fft_based && (hparams.audio_sample_rate <= 0 || hparams.audio_n_fft <= 0 || hparams.audio_hop_len <= 0 || hparams.audio_window_len <= 0)) {
throw std::runtime_error(string_format("%s: audio hparams invalid: sample_rate=%d n_fft=%d window_len=%d hop_len=%d\n",
__func__, hparams.audio_sample_rate, hparams.audio_n_fft, hparams.audio_window_len, hparams.audio_hop_len));
}
}
LOG_INF("\n");
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
@@ -2831,6 +2847,12 @@ struct clip_model_loader {
img.set_size({sz, sz}, false, false);
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz);
} else {
// GEMMA4UA uses n_mel_bins as a raw-waveform frame size (640), not a mel-bin count,
// so the [1, 256] bound only applies to FFT-based models.
const bool fft_based = ctx_clip.model.proj_type != PROJECTOR_TYPE_GEMMA4UA;
if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) {
throw std::runtime_error(string_format("%s: invalid n_mel_bins (%d), must be in [1, 256]\n", __func__, hparams.n_mel_bins));
}
img.set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false);
LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
}
@@ -2994,7 +3016,13 @@ struct clip_model_loader {
}
return;
}
output = gguf_get_val_u32(ctx_gguf.get(), i);
const uint32_t val = gguf_get_val_u32(ctx_gguf.get(), i);
// sanity check
if (val > (uint32_t) INT32_MAX) {
throw std::runtime_error(string_format("%s: value %u for key '%s' exceeds INT32_MAX\n",
__func__, val, key.c_str()));
}
output = (int) val;
}
void get_f32(const std::string & key, float & output, bool required = true) const {
+3
View File
@@ -24,6 +24,9 @@ struct clip_image_size {
return !(*this == other);
}
// Returns width * height. Asserts that both dimensions lie in [0, 46000].
int area() const {
// avoid overflow when computing area
// (46000 * 46000 = 2,116,000,000, which is below INT32_MAX = 2,147,483,647;
//  assumes int is at least 32 bits wide)
GGML_ASSERT(width >= 0 && width <= 46000);
GGML_ASSERT(height >= 0 && height <= 46000);
return width * height;
}
};
+76 -63
View File
@@ -32,8 +32,8 @@ void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) {
}
}
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
int n_fft,
void mtmd_audio_cache::fill_mel_filterbank_matrix(int64_t n_mel,
int64_t n_fft,
int sample_rate,
float fmin,
float fmax,
@@ -86,11 +86,16 @@ void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
hz_pts[i] = mel_to_hz(mel_pts[i]);
}
const int n_fft_bins = n_fft / 2 + 1;
const int64_t n_fft_bins = n_fft / 2 + 1;
// Validate allocation size
if ((size_t)n_mel * (size_t)n_fft_bins > SIZE_MAX) {
GGML_ASSERT(false && "mel filterbank allocation too large");
}
// filterbank
std::vector<float> out(n_mel * n_fft_bins, 0);
for (int m = 0; m < n_mel; ++m) {
std::vector<float> out((size_t)n_mel * (size_t)n_fft_bins, 0);
for (int64_t m = 0; m < n_mel; ++m) {
const double f_left = hz_pts[m];
const double f_center = hz_pts[m + 1];
const double f_right = hz_pts[m + 2];
@@ -266,8 +271,8 @@ static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out)
}
struct filter_params {
int32_t n_mel;
int32_t n_fft_bins;
int64_t n_mel;
int64_t n_fft_bins;
int32_t hann_window_size;
int32_t hop_length;
int32_t sample_rate;
@@ -293,8 +298,8 @@ static void log_mel_spectrogram_worker_thread(int ith,
std::vector<float> fft_in(frame_size * 2, 0.0);
std::vector<float> fft_out(frame_size * 2 * 2 * 2);
int n_fft_bins = params.n_fft_bins;
int i = ith;
int64_t n_fft_bins = params.n_fft_bins;
int64_t i = ith;
const auto & filters = cache.filters;
@@ -302,17 +307,18 @@ static void log_mel_spectrogram_worker_thread(int ith,
GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
// calculate FFT only when fft_in are not all zero
for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
const int offset = i * frame_step;
for (; i < std::min((int64_t)(n_samples / frame_step + 1), out.n_len); i += n_threads) {
const int64_t offset = i * frame_step;
// apply Hann window (~10% faster)
for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
const int valid_len = std::min(frame_size, std::max(0, n_samples - (int)offset));
for (int j = 0; j < valid_len; j++) {
fft_in[j] = hann[j] * samples[offset + j];
}
// fill the rest with zeros
if (n_samples - offset < frame_size) {
std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
if (valid_len < frame_size) {
std::fill(fft_in.begin() + valid_len, fft_in.end(), 0.0);
}
// FFT
@@ -325,7 +331,7 @@ static void log_mel_spectrogram_worker_thread(int ith,
}
// mel spectrogram
for (int j = 0; j < out.n_mel; j++) {
for (int64_t j = 0; j < out.n_mel; j++) {
double sum = 0.0;
// unroll loop (suggested by GH user @lunixbochs)
int k = 0;
@@ -339,21 +345,21 @@ static void log_mel_spectrogram_worker_thread(int ith,
}
// handle n_fft remainder
for (; k < n_fft_bins; k++) {
sum += fft_out[k] * filters.data[j * n_fft_bins + k];
sum += fft_out[k] * filters.data[(size_t)j * n_fft_bins + k];
}
sum = std::max(sum, (double)params.mel_floor);
sum = params.use_natural_log
? log(sum)
: log10(sum);
out.data[j * out.n_len + i] = sum;
out.data[(size_t)j * out.n_len + i] = sum;
}
}
// Otherwise fft_out are all zero
double sum = params.use_natural_log ? log(1e-10) : log10(1e-10);
for (; i < out.n_len; i += n_threads) {
for (int j = 0; j < out.n_mel; j++) {
out.data[j * out.n_len + i] = sum;
for (int64_t j = 0; j < out.n_mel; j++) {
out.data[(size_t)j * out.n_len + i] = sum;
}
}
}
@@ -437,16 +443,21 @@ static bool log_mel_spectrogram(
GGML_ASSERT(params.hop_length > 0);
out.n_mel = params.n_mel;
out.n_len = (n_samples - frame_size) / frame_step + 1;
// TODO: handle these checks better
if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) {
LOG_ERR("%s: size overflow\n", __func__);
// Validate dimensions before allocation to prevent integer overflow
if (out.n_mel <= 0 || out.n_len <= 0) {
LOG_ERR("%s: invalid mel dimensions n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len);
return false;
}
const size_t total_size = (size_t)out.n_mel * (size_t)out.n_len;
if (total_size > SIZE_MAX / sizeof(float)) {
LOG_ERR("%s: size overflow: n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len);
return false;
}
if (n_samples < frame_size) {
LOG_ERR("%s: not enough samples after padding\n", __func__);
return false;
}
out.data.resize(out.n_mel * out.n_len);
out.data.resize(total_size);
{
std::vector<std::thread> workers(n_threads - 1);
@@ -464,38 +475,39 @@ static bool log_mel_spectrogram(
}
}
const int effective_n_len = n_samples_in / frame_step;
const int64_t effective_n_len = n_samples_in / frame_step;
if (params.norm_per_feature) {
GGML_ASSERT(effective_n_len > 1);
for (int i = 0; i < out.n_mel; i++) {
for (int64_t i = 0; i < out.n_mel; i++) {
double mean = 0;
for (int j = 0; j < effective_n_len; ++j) {
mean += out.data[i * out.n_len + j];
for (int64_t j = 0; j < effective_n_len; ++j) {
mean += out.data[(size_t)i * out.n_len + j];
}
mean /= effective_n_len;
double var = 0.0;
for (int j = 0; j < effective_n_len; ++j) {
const double value = out.data[i * out.n_len + j] - mean;
for (int64_t j = 0; j < effective_n_len; ++j) {
const double value = out.data[(size_t)i * out.n_len + j] - mean;
var += value * value;
}
var /= effective_n_len - 1; // unbiased
const double mstd = std::sqrt(var + 1e-5);
for (int j = 0; j < effective_n_len; ++j) {
auto &value = out.data[i * out.n_len + j];
for (int64_t j = 0; j < effective_n_len; ++j) {
auto &value = out.data[(size_t)i * out.n_len + j];
value = (value - mean) / mstd;
}
// pad the rest with zeros
for (int j = effective_n_len; j < out.n_len; ++j) {
out.data[i * out.n_len + j] = 0.0;
for (int64_t j = effective_n_len; j < out.n_len; ++j) {
out.data[(size_t)i * out.n_len + j] = 0.0;
}
}
} else if (!params.no_padding) {
// Whisper-style clamping and normalization (NOT used by Gemma4)
double mmax = -1e20;
for (int i = 0; i < out.n_mel*out.n_len; i++) {
const size_t mel_size = (size_t)out.n_mel * (size_t)out.n_len;
for (size_t i = 0; i < mel_size; i++) {
if (out.data[i] > mmax) {
mmax = out.data[i];
}
@@ -503,7 +515,7 @@ static bool log_mel_spectrogram(
mmax -= 8.0;
for (int i = 0; i < out.n_mel*out.n_len; i++) {
for (size_t i = 0; i < mel_size; i++) {
if (out.data[i] < mmax) {
out.data[i] = mmax;
}
@@ -582,13 +594,13 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
// because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
// we always expect the mel to have 3000 silent frames at the end
if (DEBUG) {
printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
printf("output: n_mel = %d, n_len = %d\n", (int) out_full.n_mel, (int) out_full.n_len);
}
const size_t frames_per_chunk = 3000;
GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
if ((size_t) n_len < frames_per_chunk) {
int64_t n_len = std::min((int64_t)frames_per_chunk, out_full.n_len - (int64_t)off);
if (n_len < (int64_t)frames_per_chunk) {
break; // last incomplete chunk will always be a padded chunk, safe to ignore
}
@@ -596,10 +608,10 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
out_chunk.n_len = n_len;
out_chunk.n_mel = out_full.n_mel;
out_chunk.n_len_org = out_full.n_mel; // unused
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
out_chunk.data.reserve((size_t)out_chunk.n_mel * (size_t)out_chunk.n_len);
for (int i = 0; i < out_full.n_mel; i++) {
auto src = out_full.data.begin() + i * out_full.n_len + off;
for (int64_t i = 0; i < out_full.n_mel; i++) {
auto src = out_full.data.begin() + (size_t)i * out_full.n_len + off;
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
}
@@ -681,8 +693,8 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa
// The effective frame count: center-padded STFT gives ~n_samples/hop_length frames.
// We take min(mel_full.n_len, n_samples/hop + 1) to avoid including excess frames.
const int n_eff = std::min(mel_full.n_len,
(int)(n_samples / hparams.audio_hop_len) + 1);
const int64_t n_eff = std::min(mel_full.n_len,
(int64_t)(n_samples / hparams.audio_hop_len) + 1);
// Split into inference windows matching n_window_infer=800 from model config.
// Each window is padded to the next multiple of chunk_size for the cgraph.
@@ -690,18 +702,18 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa
const int chunk_size = 100; // conv sub-chunk size (n_window * 2, n_window=50)
const int window_size = 800; // mel frames per forward pass (n_window_infer=800)
for (int off = 0; off < n_eff; off += window_size) {
const int win_eff = std::min(window_size, n_eff - off);
const int n_chunks = (win_eff + chunk_size - 1) / chunk_size;
const int n_padded = n_chunks * chunk_size;
for (int64_t off = 0; off < n_eff; off += window_size) {
const int64_t win_eff = std::min((int64_t)window_size, n_eff - off);
const int64_t n_chunks = (win_eff + chunk_size - 1) / chunk_size;
const int64_t n_padded = n_chunks * chunk_size;
mtmd_audio_mel out;
out.n_mel = mel_full.n_mel;
out.n_len = n_padded;
out.n_len_org = win_eff;
out.data.assign(out.n_mel * out.n_len, 0.0f);
for (int m = 0; m < out.n_mel; m++) {
const int copy_len = std::min(win_eff, mel_full.n_len - off);
out.data.assign((size_t)out.n_mel * (size_t)out.n_len, 0.0f);
for (int64_t m = 0; m < out.n_mel; m++) {
const int64_t copy_len = std::min((int64_t)win_eff, mel_full.n_len - off);
if (copy_len > 0) {
std::copy(mel_full.data.begin() + (size_t)m * mel_full.n_len + off,
mel_full.data.begin() + (size_t)m * mel_full.n_len + off + copy_len,
@@ -823,37 +835,38 @@ bool mtmd_audio_preprocessor_granite_speech::preprocess(const float *
}
double mmax = -1e20;
for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
const size_t mel_size = (size_t)mel.n_mel * (size_t)mel.n_len;
for (size_t i = 0; i < mel_size; i++) {
if (mel.data[i] > mmax) {
mmax = mel.data[i];
}
}
mmax -= 8.0;
for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
for (size_t i = 0; i < mel_size; i++) {
if (mel.data[i] < mmax) {
mel.data[i] = mmax;
}
mel.data[i] = (mel.data[i] + 4.0) / 4.0;
}
int n_frames = mel.n_len;
int64_t n_frames = mel.n_len;
if (n_frames % 2 == 1) {
n_frames--;
}
const int n_mel = mel.n_mel;
const int n_stacked = n_frames / 2;
const int64_t n_mel = mel.n_mel;
const int64_t n_stacked = n_frames / 2;
mtmd_audio_mel stacked;
stacked.n_mel = 2 * n_mel;
stacked.n_len = n_stacked;
stacked.n_len_org = (int)n_samples;
stacked.data.resize(2 * n_mel * n_stacked);
stacked.n_len_org = (int64_t)n_samples;
stacked.data.resize((size_t)2 * (size_t)n_mel * (size_t)n_stacked);
for (int t = 0; t < n_stacked; t++) {
for (int m = 0; m < n_mel; m++) {
stacked.data[m * n_stacked + t] = mel.data[m * mel.n_len + 2 * t];
stacked.data[(m + n_mel) * n_stacked + t] = mel.data[m * mel.n_len + 2 * t + 1];
for (int64_t t = 0; t < n_stacked; t++) {
for (int64_t m = 0; m < n_mel; m++) {
stacked.data[(size_t)m * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t];
stacked.data[(size_t)(m + n_mel) * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t + 1];
}
}
@@ -921,8 +934,8 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * s
const int hop = hparams.audio_hop_len;
const int n_with_left = (int)chunk_len + pad_left;
// PyTorch: unfold(size=frame_length+1, step=hop) on semicausal-padded waveform
const int pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1;
const int n_padded_needed = (pt_frames - 1) * hop + fft_size;
const int64_t pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1;
const int64_t n_padded_needed = (pt_frames - 1) * hop + fft_size;
const int total_pad = std::max((int)(n_padded_needed - (int)chunk_len), pad_left);
std::vector<float> padded_samples(total_pad + chunk_len, 0.0f);
std::copy(chunk_ptr, chunk_ptr + chunk_len, padded_samples.data() + pad_left);
+7 -7
View File
@@ -10,16 +10,16 @@
#define MTMD_INTERNAL_HEADER
struct mtmd_audio_mel {
int n_len;
int n_len_org;
int n_mel;
int64_t n_len;
int64_t n_len_org;
int64_t n_mel;
std::vector<float> data;
};
struct mtmd_audio_mel_filters {
int32_t n_mel;
int32_t n_fft;
int64_t n_mel;
int64_t n_fft;
std::vector<float> data;
};
@@ -39,8 +39,8 @@ struct mtmd_audio_cache {
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
void fill_mel_filterbank_matrix(int n_mel,
int n_fft,
void fill_mel_filterbank_matrix(int64_t n_mel,
int64_t n_fft,
int sample_rate, // e.g. 16000
float fmin = 0.0f, // e.g. 0.0
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
+4 -1
View File
@@ -1295,9 +1295,12 @@ struct mtmd_tokenizer {
for (auto & mel_spec : mel_spec_chunks) {
const bool is_placeholder = mel_spec.data.empty();
// Validate dimensions fit in clip_image_size (int)
GGML_ASSERT(mel_spec.n_len <= INT32_MAX && mel_spec.n_len >= 0);
GGML_ASSERT(mel_spec.n_mel <= INT32_MAX && mel_spec.n_mel >= 0);
clip_image_f32 mel_f32;
mel_f32.set_size(
{mel_spec.n_len, mel_spec.n_mel},
{(int)mel_spec.n_len, (int)mel_spec.n_mel},
is_placeholder, /* is_audio */ true);
mel_f32.cpy_buf(mel_spec.data);
+9 -11
View File
@@ -175,13 +175,12 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `-tk, --talker-model FILE` | path to the qwen3-omni talker gguf, enables the /v1/audio/speech endpoint<br/>(env: LLAMA_ARG_TALKER_MODEL) |
| `-c2w, --code2wav-model FILE` | path to the qwen3-omni code2wav gguf, the talker code detokenizer<br/>(env: LLAMA_ARG_CODE2WAV_MODEL) |
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `--mtmd-batch-max-tokens N` | maximum number of image tokens per batch when encoding images (default: 1024)<br/>(env: LLAMA_ARG_MTMD_BATCH_MAX_TOKENS) |
| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) |
@@ -190,23 +189,21 @@ For the full list of features, please refer to [server's changelog](https://gith
| `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)<br/>(env: LLAMA_ARG_REUSE_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
| `--webui-config JSON` | [DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
| `--ui-config JSON` | JSON that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG) |
| `--webui-config-file PATH` | [DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
| `--ui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG_FILE) |
| `--webui-mcp-proxy, --no-webui-mcp-proxy` | [DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy<br/>(env: LLAMA_ARG_WEBUI_MCP_PROXY) |
| `--ui-mcp-proxy, --no-ui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_UI_MCP_PROXY) |
| `--ui-config, --webui-config JSON` | JSON that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG) |
| `--ui-config-file, --webui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG_FILE) |
| `--ui-mcp-proxy, --webui-mcp-proxy, --no-ui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_UI_MCP_PROXY) |
| `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)<br/>specify "all" to enable all tools<br/>available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime<br/>(env: LLAMA_ARG_TOOLS) |
| `--webui, --no-webui` | [DEPRECATED: use --ui/--no-ui] whether to enable the Web UI<br/>(env: LLAMA_ARG_WEBUI) |
| `--ui, --no-ui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_UI) |
| `-ag, --agent, -no-ag, --no-agent` | whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_AGENT) |
| `--ui, --webui, --no-ui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_UI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
| `--api-key-file FNAME` | path to file containing API keys (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
| `--api-key-file FNAME` | path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
| `--ssl-key-file FNAME` | path to a file containing a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
| `--ssl-cert-file FNAME` | path to a file containing a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
| `-to, --timeout N` | server read/write timeout in seconds (default: 3600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--sse-ping-interval N` | server SSE ping interval in seconds (-1 = disabled, default: 30)<br/>(env: LLAMA_ARG_SSE_PING_INTERVAL) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -231,6 +228,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `--sleep-idle-seconds SECONDS` | number of seconds of idleness after which the server will sleep (default: -1; -1 = disabled) |
| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) |
| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
+32 -15
View File
@@ -1302,11 +1302,8 @@ private:
}
}
// populate UI settings (from either new ui_config_json or deprecated webui_config_json)
{
const std::string & cfg = !params_base.ui_config_json.empty()
? params_base.ui_config_json
: params_base.webui_config_json;
const std::string & cfg = params_base.ui_config_json;
if (!cfg.empty()) {
try {
json json_settings = json::parse(cfg);
@@ -1395,11 +1392,23 @@ private:
bool update_cache = false;
// if a specific slot is requested, use it (still goes through cache update logic below)
if (task.id_slot != -1) {
ret = get_slot_by_id(task.id_slot);
if (ret) {
SLT_INF(*ret, "selected slot by id (%d)\n", task.id_slot);
}
}
// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
if (slot_prompt_similarity != 0.0f) {
float sim_best = 0;
for (server_slot & slot : slots) {
if (task.id_slot != -1 && slot.id != task.id_slot) {
continue;
}
// skip the slot if it is not available
if (slot.is_processing()) {
continue;
@@ -1426,8 +1435,10 @@ private:
if (ret != nullptr) {
const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size();
SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
sim_best, slot_prompt_similarity, f_keep);
if (task.id_slot == -1) {
SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
sim_best, slot_prompt_similarity, f_keep);
}
// if we are about to lose a large portion of the existing context - save it in the prompt cache
if (f_keep < 0.5f) {
@@ -2158,6 +2169,8 @@ private:
cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
// stash the draft's speculative state with the checkpoint
common_speculative_get_state(spec.get(), slot.id, cur.data_spec);
SLT_INF(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
@@ -2180,10 +2193,9 @@ private:
}
}
const int id_slot = task.id_slot;
const int id_task = task.id;
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
server_slot * slot = get_available_slot(task);
//
// slot scheduling logic
@@ -2552,7 +2564,10 @@ private:
n_keep = std::min(slot.n_ctx - 4, n_keep);
const int n_left = slot.prompt.n_tokens() - n_keep;
const int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2);
int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2);
// ref: https://github.com/ggml-org/llama.cpp/pull/24786
n_discard = std::clamp(n_discard, 0, std::max(0, n_left - 1));
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
@@ -2982,6 +2997,8 @@ private:
// restore the context checkpoint
it->load_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
// restore the draft's speculative state
common_speculative_set_state(spec.get(), slot.id, it->data_spec);
pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);
@@ -4284,18 +4301,18 @@ void server_routes::init_routes() {
{ "endpoint_props", params.endpoint_props },
{ "endpoint_metrics", params.endpoint_metrics },
// New keys
{ "ui", params.ui },
{ "ui_settings", meta->json_ui_settings },
{ "ui", params.ui },
{ "ui_settings", meta->json_ui_settings },
// Deprecated: use ui/ui_settings instead (kept for backward compat)
{ "webui", params.webui },
{ "webui_settings", meta->json_webui_settings },
{ "webui", params.ui },
{ "webui_settings", meta->json_ui_settings },
{ "chat_template", tmpl_default },
{ "chat_template_caps", meta->chat_template_caps },
{ "bos_token", meta->bos_token_str },
{ "eos_token", meta->eos_token_str },
{ "build_info", meta->build_info },
{ "is_sleeping", queue_tasks.is_sleeping() },
{ "cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy },
{ "cors_proxy_enabled", params.ui_mcp_proxy },
};
if (params.use_jinja) {
if (!tmpl_tools.empty()) {
+7 -8
View File
@@ -1462,9 +1462,9 @@ void server_models_routes::init_routes() {
auto res = std::make_unique<server_http_res>();
res_ok(res, {
// TODO: add support for this on web UI
{"role", "router"},
{"max_instances", params.models_max},
{"models_autoload", params.models_autoload},
{"role", "router"},
{"max_instances", params.models_max},
{"models_autoload", params.models_autoload},
// this is a dummy response to make sure the UI doesn't break
{"model_alias", "llama-server"},
{"model_path", "none"},
@@ -1473,11 +1473,10 @@ void server_models_routes::init_routes() {
{"n_ctx", 0},
}},
// New key
{"ui_settings", ui_settings},
// Deprecated: use ui_settings instead (kept for backward compat)
{"webui_settings", webui_settings},
{"build_info", std::string(llama_build_info())},
{"cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy},
{"ui_settings", ui_settings},
{"webui_settings", webui_settings},
{"build_info", std::string(llama_build_info())},
{"cors_proxy_enabled", params.ui_mcp_proxy},
});
return res;
}
+1 -4
View File
@@ -212,10 +212,7 @@ struct server_models_routes {
server_models models;
server_models_routes(const common_params & params, int argc, char ** argv)
: params(params), models(params, argc, argv) {
// Support both new ui_config_json and deprecated webui_config_json
const std::string & cfg = !this->params.ui_config_json.empty()
? this->params.ui_config_json
: this->params.webui_config_json;
const std::string & cfg = this->params.ui_config_json;
if (!cfg.empty()) {
try {
json json_settings = json::parse(cfg);
+1 -2
View File
@@ -227,8 +227,7 @@ int llama_server(int argc, char ** argv) {
ctx_http.register_gcp_compat();
// CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP)
// Supports both new ui_mcp_proxy and deprecated webui_mcp_proxy fields
if (params.ui_mcp_proxy || params.webui_mcp_proxy) {
if (params.ui_mcp_proxy) {
SRV_WRN("%s", "-----------------\n");
SRV_WRN("%s", "CORS proxy is enabled, do not expose server to untrusted environments\n");
SRV_WRN("%s", "This feature is EXPERIMENTAL and may be removed or changed in future versions\n");
+96 -145
View File
@@ -5809,11 +5809,9 @@ std::string decode_query_component(const std::string &component,
for (size_t i = 0; i < component.size(); i++) {
if (component[i] == '%' && i + 2 < component.size()) {
std::string hex = component.substr(i + 1, 2);
char *end;
unsigned long value = std::strtoul(hex.c_str(), &end, 16);
if (end == hex.c_str() + 2) {
result += static_cast<char>(value);
auto val = 0;
if (detail::from_hex_to_i(component, i + 1, 2, val)) {
result += static_cast<char>(val);
i += 2;
} else {
result += component[i];
@@ -12551,6 +12549,21 @@ bool parse_ipv4(const std::string &str, unsigned char *out) {
return *p == '\0';
}
// Parse an IP literal (IPv4 or IPv6) into raw network-order bytes.
// `out` must have room for at least 16 bytes. Returns the address length
// (4 for IPv4, 16 for IPv6) on success, or 0 if the string is not an IP
// literal. Used to match a host against iPAddress SANs the same way the
// OpenSSL backend does via X509_check_ip.
size_t parse_ip_address(const std::string &str, unsigned char *out) {
if (is_ipv4_address(str)) { return parse_ipv4(str, out) ? 4 : 0; }
struct in6_addr addr6 = {};
if (inet_pton(AF_INET6, str.c_str(), &addr6) == 1) {
memcpy(out, &addr6, 16);
return 16;
}
return 0;
}
#ifdef _WIN32
// Enumerate Windows system certificates and call callback with DER data
template <typename Callback>
@@ -12852,6 +12865,30 @@ int openssl_verify_callback(int preverify_ok, X509_STORE_CTX *ctx) {
return callback(verify_ctx) ? 1 : 0;
}
// X509_STORE_get0_objects is deprecated since OpenSSL 4.0 because it is not
// thread-safe; X509_STORE_get1_objects (OpenSSL 3.3+) returns a snapshot
// that must be released with release_store_objects
#if !defined(OPENSSL_IS_BORINGSSL) && !defined(LIBRESSL_VERSION_NUMBER) && \
OPENSSL_VERSION_NUMBER >= 0x30300000L
#define CPPHTTPLIB_HAS_X509_STORE_GET1_OBJECTS
#endif
// Returns the X509_OBJECT stack of `store`.
// When CPPHTTPLIB_HAS_X509_STORE_GET1_OBJECTS is defined this calls
// X509_STORE_get1_objects, otherwise X509_STORE_get0_objects. Callers should
// hand the result to release_store_objects() when done so that both variants
// are handled uniformly.
STACK_OF(X509_OBJECT) * get_store_objects(X509_STORE *store) {
#ifdef CPPHTTPLIB_HAS_X509_STORE_GET1_OBJECTS
return X509_STORE_get1_objects(store);
#else
return X509_STORE_get0_objects(store);
#endif
}
// Releases a stack previously returned by get_store_objects().
// With CPPHTTPLIB_HAS_X509_STORE_GET1_OBJECTS defined, the stack is passed to
// sk_X509_OBJECT_pop_free with X509_OBJECT_free as the element deleter;
// otherwise this is a no-op.
void release_store_objects(STACK_OF(X509_OBJECT) * objs) {
#ifdef CPPHTTPLIB_HAS_X509_STORE_GET1_OBJECTS
sk_X509_OBJECT_pop_free(objs, X509_OBJECT_free);
#else
(void)objs; // get0 variant returns an internal pointer; nothing to free
#endif
}
} // namespace impl
ctx_t create_client_context() {
@@ -13373,11 +13410,19 @@ std::string get_cert_subject_cn(cert_t cert) {
auto subject_name = X509_get_subject_name(x509);
if (!subject_name) return "";
char buf[256];
auto len =
X509_NAME_get_text_by_NID(subject_name, NID_commonName, buf, sizeof(buf));
if (len < 0) return "";
return std::string(buf, static_cast<size_t>(len));
// X509_NAME_get_text_by_NID is deprecated since OpenSSL 4.0
auto idx = X509_NAME_get_index_by_NID(subject_name, NID_commonName, -1);
if (idx < 0) return "";
auto entry = X509_NAME_get_entry(subject_name, idx);
if (!entry) return "";
auto data = X509_NAME_ENTRY_get_data(entry);
if (!data) return "";
return std::string(
reinterpret_cast<const char *>(ASN1_STRING_get0_data(data)),
static_cast<size_t>(ASN1_STRING_length(data)));
}
std::string get_cert_issuer_name(cert_t cert) {
@@ -13582,8 +13627,9 @@ size_t get_ca_certs(ctx_t ctx, std::vector<cert_t> &certs) {
auto store = SSL_CTX_get_cert_store(ssl_ctx);
if (!store) { return 0; }
auto objs = X509_STORE_get0_objects(store);
auto objs = impl::get_store_objects(store);
if (!objs) { return 0; }
auto se = detail::scope_exit([&] { impl::release_store_objects(objs); });
auto count = sk_X509_OBJECT_num(objs);
for (decltype(count) i = 0; i < count; i++) {
@@ -13609,8 +13655,9 @@ std::vector<std::string> get_ca_names(ctx_t ctx) {
auto store = SSL_CTX_get_cert_store(ssl_ctx);
if (!store) { return names; }
auto objs = X509_STORE_get0_objects(store);
auto objs = impl::get_store_objects(store);
if (!objs) { return names; }
auto se = detail::scope_exit([&] { impl::release_store_objects(objs); });
auto count = sk_X509_OBJECT_num(objs);
for (decltype(count) i = 0; i < count; i++) {
@@ -13716,110 +13763,6 @@ std::string verify_error_string(long error_code) {
} // namespace tls
// Checks whether `server_cert` identifies the host this client connects to.
// The subjectAltName check is tried first; the Common Name check runs only
// when that one does not succeed.
bool SSLClient::verify_host(X509 *server_cert) const {
  /* Quote from RFC2818 section 3.1 "Server Identity"

     If a subjectAltName extension of type dNSName is present, that MUST
     be used as the identity. Otherwise, the (most specific) Common Name
     field in the Subject field of the certificate MUST be used. Although
     the use of the Common Name is existing practice, it is deprecated and
     Certification Authorities are encouraged to use the dNSName instead.

     Matching is performed using the matching rules specified by
     [RFC2459]. If more than one identity of a given type is present in
     the certificate (e.g., more than one dNSName name, a match in any one
     of the set is considered acceptable.) Names may contain the wildcard
     character * which is considered to match any single domain name
     component or component fragment. E.g., *.a.com matches foo.a.com but
     not bar.foo.a.com. f*.com matches foo.com but not bar.com.

     In some cases, the URI is specified as an IP address rather than a
     hostname. In this case, the iPAddress subjectAltName must be present
     in the certificate and must exactly match the IP in the URI.
  */
  if (verify_host_with_subject_alt_name(server_cert)) { return true; }
  return verify_host_with_common_name(server_cert);
}
// Matches host_ against the subjectAltName extension of `server_cert`.
//
// When host_ parses as an IP literal (IPv6 is tried first, then IPv4), only
// SAN entries of type GEN_IPADD are considered and an entry matches only if
// its byte length and contents equal the parsed address. Otherwise only
// GEN_DNS entries are considered and compared with detail::match_hostname.
// On __MINGW32__ the IP parsing is compiled out, so host_ is always treated
// as a DNS name there.
//
// Returns true as soon as one SAN entry matches; false when the certificate
// has no subjectAltName extension or none of its entries match.
bool
SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const {
  auto ret = false;

  auto type = GEN_DNS;

  // Points at whichever parsed address (IPv4 or IPv6) host_ turned out to
  // be. Keeping one pointer with its own length stops an address of one
  // family from ever being compared using the other family's size.
  const void *addr_bytes = nullptr;
  size_t addr_len = 0;

#ifndef __MINGW32__
  struct in6_addr addr6 = {};
  struct in_addr addr = {};

  // inet_pton returns 1 on success, 0 for an unparsable string and -1 for an
  // unsupported address family; only 1 means the buffer holds an address.
  if (inet_pton(AF_INET6, host_.c_str(), &addr6) == 1) {
    type = GEN_IPADD;
    addr_bytes = &addr6;
    addr_len = sizeof(struct in6_addr);
  } else if (inet_pton(AF_INET, host_.c_str(), &addr) == 1) {
    type = GEN_IPADD;
    addr_bytes = &addr;
    addr_len = sizeof(struct in_addr);
  }
#endif

  auto alt_names = static_cast<const struct stack_st_GENERAL_NAME *>(
      X509_get_ext_d2i(server_cert, NID_subject_alt_name, nullptr, nullptr));

  if (alt_names) {
    auto matched = false;

    auto count = sk_GENERAL_NAME_num(alt_names);

    for (decltype(count) i = 0; i < count && !matched; i++) {
      auto val = sk_GENERAL_NAME_value(alt_names, i);
      if (!val || val->type != type) { continue; }

      auto name =
          reinterpret_cast<const char *>(ASN1_STRING_get0_data(val->d.ia5));
      if (name == nullptr) { continue; }

      auto name_len = static_cast<size_t>(ASN1_STRING_length(val->d.ia5));

      switch (type) {
      case GEN_DNS:
        matched = detail::match_hostname(std::string(name, name_len), host_);
        break;

      case GEN_IPADD:
        // The SAN must hold an address of the same size as the host
        // address; checking the length first also keeps memcmp inside both
        // buffers.
        matched =
            name_len == addr_len && memcmp(addr_bytes, name, addr_len) == 0;
        break;
      }
    }

    ret = matched;
  }

  GENERAL_NAMES_free(const_cast<STACK_OF(GENERAL_NAME) *>(
      reinterpret_cast<const STACK_OF(GENERAL_NAME) *>(alt_names)));
  return ret;
}
// Matches host_ against the Common Name of the certificate's subject.
// Returns false when the certificate has no subject name or when
// X509_NAME_get_text_by_NID reports -1 for NID_commonName; otherwise returns
// the result of detail::match_hostname on the extracted text.
bool SSLClient::verify_host_with_common_name(X509 *server_cert) const {
  const auto subject = X509_get_subject_name(server_cert);
  if (subject == nullptr) { return false; }

  char cn[BUFSIZ];
  const auto cn_len =
      X509_NAME_get_text_by_NID(subject, NID_commonName, cn, sizeof(cn));
  if (cn_len == -1) { return false; }

  return detail::match_hostname(
      std::string(cn, static_cast<size_t>(cn_len)), host_);
}
#endif // CPPHTTPLIB_OPENSSL_SUPPORT
/*
@@ -14622,10 +14565,10 @@ bool verify_hostname(cert_t cert, const char *hostname) {
auto mcert = static_cast<const mbedtls_x509_crt *>(cert);
std::string host_str(hostname);
// Check if hostname is an IP address
bool is_ip = impl::is_ipv4_address(host_str);
unsigned char ip_bytes[4];
if (is_ip) { impl::parse_ipv4(host_str, ip_bytes); }
// Check if hostname is an IP address (IPv4 or IPv6)
unsigned char ip_bytes[16];
auto ip_len = impl::parse_ip_address(host_str, ip_bytes);
auto is_ip = ip_len > 0;
// Check Subject Alternative Names (SAN)
// In Mbed TLS 3.x, subject_alt_names contains raw values without ASN.1 tags
@@ -14637,9 +14580,9 @@ bool verify_hostname(cert_t cert, const char *hostname) {
size_t len = san->buf.len;
if (is_ip) {
// Check if this SAN is an IPv4 address (4 bytes)
if (len == 4 && memcmp(p, ip_bytes, 4) == 0) { return true; }
// Check if this SAN is an IPv6 address (16 bytes) - skip for now
// For an IP host, only a matching iPAddress SAN of the same family
// (4 bytes for IPv4, 16 bytes for IPv6) may authenticate it.
if (len == ip_len && memcmp(p, ip_bytes, ip_len) == 0) { return true; }
} else {
// Check if this SAN is a DNS name (printable ASCII string)
bool is_dns = len > 0;
@@ -14654,21 +14597,25 @@ bool verify_hostname(cert_t cert, const char *hostname) {
san = san->next;
}
// Fallback: Check Common Name (CN) in subject
char cn[256];
int ret = mbedtls_x509_dn_gets(cn, sizeof(cn), &mcert->subject);
if (ret > 0) {
std::string cn_str(cn);
// Fallback: Check Common Name (CN) in subject. Skipped for IP-literal hosts:
// an IP identity is only valid via an iPAddress SAN, never the CN (RFC 9110;
// the OpenSSL backend's X509_check_ip behaves the same way).
if (!is_ip) {
char cn[256];
int ret = mbedtls_x509_dn_gets(cn, sizeof(cn), &mcert->subject);
if (ret > 0) {
std::string cn_str(cn);
// Look for "CN=" in the DN string
size_t cn_pos = cn_str.find("CN=");
if (cn_pos != std::string::npos) {
size_t start = cn_pos + 3;
size_t end = cn_str.find(',', start);
std::string cn_value =
cn_str.substr(start, end == std::string::npos ? end : end - start);
// Look for "CN=" in the DN string
size_t cn_pos = cn_str.find("CN=");
if (cn_pos != std::string::npos) {
size_t start = cn_pos + 3;
size_t end = cn_str.find(',', start);
std::string cn_value =
cn_str.substr(start, end == std::string::npos ? end : end - start);
if (detail::match_hostname(cn_value, host_str)) { return true; }
if (detail::match_hostname(cn_value, host_str)) { return true; }
}
}
}
@@ -15774,10 +15721,10 @@ bool verify_hostname(cert_t cert, const char *hostname) {
auto x509 = static_cast<WOLFSSL_X509 *>(cert);
std::string host_str(hostname);
// Check if hostname is an IP address
bool is_ip = impl::is_ipv4_address(host_str);
unsigned char ip_bytes[4];
if (is_ip) { impl::parse_ipv4(host_str, ip_bytes); }
// Check if hostname is an IP address (IPv4 or IPv6)
unsigned char ip_bytes[16];
auto ip_len = impl::parse_ip_address(host_str, ip_bytes);
auto is_ip = ip_len > 0;
// Check Subject Alternative Names
auto *san_names = static_cast<WOLF_STACK_OF(WOLFSSL_GENERAL_NAME) *>(
@@ -15804,10 +15751,12 @@ bool verify_hostname(cert_t cert, const char *hostname) {
}
}
} else if (is_ip && names->type == WOLFSSL_GEN_IPADD) {
// IP address
// IP address: only an iPAddress SAN of the same family (4 bytes for
// IPv4, 16 bytes for IPv6) may authenticate the host.
unsigned char *ip_data = wolfSSL_ASN1_STRING_data(names->d.iPAddress);
int ip_len = wolfSSL_ASN1_STRING_length(names->d.iPAddress);
if (ip_data && ip_len == 4 && memcmp(ip_data, ip_bytes, 4) == 0) {
auto san_ip_len = wolfSSL_ASN1_STRING_length(names->d.iPAddress);
if (ip_data && san_ip_len == static_cast<int>(ip_len) &&
memcmp(ip_data, ip_bytes, ip_len) == 0) {
wolfSSL_sk_free(san_names);
return true;
}
@@ -15816,8 +15765,10 @@ bool verify_hostname(cert_t cert, const char *hostname) {
wolfSSL_sk_free(san_names);
}
// Fallback: Check Common Name (CN) in subject
WOLFSSL_X509_NAME *subject = wolfSSL_X509_get_subject_name(x509);
// Fallback: Check Common Name (CN) in subject. Skipped for IP-literal hosts:
// an IP identity is only valid via an iPAddress SAN, never the CN (RFC 9110;
// the OpenSSL backend's X509_check_ip behaves the same way).
auto subject = is_ip ? nullptr : wolfSSL_X509_get_subject_name(x509);
if (subject) {
char cn[256] = {};
int cn_len = wolfSSL_X509_NAME_get_text_by_NID(subject, NID_commonName, cn,
+63 -18
View File
@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.47.0"
#define CPPHTTPLIB_VERSION_NUM "0x002f00"
#define CPPHTTPLIB_VERSION "0.48.0"
#define CPPHTTPLIB_VERSION_NUM "0x003000"
#ifdef _WIN32
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
@@ -686,18 +686,70 @@ inline from_chars_result<T> from_chars(const char *first, const char *last,
return {p, std::errc{}};
}
// from_chars for double (hand-written, locale-independent)
//
// The only double consumed by this library is the HTTP quality value, whose
// grammar is (RFC 9110 12.4.2):
//   qvalue = ( "0" [ "." 0*3DIGIT ] ) / ( "1" [ "." 0*3("0") ] )
// i.e. a non-negative decimal with no sign, exponent, "inf"/"nan", or wide
// magnitude. So this parser recognizes exactly 1*DIGIT [ "." *DIGIT ] (it also
// accepts a leading "." followed by digits) with '.' always the decimal
// separator. std::strtod would instead read the separator from the global C
// locale, mis-parsing q-values once an embedder calls setlocale(LC_ALL, "")
// into a comma-decimal locale. The caller range-checks the result to [0, 1],
// so inputs outside that range need not be distinguished here.
//
// Parameters:
//   first, last - half-open character range to parse.
//   value       - receives the parsed number; set to 0.0 when nothing parses.
// Returns:
//   {first, std::errc::invalid_argument} if no digit was seen; otherwise
//   {one past the last consumed character, std::errc{}}.
inline from_chars_result<double> from_chars(const char *first, const char *last,
                                            double &value) {
  value = 0.0;
  const char *p = first;

  // Each listed 1eN is exactly representable as a double, so the final
  // division is the only rounding step whenever the accumulated integer is
  // itself exactly representable (true for anything near q-value precision).
  static const double powers_of_ten[] = {
      1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,  1e8,  1e9,
      1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18};
  const int max_frac_digits =
      static_cast<int>(sizeof(powers_of_ten) / sizeof(powers_of_ten[0])) - 1;

  // Accumulate digits into a 64-bit integer and remember how many were
  // fractional. Two independent caps keep this bounded and safe:
  //  * accumulation saturates before mantissa could overflow uint64_t, and
  //  * frac_digits is capped at max_frac_digits so it is always a valid index
  //    into powers_of_ten (without this an input like "0.000...0" would never
  //    grow mantissa, so the saturation cap alone would not bound it).
  // Digits dropped by either cap are still consumed from the input.
  uint64_t mantissa = 0;
  int frac_digits = 0;
  bool seen_digit = false;

  const uint64_t limit = ((std::numeric_limits<uint64_t>::max)() - 9) / 10;
  auto accumulate = [&](char c) {
    if (mantissa <= limit) {
      mantissa = mantissa * 10 + static_cast<uint64_t>(c - '0');
      return true;
    }
    return false;
  };

  // Integer part.
  for (; p != last && '0' <= *p && *p <= '9'; ++p) {
    seen_digit = true;
    accumulate(*p);
  }

  // Optional fractional part.
  if (p != last && *p == '.') {
    ++p;
    for (; p != last && '0' <= *p && *p <= '9'; ++p) {
      seen_digit = true;
      if (frac_digits < max_frac_digits && accumulate(*p)) { ++frac_digits; }
    }
  }

  if (!seen_digit) { return {first, std::errc::invalid_argument}; }

  value = static_cast<double>(mantissa) / powers_of_ten[frac_digits];
  return {p, std::errc{}};
}
inline bool parse_port(const char *s, size_t len, int &port) {
@@ -2826,13 +2878,6 @@ private:
#endif
friend class ClientImpl;
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
private:
bool verify_host(X509 *server_cert) const;
bool verify_host_with_subject_alt_name(X509 *server_cert) const;
bool verify_host_with_common_name(X509 *server_cert) const;
#endif
};
#endif // CPPHTTPLIB_SSL_ENABLED