Compare commits

...

12 Commits

Author SHA1 Message Date
Jeff Bolz c342c3b93d vulkan: fix non-contig rope (#19299) 2026-02-05 08:38:59 +01:00
will-lms af252d0758 metal : add missing includes (#19348) 2026-02-05 08:05:09 +02:00
Sigbjørn Skjæret 11fb327bf3 vendor : add missing llama_add_compile_flags (#19322)
* add missing llama_add_compile_flags

* disable all warnings for ssl, crypto and fipsmodule
2026-02-05 02:27:38 +01:00
Aaron Teo e6e934c5ea vendor: update cpp-httplib version (#19313)
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2026-02-05 05:15:03 +08:00
Daniel Bevenius b536eb0233 codeowners : add danbev for examples/debug (#19332)
* codeowners : add danbev for examples/debug

* Add @pwilkin to CODEOWNERS for debug

---------

Co-authored-by: Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com>
2026-02-04 20:20:40 +01:00
Xuan-Son Nguyen e0c93af2a0 debug: make common_debug_print_tensor readable (#19331)
* debug: make common_debug_print_tensor readable

* editorconfig
2026-02-04 17:55:31 +01:00
Georgi Gerganov 423bee462b ci : fix sanitize workflow to enable ggml sanitizers too (#19323) 2026-02-04 15:12:03 +02:00
Xuan-Son Nguyen 8abcc70a74 model: (qwen3next) correct vectorized key_gdiff calculation (#19324)
* model: (qwen3next) correct vectorized key_gdiff calculation

* move transpose to outside of loop
2026-02-04 13:09:58 +01:00
Georgi Gerganov eaba92c3dc tests : add non-cont, inplace rope tests (#19296)
* tests : add non-cont, inplace rope tests

* cont : exercise dim 3

Co-authored-by: Jeff Bolz <jbolz@nvidia.com>

* cont : more dim3 exercises

---------

Co-authored-by: Jeff Bolz <jbolz@nvidia.com>
2026-02-04 12:45:21 +02:00
Daniel Bevenius 6ab881b7c3 model-conversion : add tensor-info.py utility (#18954)
This commit adds a new python script that can be used to print tensors
information from a tensor in a safetensors model.

The motivation for this is that during model conversion work it can
sometimes be useful to verify the shape of tensors in the original
model. While it is possible to print the tensors when loading the model
this can be slow when working with larger models.
With this script it is possible to quickly query tensor shapes.

Example usage:
```console
(venv) $ ./scripts/utils/tensor-info.py --help
usage: tensor-info.py [-h] [-m MODEL_PATH] [-l] [tensor_name]

Print tensor information from a safetensors model

positional arguments:
  tensor_name           Name of the tensor to inspect

options:
  -h, --help            show this help message and exit
  -m MODEL_PATH, --model-path MODEL_PATH
                        Path to the model directory (default: MODEL_PATH environment variable)
  -l, --list            List unique tensor patterns in the model (layer numbers replaced with #)
```

Listing tensor names:
```console
(venv) $ ./scripts/utils/tensor-info.py -m ~/work/ai/models/google/embeddinggemma-300m -l
embed_tokens.weight
layers.#.input_layernorm.weight
layers.#.mlp.down_proj.weight
layers.#.mlp.gate_proj.weight
layers.#.mlp.up_proj.weight
layers.#.post_attention_layernorm.weight
layers.#.post_feedforward_layernorm.weight
layers.#.pre_feedforward_layernorm.weight
layers.#.self_attn.k_norm.weight
layers.#.self_attn.k_proj.weight
layers.#.self_attn.o_proj.weight
layers.#.self_attn.q_norm.weight
layers.#.self_attn.q_proj.weight
layers.#.self_attn.v_proj.weight
norm.weight
```

Printing a specific tensor's information:
```console
(venv) $ ./scripts/utils/tensor-info.py -m ~/work/ai/models/google/embeddinggemma-300m layers.0.input_layernorm.weight
Tensor: layers.0.input_layernorm.weight
File:   model.safetensors
Shape:  [768]
```
2026-02-04 10:40:53 +01:00
Georgi Gerganov d838c22bb3 spec : fix the check-rate logic of ngram-simple (#19261)
* spec : fix the check-rate logic of ngram-simple

* cont : refactor + fix checks
2026-02-04 10:39:53 +02:00
Daniel Bevenius 25f40ca65f completion : simplify batch (embd) processing (#19286)
* completion : simplify batch (embd) processing

This commit simplifies the processing of embd by removing the for loop
that currently exists which uses params.n_batch as its increment. This
commit also removes the clamping of n_eval as the size of embd is always
at most the size of params.n_batch.

The motivation is to clarify the code as it is currently a little
confusing when looking at this for loop in isolation and thinking that
it can process multiple batches.

* add an assert to verify n_eval is not greater than n_batch
2026-02-04 05:43:28 +01:00
23 changed files with 626 additions and 216 deletions
+2
View File
@@ -293,6 +293,7 @@ jobs:
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
@@ -303,6 +304,7 @@ jobs:
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+1
View File
@@ -27,6 +27,7 @@
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
/examples/convert-llama2c-to-ggml/ @ggerganov
/examples/debug/ @danbev @pwilkin
/examples/deprecation-warning/ @ggerganov
/examples/diffusion/ @am17an
/examples/embedding/ @ggerganov
+18 -16
View File
@@ -45,6 +45,8 @@ static float common_ggml_get_float_value(const uint8_t * data,
return v;
}
#define INDENT " "
template <bool abort>
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
@@ -60,41 +62,41 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
}
}
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
LOG_ERR(" [\n");
LOG(INDENT "[\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2 * n) {
LOG_ERR(" ..., \n");
LOG(INDENT INDENT "..., \n");
i2 = ne[2] - n;
}
LOG_ERR(" [\n");
LOG(INDENT INDENT "[\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2 * n) {
LOG_ERR(" ..., \n");
LOG(INDENT INDENT INDENT "..., \n");
i1 = ne[1] - n;
}
LOG_ERR(" [");
LOG(INDENT INDENT INDENT "[");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2 * n) {
LOG_ERR("..., ");
LOG(" ..., ");
i0 = ne[0] - n;
}
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
LOG_ERR("%12.4f", v);
LOG("%12.4f", v);
if (i0 < ne[0] - 1) {
LOG_ERR(", ");
LOG(", ");
}
}
LOG_ERR("],\n");
LOG(" ],\n");
}
LOG_ERR(" ],\n");
LOG(INDENT INDENT "],\n");
}
LOG_ERR(" ]\n");
LOG_ERR(" sum = %f\n", sum);
LOG(INDENT "]\n");
LOG(INDENT "sum = %f\n", sum);
}
if constexpr (abort) {
if (std::isnan(sum)) {
LOG_ERR("encountered NaN - aborting\n");
LOG("encountered NaN - aborting\n");
exit(0);
}
}
@@ -137,9 +139,9 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
}
if (matches_filter) {
LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
common_ggml_ne_string(t).c_str());
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "",
common_ggml_ne_string(t).c_str());
}
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+3 -12
View File
@@ -47,21 +47,15 @@ static std::string common_tokens_to_str(const llama_tokens & inp, size_t start,
* @return Vector of draft tokens, empty if no matching pattern is found
*/
llama_tokens common_ngram_simple_draft(
common_ngram_simple_state & state,
const common_ngram_simple_config & config,
const llama_tokens & tokens, llama_token sampled) {
// Simple implementation of self-speculative decoding without a draft model.
//
const size_t cur_len = tokens.size();
// Only check every check_rate tokens to save compute
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
if (state.idx_last_check + state.config.check_rate > cur_len) {
llama_tokens draft_tokens;
return draft_tokens;
}
size_t n_draft_min = state.config.size_ngram; // size of n-gram to lookup in token history
size_t n_draft_max = state.config.size_mgram; // the m-gram following the found n-gram is used for draft
const size_t n_draft_min = config.size_ngram; // size of n-gram to lookup in token history
const size_t n_draft_max = config.size_mgram; // the m-gram following the found n-gram is used for draft
// vector for tokens we want to verify.
// return empty vector if there is no match.
@@ -80,9 +74,6 @@ llama_tokens common_ngram_simple_draft(
}
pattern.push_back(sampled); // add the last token to the pattern
// We do a search in the token history.
state.idx_last_check = cur_len;
size_t match_pos = 0; // we ignore position 0, position 0 == no match
// search backwards, but skip the current match (we are currently there)
for (size_t j = cur_len - n_draft_min - 1; j > 0; --j) {
+1 -15
View File
@@ -27,23 +27,9 @@ struct common_ngram_simple_config {
uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
};
// current state (and config) of n-gram simple.
struct common_ngram_simple_state {
common_ngram_simple_config config;
size_t idx_last_check = 0; // index of last check in context history (mutable)
common_ngram_simple_state(const common_ngram_simple_config & config)
: config(config) {}
};
// Searches for a n-gram in the history and checks whether a draft sequence should be generated.
// state: the ngram simple state to search in.
// inp: the tokens generated so far.
// sampled: the token that was just sampled.
// draft: vector to store the draft tokens, initially empty.
llama_tokens common_ngram_simple_draft(
common_ngram_simple_state & state,
const common_ngram_simple_config & config,
const llama_tokens & tokens, llama_token sampled);
+14 -6
View File
@@ -463,12 +463,14 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
// state of self-speculation (simple implementation, not ngram-map)
struct common_speculative_state_ngram_simple : public common_speculative_state {
common_ngram_simple_state state;
common_ngram_simple_config config;
uint16_t check_id = 0; // used to control the frequency of generating drafts
common_speculative_state_ngram_simple(
enum common_speculative_type type,
common_ngram_simple_state state)
: common_speculative_state(type), state(state) {}
common_ngram_simple_config config)
: common_speculative_state(type), config(config) {}
void begin(const llama_tokens & prompt) override {
GGML_UNUSED(prompt);
@@ -479,7 +481,13 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
result = common_ngram_simple_draft(state, prompt_tgt, id_last);
++check_id;
if (check_id < config.check_rate) {
return;
}
check_id = 0;
result = common_ngram_simple_draft(config, prompt_tgt, id_last);
GGML_UNUSED(params);
}
@@ -889,14 +897,14 @@ common_speculative * common_speculative_init(
uint16_t mgram_size_value = ngram_map.size_value;
uint16_t check_rate = ngram_map.check_rate;
auto config_simple = common_ngram_simple_config{
auto config_simple = common_ngram_simple_config {
/* .size_ngram = */ ngram_size_key,
/* .size_mgram = */ mgram_size_value,
/* .check_rate = */ check_rate
};
auto state = std::make_unique<common_speculative_state_ngram_simple>(
/* .type = */ config.type,
/* .state = */ common_ngram_simple_state(config_simple)
/* .state = */ config_simple
);
impls.push_back(std::move(state));
break;
+159
View File
@@ -0,0 +1,159 @@
#!/usr/bin/env python3
import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Optional
from safetensors import safe_open
MODEL_SAFETENSORS_FILE = "model.safetensors"
MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json"
def get_weight_map(model_path: Path) -> Optional[dict[str, str]]:
index_file = model_path / MODEL_SAFETENSORS_INDEX
if index_file.exists():
with open(index_file, 'r') as f:
index = json.load(f)
return index.get("weight_map", {})
return None
def get_all_tensor_names(model_path: Path) -> list[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return list(weight_map.keys())
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
with safe_open(single_file, framework="pt", device="cpu") as f:
return list(f.keys())
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return weight_map.get(tensor_name)
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
return single_file.name
return None
def normalize_tensor_name(tensor_name: str) -> str:
normalized = re.sub(r'\.\d+\.', '.#.', tensor_name)
normalized = re.sub(r'\.\d+$', '.#', normalized)
return normalized
def list_all_tensors(model_path: Path, unique: bool = False):
tensor_names = get_all_tensor_names(model_path)
if unique:
seen = set()
for tensor_name in sorted(tensor_names):
normalized = normalize_tensor_name(tensor_name)
if normalized not in seen:
seen.add(normalized)
print(normalized)
else:
for tensor_name in sorted(tensor_names):
print(tensor_name)
def print_tensor_info(model_path: Path, tensor_name: str):
tensor_file = find_tensor_file(model_path, tensor_name)
if tensor_file is None:
print(f"Error: Could not find tensor '{tensor_name}' in model index")
print(f"Model path: {model_path}")
sys.exit(1)
file_path = model_path / tensor_file
try:
with safe_open(file_path, framework="pt", device="cpu") as f:
if tensor_name in f.keys():
tensor_slice = f.get_slice(tensor_name)
shape = tensor_slice.get_shape()
print(f"Tensor: {tensor_name}")
print(f"File: {tensor_file}")
print(f"Shape: {shape}")
else:
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
sys.exit(1)
except FileNotFoundError:
print(f"Error: The file '{file_path}' was not found.")
sys.exit(1)
except Exception as e:
print(f"An error occurred: {e}")
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Print tensor information from a safetensors model"
)
parser.add_argument(
"tensor_name",
nargs="?", # optional (if --list is used for example)
help="Name of the tensor to inspect"
)
parser.add_argument(
"-m", "--model-path",
type=Path,
help="Path to the model directory (default: MODEL_PATH environment variable)"
)
parser.add_argument(
"-l", "--list",
action="store_true",
help="List unique tensor patterns in the model (layer numbers replaced with #)"
)
args = parser.parse_args()
model_path = args.model_path
if model_path is None:
model_path_str = os.environ.get("MODEL_PATH")
if model_path_str is None:
print("Error: --model-path not provided and MODEL_PATH environment variable not set")
sys.exit(1)
model_path = Path(model_path_str)
if not model_path.exists():
print(f"Error: Model path does not exist: {model_path}")
sys.exit(1)
if not model_path.is_dir():
print(f"Error: Model path is not a directory: {model_path}")
sys.exit(1)
if args.list:
list_all_tensors(model_path, unique=True)
else:
if args.tensor_name is None:
print("Error: tensor_name is required when not using --list")
sys.exit(1)
print_tensor_info(model_path, args.tensor_name)
if __name__ == "__main__":
main()
+3
View File
@@ -7,6 +7,9 @@
#include "ggml-metal-context.h"
#include "ggml-metal-ops.h"
#include <mutex>
#include <string>
#define GGML_METAL_NAME "MTL"
#define GGML_METAL_MAX_DEVICES 16
+24 -8
View File
@@ -1263,25 +1263,30 @@ struct vk_op_diag_mask_push_constants {
struct vk_op_rope_push_constants {
uint32_t rope_mode;
uint32_t ncols;
uint32_t nrows;
uint32_t n_dims;
float freq_scale;
uint32_t p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[2];
float theta_scale;
uint32_t has_ff;
uint32_t ne02;
uint32_t s1;
uint32_t s2;
int32_t sections[4];
uint32_t is_imrope;
uint32_t is_back;
uint32_t set_rows_stride;
uint32_t ne00;
uint32_t ne01;
uint32_t ne02;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
};
static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128");
// For fused rms_norm+mul+rope(+view+set_rows)
struct vk_op_rms_norm_mul_rope_push_constants {
@@ -10405,12 +10410,22 @@ static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor *
uint32_t nb01 = src0->nb[1] / ggml_type_size(src0->type);
uint32_t nb02 = src0->nb[2] / ggml_type_size(src0->type);
uint32_t nb03 = src0->nb[3] / ggml_type_size(src0->type);
uint32_t nb11 = dst->nb[1] / ggml_type_size(dst->type);
uint32_t nb12 = dst->nb[2] / ggml_type_size(dst->type);
uint32_t nb13 = dst->nb[3] / ggml_type_size(dst->type);
vk_op_rope_push_constants rope {
(uint32_t)mode, (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
has_ff, (uint32_t)src0->ne[2], nb01, nb02,
(uint32_t)mode, (uint32_t)ggml_nrows(src0), (uint32_t)n_dims, freq_scale,
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, has_ff,
{ sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride,
(uint32_t)src0->ne[0],
(uint32_t)src0->ne[1],
(uint32_t)src0->ne[2],
nb01, nb02, nb03,
nb11, nb12, nb13,
};
return rope;
@@ -14798,6 +14813,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_REPEAT_BACK:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_ROPE:
return ggml_is_contiguous_rows(op) && ggml_is_contiguous_rows(op->src[0]);
case GGML_OP_ROPE_BACK:
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
@@ -112,12 +112,11 @@ void rms_norm(uint num_iters) {
#if RMS_NORM_ROPE_FUSION
barrier();
rope_params rp = p.rope;
uint rope_row = (samp*nchannels + channel)*nrows + row;
for (uint t = 2*tid; t < ncols; t += 2*BLOCK_SIZE) {
if (rp.rope_mode == GGML_ROPE_TYPE_NEOX) {
rope_neox(t, rope_row, rp);
rope_neox(t, row, channel, samp, rp);
} else if (rp.rope_mode == GGML_ROPE_TYPE_NORMAL) {
rope_norm(t, rope_row, rp);
rope_norm(t, row, channel, samp, rp);
}
}
#endif
@@ -4,12 +4,12 @@ float rope_yarn_ramp(const float low, const float high, const uint i0) {
return 1.0f - min(1.0f, max(0.0f, y));
}
uint rope_a_coord(const uint i0, const uint i01, const uint i02, rope_params p) {
uint rope_a_coord(const uint i0, const uint i01, const uint i02, const uint i03, rope_params p) {
#if RMS_NORM_ROPE_FUSION
// Per-row offset in shared memory
const uint ix = i0;
#else
const uint ix = i02*p.nb02 + i01*p.nb01 + i0;
const uint ix = i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0;
#endif
return ix;
}
@@ -34,26 +34,19 @@ void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out
sin_theta = sin(theta) * mscale;
}
void rope_norm(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
if (i0 >= ne0) {
void rope_norm(const uint i0, const uint i1, const uint i2, const uint i3, rope_params p) {
if (i0 >= p.ne00) {
return;
}
// i1 is actually i2*nb2+i1, but the rows are contiguous
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
uint idst = i1*ne0 + i0;
const uint ix = rope_a_coord(i0, i01, i02, p);
uint idst = i0 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
const uint ix = rope_a_coord(i0, i1, i2, i3, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0;
idst += rope_data_i[i02].x * p.set_rows_stride;
idst = i1*p.nb11 + i0;
idst += rope_data_i[i2].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
@@ -63,7 +56,7 @@ void rope_norm(const uint i0, const uint i1, rope_params p) {
return;
}
const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f);
const float theta_base = rope_data_pos[i2] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
@@ -77,25 +70,19 @@ void rope_norm(const uint i0, const uint i1, rope_params p) {
rope_data_d[idst + 1] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
void rope_neox(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
if (i0 >= ne0) {
void rope_neox(const uint i0, const uint i1, const uint i2, const uint i3, rope_params p) {
if (i0 >= p.ne00) {
return;
}
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
const uint ix = rope_a_coord(i0/2, i1, i2, i3, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2;
idst += rope_data_i[i02].x * p.set_rows_stride;
idst = i1*p.nb11 + i0/2;
idst += rope_data_i[i2].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
@@ -105,7 +92,7 @@ void rope_neox(const uint i0, const uint i1, rope_params p) {
return;
}
const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f);
const float theta_base = rope_data_pos[i2] * pow(p.theta_scale, i0/2.0f);
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
@@ -120,26 +107,19 @@ void rope_neox(const uint i0, const uint i1, rope_params p) {
}
void rope_multi(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
uint ne2 = p.ne02;
if (i0 >= ne0) {
void rope_multi(const uint i0, const uint i1, const uint i2, const uint i3, rope_params p) {
if (i0 >= p.ne00) {
return;
}
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
const uint ix = rope_a_coord(i0/2, i1, i2, i3, p);
// Fusion optimization: ROPE + VIEW + SET_ROWS.
// The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i.
if (p.set_rows_stride != 0) {
idst = i01*ne0 + i0/2;
idst += rope_data_i[i02].x * p.set_rows_stride;
idst = i1*p.nb11 + i0/2;
idst += rope_data_i[i2].x * p.set_rows_stride;
}
if (i0 >= p.n_dims) {
@@ -156,26 +136,26 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
float theta_base = 0.0;
if (p.is_imrope != 0) {
if (sector % 3 == 1 && sector < 3 * p.sections[1]) {
theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
theta_base = rope_data_pos[i2 + p.ne02 * 1]*pow(p.theta_scale, i0/2.0f);
} else if (sector % 3 == 2 && sector < 3 * p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
theta_base = rope_data_pos[i2 + p.ne02 * 2]*pow(p.theta_scale, i0/2.0f);
} else if (sector % 3 == 0 && sector < 3 * p.sections[0]) {
theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f);
theta_base = rope_data_pos[i2]*pow(p.theta_scale, i0/2.0f);
} else {
theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
theta_base = rope_data_pos[i2 + p.ne02 * 3]*pow(p.theta_scale, i0/2.0f);
}
} else {
if (sector < p.sections[0]) {
theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f);
theta_base = rope_data_pos[i2]*pow(p.theta_scale, i0/2.0f);
}
else if (sector >= p.sections[0] && sector < sec_w) {
theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
theta_base = rope_data_pos[i2 + p.ne02 * 1]*pow(p.theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
theta_base = rope_data_pos[i2 + p.ne02 * 2]*pow(p.theta_scale, i0/2.0f);
}
else if (sector >= sec_w + p.sections[2]) {
theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
theta_base = rope_data_pos[i2 + p.ne02 * 3]*pow(p.theta_scale, i0/2.0f);
}
}
@@ -191,20 +171,13 @@ void rope_multi(const uint i0, const uint i1, rope_params p) {
rope_data_d[idst + p.n_dims/2] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta);
}
void rope_vision(const uint i0, const uint i1, rope_params p) {
uint ne0 = p.ncols;
uint ne1 = p.p_delta_rows;
uint ne2 = p.ne02;
if (i0 >= ne0) {
void rope_vision(const uint i0, const uint i1, const uint i2, const uint i3, rope_params p) {
if (i0 >= p.ne00) {
return;
}
const uint i01 = i1 % ne1;
const uint i02 = i1 / ne1;
const uint idst = i1*ne0 + i0/2;
const uint ix = rope_a_coord(i0/2, i01, i02, p);
const uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13;
const uint ix = rope_a_coord(i0/2, i1, i2, i3, p);
const int sect_dims = p.sections[0] + p.sections[1];
const int sec_w = p.sections[1] + p.sections[0];
@@ -213,11 +186,11 @@ void rope_vision(const uint i0, const uint i1, rope_params p) {
float theta_base = 0.0;
if (sector < p.sections[0]) {
const uint p0 = sector;
theta_base = rope_data_pos[i02]*pow(p.theta_scale, p0);
theta_base = rope_data_pos[i2]*pow(p.theta_scale, p0);
}
else if (sector >= p.sections[0] && sector < sec_w) {
const uint p0 = sector - p.sections[0];
theta_base = rope_data_pos[i02 + ne2]*pow(p.theta_scale, p0);
theta_base = rope_data_pos[i2 + p.ne02]*pow(p.theta_scale, p0);
}
const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f;
@@ -5,10 +5,13 @@
void main() {
const uint i0 = 2*gl_GlobalInvocationID.y;
// i1 is actually i2*nb2+i1, but the rows are contiguous
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (i1 >= pc.nrows) {
const uint row = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (row >= pc.nrows) {
return;
}
rope_multi(i0, i1, pc);
const uint i3 = row / (pc.ne01*pc.ne02);
const uint i2 = (row - i3 * pc.ne01*pc.ne02) / pc.ne01;
const uint i1 = (row - i3 * pc.ne01*pc.ne02 - i2 * pc.ne01);
rope_multi(i0, i1, i2, i3, pc);
}
@@ -5,10 +5,13 @@
void main() {
const uint i0 = 2*gl_GlobalInvocationID.y;
// i1 is actually i2*nb2+i1, but the rows are contiguous
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (i1 >= pc.nrows) {
const uint row = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (row >= pc.nrows) {
return;
}
rope_neox(i0, i1, pc);
const uint i3 = row / (pc.ne01*pc.ne02);
const uint i2 = (row - i3 * pc.ne01*pc.ne02) / pc.ne01;
const uint i1 = (row - i3 * pc.ne01*pc.ne02 - i2 * pc.ne01);
rope_neox(i0, i1, i2, i3, pc);
}
@@ -5,10 +5,13 @@
void main() {
const uint i0 = 2*gl_GlobalInvocationID.y;
// i1 is actually i2*nb2+i1, but the rows are contiguous
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (i1 >= pc.nrows) {
const uint row = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (row >= pc.nrows) {
return;
}
rope_norm(i0, i1, pc);
const uint i3 = row / (pc.ne01*pc.ne02);
const uint i2 = (row - i3 * pc.ne01*pc.ne02) / pc.ne01;
const uint i1 = (row - i3 * pc.ne01*pc.ne02 - i2 * pc.ne01);
rope_norm(i0, i1, i2, i3, pc);
}
@@ -5,24 +5,29 @@
struct rope_params {
uint rope_mode;
uint ncols;
uint nrows;
uint n_dims;
float freq_scale;
uint p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[2];
float theta_scale;
uint has_ff;
uint ne02;
uint nb01;
uint nb02;
int sections[4];
uint is_imrope;
uint is_back;
uint set_rows_stride;
uint ne00;
uint ne01;
uint ne02;
uint nb01;
uint nb02;
uint nb03;
uint nb11;
uint nb12;
uint nb13;
};
#endif // !defined(GGML_ROPE_PARAMS)
@@ -5,10 +5,13 @@
void main() {
const uint i0 = 2*gl_GlobalInvocationID.y;
// i1 is actually i2*nb2+i1, but the rows are contiguous
const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (i1 >= pc.nrows) {
const uint row = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z;
if (row >= pc.nrows) {
return;
}
rope_vision(i0, i1, pc);
const uint i3 = row / (pc.ne01*pc.ne02);
const uint i2 = (row - i3 * pc.ne01*pc.ne02) / pc.ne01;
const uint i1 = (row - i3 * pc.ne01*pc.ne02 - i2 * pc.ne01);
rope_vision(i0, i1, i2, i3, pc);
}
+2 -2
View File
@@ -12,8 +12,8 @@ vendor = {
# "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.23/miniaudio.h": "vendor/miniaudio/miniaudio.h",
"https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h",
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.1/httplib.h": "vendor/cpp-httplib/httplib.h",
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.1/LICENSE": "vendor/cpp-httplib/LICENSE",
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.2/httplib.h": "vendor/cpp-httplib/httplib.h",
"https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.2/LICENSE": "vendor/cpp-httplib/LICENSE",
"https://raw.githubusercontent.com/sheredom/subprocess.h/b49c56e9fe214488493021017bf3954b91c7c1f5/subprocess.h": "vendor/sheredom/subprocess.h",
}
+9 -3
View File
@@ -265,9 +265,15 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp);
ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
1, chunk_size, n_chunks, g_diff_exp->ne[3]);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
// state to be updated per chunk
ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
@@ -322,9 +328,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
: ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk));
ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
//ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff)));
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
+2
View File
@@ -8032,6 +8032,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (int mode : {GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_IMROPE, GGML_ROPE_TYPE_VISION}) {
for (bool ff : {false, true}) {
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 0, true, true));
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 1, true, true));
test_cases.emplace_back(new test_rope(type, {128, 32, 2, 3}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 1, true, true));
}
}
}
+5 -8
View File
@@ -674,15 +674,12 @@ int main(int argc, char ** argv) {
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (!embd.empty()) {
int n_eval = (int) embd.size();
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
GGML_ASSERT(n_eval <= params.n_batch);
if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
@@ -743,7 +740,7 @@ int main(int argc, char ** argv) {
common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
if ((int) embd.size() == params.n_batch) {
break;
}
}
+24 -2
View File
@@ -3,9 +3,14 @@ license_add_file("cpp-httplib" "LICENSE")
find_package(Threads REQUIRED)
llama_add_compile_flags()
add_library(${TARGET} STATIC httplib.cpp httplib.h)
if (NOT MSVC)
# disable warnings in 3rd party code
# disable warnings in 3rd party code
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
target_compile_options(${TARGET} PRIVATE /w)
else()
target_compile_options(${TARGET} PRIVATE -w)
endif()
@@ -146,6 +151,23 @@ elseif (LLAMA_OPENSSL)
endif()
endif()
# disable warnings in 3rd party code
if(LLAMA_BUILD_BORINGSSL OR LLAMA_BUILD_LIBRESSL)
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
target_compile_options(ssl PRIVATE /w)
target_compile_options(crypto PRIVATE /w)
if(LLAMA_BUILD_BORINGSSL)
target_compile_options(fipsmodule PRIVATE /w)
endif()
else()
target_compile_options(ssl PRIVATE -w)
target_compile_options(crypto PRIVATE -w)
if(LLAMA_BUILD_BORINGSSL)
target_compile_options(fipsmodule PRIVATE -w)
endif()
endif()
endif()
if (CPPHTTPLIB_OPENSSL_SUPPORT)
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) # used in server.cpp
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+190 -48
View File
@@ -117,6 +117,8 @@ time_t parse_http_date(const std::string &date_str) {
#ifdef _WIN32
return _mkgmtime(&tm_buf);
#elif defined _AIX
return mktime(&tm_buf);
#else
return timegm(&tm_buf);
#endif
@@ -1376,7 +1378,7 @@ int getaddrinfo_with_timeout(const char *node, const char *service,
// Allocate on the heap, so the resolver thread can keep using the data.
auto state = std::make_shared<GetAddrInfoState>();
state->node = node;
if (node) { state->node = node; }
state->service = service;
state->hints = *hints;
@@ -2896,10 +2898,20 @@ bool parse_range_header(const std::string &s, Ranges &ranges) try {
return;
}
const auto first =
static_cast<ssize_t>(lhs.empty() ? -1 : std::stoll(lhs));
const auto last =
static_cast<ssize_t>(rhs.empty() ? -1 : std::stoll(rhs));
ssize_t first = -1;
if (!lhs.empty()) {
ssize_t v;
auto res = detail::from_chars(lhs.data(), lhs.data() + lhs.size(), v);
if (res.ec == std::errc{}) { first = v; }
}
ssize_t last = -1;
if (!rhs.empty()) {
ssize_t v;
auto res = detail::from_chars(rhs.data(), rhs.data() + rhs.size(), v);
if (res.ec == std::errc{}) { last = v; }
}
if ((first == -1 && last == -1) ||
(first != -1 && last != -1 && first > last)) {
all_valid_ranges = false;
@@ -2974,25 +2986,17 @@ bool parse_accept_header(const std::string &s,
return;
}
#ifdef CPPHTTPLIB_NO_EXCEPTIONS
{
std::istringstream iss(quality_str);
iss >> accept_entry.quality;
// Check if conversion was successful and entire string was consumed
if (iss.fail() || !iss.eof()) {
double v = 0.0;
auto res = detail::from_chars(
quality_str.data(), quality_str.data() + quality_str.size(), v);
if (res.ec == std::errc{}) {
accept_entry.quality = v;
} else {
has_invalid_entry = true;
return;
}
}
#else
try {
accept_entry.quality = std::stod(quality_str);
} catch (...) {
has_invalid_entry = true;
return;
}
#endif
// Check if quality is in valid range [0.0, 1.0]
if (accept_entry.quality < 0.0 || accept_entry.quality > 1.0) {
has_invalid_entry = true;
@@ -5570,13 +5574,26 @@ bool Server::read_content(Stream &strm, Request &req, Response &res) {
strm, req, res,
// Regular
[&](const char *buf, size_t n) {
// Prevent arithmetic overflow when checking sizes.
// Avoid computing (req.body.size() + n) directly because
// adding two unsigned `size_t` values can wrap around and
// produce a small result instead of indicating overflow.
// Instead, check using subtraction: ensure `n` does not
// exceed the remaining capacity `max_size() - size()`.
if (req.body.size() >= req.body.max_size() ||
n > req.body.max_size() - req.body.size()) {
return false;
}
// Limit decompressed body size to payload_max_length_ to protect
// against "zip bomb" attacks where a small compressed payload
// decompresses to a massive size.
if (req.body.size() + n > payload_max_length_ ||
req.body.size() + n > req.body.max_size()) {
if (payload_max_length_ > 0 &&
(req.body.size() >= payload_max_length_ ||
n > payload_max_length_ - req.body.size())) {
return false;
}
req.body.append(buf, n);
return true;
},
@@ -5666,22 +5683,29 @@ bool Server::read_content_core(
// oversized request and fail early (causing connection close). For SSL
// builds we cannot reliably peek the decrypted application bytes, so keep
// the original behaviour.
#if !defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(_WIN32)
#if !defined(CPPHTTPLIB_OPENSSL_SUPPORT)
if (!req.has_header("Content-Length") &&
!detail::is_chunked_transfer_encoding(req.headers)) {
socket_t s = strm.socket();
if (s != INVALID_SOCKET) {
// Peek up to payload_max_length_ + 1 bytes. If more than
// payload_max_length_ bytes are pending, reject the request.
size_t to_peek =
(payload_max_length_ > 0)
? (std::min)(payload_max_length_ + 1, static_cast<size_t>(4096))
: 1;
std::vector<char> peekbuf(to_peek);
ssize_t n = ::recv(s, peekbuf.data(), to_peek, MSG_PEEK);
if (n > 0 && static_cast<size_t>(n) > payload_max_length_) {
// Indicate failure so connection will be closed.
return false;
// Only peek if payload_max_length is set to a finite value
if (payload_max_length_ > 0 &&
payload_max_length_ < (std::numeric_limits<size_t>::max)()) {
socket_t s = strm.socket();
if (s != INVALID_SOCKET) {
// Peek to check if there is any pending data
char peekbuf[1];
ssize_t n = ::recv(s, peekbuf, 1, MSG_PEEK);
if (n > 0) {
// There is data, so read it with payload limit enforcement
auto result = detail::read_content_without_length(
strm, payload_max_length_, out);
if (result == detail::ReadContentResult::PayloadTooLarge) {
res.status = StatusCode::PayloadTooLarge_413;
return false;
} else if (result != detail::ReadContentResult::Success) {
return false;
}
return true;
}
}
}
return true;
@@ -6656,7 +6680,8 @@ void ClientImpl::close_socket(Socket &socket) {
}
bool ClientImpl::read_response_line(Stream &strm, const Request &req,
Response &res) const {
Response &res,
bool skip_100_continue) const {
std::array<char, 2048> buf{};
detail::stream_line_reader line_reader(strm, buf.data(), buf.size());
@@ -6677,8 +6702,8 @@ bool ClientImpl::read_response_line(Stream &strm, const Request &req,
res.status = std::stoi(std::string(m[2]));
res.reason = std::string(m[3]);
// Ignore '100 Continue'
while (res.status == StatusCode::Continue_100) {
// Ignore '100 Continue' (only when not using Expect: 100-continue explicitly)
while (skip_100_continue && res.status == StatusCode::Continue_100) {
if (!line_reader.getline()) { return false; } // CRLF
if (!line_reader.getline()) { return false; } // next response line
@@ -7463,7 +7488,8 @@ bool ClientImpl::write_content_with_provider(Stream &strm,
}
bool ClientImpl::write_request(Stream &strm, Request &req,
bool close_connection, Error &error) {
bool close_connection, Error &error,
bool skip_body) {
// Prepare additional headers
if (close_connection) {
if (!req.has_header("Connection")) {
@@ -7582,7 +7608,59 @@ bool ClientImpl::write_request(Stream &strm, Request &req,
}
}
// After sending request line and headers, wait briefly for an early server
// response (e.g. 4xx) and avoid sending a potentially large request body
// unnecessarily. This workaround is only enabled on Windows because Unix
// platforms surface write errors (EPIPE) earlier; on Windows kernel send
// buffering can accept large writes even when the peer already responded.
// Check the stream first (which covers SSL via `is_readable()`), then
// fall back to select on the socket. Only perform the wait for very large
// request bodies to avoid interfering with normal small requests and
// reduce side-effects. Poll briefly (up to 50ms as default) for an early
// response. Skip this check when using Expect: 100-continue, as the protocol
// handles early responses properly.
#if defined(_WIN32)
if (!skip_body &&
req.body.size() > CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_THRESHOLD &&
req.path.size() > CPPHTTPLIB_REQUEST_URI_MAX_LENGTH) {
auto start = std::chrono::high_resolution_clock::now();
for (;;) {
// Prefer socket-level readiness to avoid SSL_pending() false-positives
// from SSL internals. If the underlying socket is readable, assume an
// early response may be present.
auto sock = strm.socket();
if (sock != INVALID_SOCKET && detail::select_read(sock, 0, 0) > 0) {
return false;
}
// Fallback to stream-level check for non-socket streams or when the
// socket isn't reporting readable. Avoid using `is_readable()` for
// SSL, since `SSL_pending()` may report buffered records that do not
// indicate a complete application-level response yet.
if (!is_ssl() && strm.is_readable()) { return false; }
auto now = std::chrono::high_resolution_clock::now();
auto elapsed =
std::chrono::duration_cast<std::chrono::milliseconds>(now - start)
.count();
if (elapsed >= CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_TIMEOUT_MSECOND) {
break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
}
#endif
// Body
if (skip_body) { return true; }
return write_request_body(strm, req, error);
}
bool ClientImpl::write_request_body(Stream &strm, Request &req,
Error &error) {
if (req.body.empty()) {
return write_content_with_provider(strm, req, error);
}
@@ -7758,8 +7836,20 @@ void ClientImpl::output_error_log(const Error &err,
bool ClientImpl::process_request(Stream &strm, Request &req,
Response &res, bool close_connection,
Error &error) {
// Send request
if (!write_request(strm, req, close_connection, error)) { return false; }
// Auto-add Expect: 100-continue for large bodies
if (CPPHTTPLIB_EXPECT_100_THRESHOLD > 0 && !req.has_header("Expect")) {
auto body_size = req.body.empty() ? req.content_length_ : req.body.size();
if (body_size >= CPPHTTPLIB_EXPECT_100_THRESHOLD) {
req.set_header("Expect", "100-continue");
}
}
// Check for Expect: 100-continue
auto expect_100_continue = req.get_header_value("Expect") == "100-continue";
// Send request (skip body if using Expect: 100-continue)
auto write_request_success =
write_request(strm, req, close_connection, error, expect_100_continue);
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
if (is_ssl()) {
@@ -7774,14 +7864,48 @@ bool ClientImpl::process_request(Stream &strm, Request &req,
}
#endif
// Handle Expect: 100-continue with timeout
if (expect_100_continue && CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND > 0) {
time_t sec = CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND / 1000;
time_t usec = (CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND % 1000) * 1000;
auto ret = detail::select_read(strm.socket(), sec, usec);
if (ret <= 0) {
// Timeout or error: send body anyway (server didn't respond in time)
if (!write_request_body(strm, req, error)) { return false; }
expect_100_continue = false; // Switch to normal response handling
}
}
// Receive response and headers
if (!read_response_line(strm, req, res) ||
// When using Expect: 100-continue, don't auto-skip `100 Continue` response
if (!read_response_line(strm, req, res, !expect_100_continue) ||
!detail::read_headers(strm, res.headers)) {
error = Error::Read;
if (write_request_success) { error = Error::Read; }
output_error_log(error, &req);
return false;
}
if (!write_request_success) { return false; }
// Handle Expect: 100-continue response
if (expect_100_continue) {
if (res.status == StatusCode::Continue_100) {
// Server accepted, send the body
if (!write_request_body(strm, req, error)) { return false; }
// Read the actual response
res.headers.clear();
res.body.clear();
if (!read_response_line(strm, req, res) ||
!detail::read_headers(strm, res.headers)) {
error = Error::Read;
output_error_log(error, &req);
return false;
}
}
// If not 100 Continue, server returned an error; proceed with that response
}
// Body
if ((res.status != StatusCode::NoContent_204) && req.method != "HEAD" &&
req.method != "CONNECT") {
@@ -9543,7 +9667,7 @@ bool SSLClient::load_certs() {
last_openssl_error_ = ERR_get_error();
ret = false;
}
} else {
} else if (!ca_cert_store_) {
auto loaded = false;
#ifdef _WIN32
loaded =
@@ -9790,7 +9914,11 @@ bool SSLClient::verify_host_with_common_name(X509 *server_cert) const {
bool SSLClient::check_host_name(const char *pattern,
size_t pattern_len) const {
if (host_.size() == pattern_len && host_ == pattern) { return true; }
// Exact match (case-insensitive)
if (host_.size() == pattern_len &&
detail::case_ignore::equal(host_, std::string(pattern, pattern_len))) {
return true;
}
// Wildcard match
// https://bugs.launchpad.net/ubuntu/+source/firefox-3.0/+bug/376484
@@ -9805,9 +9933,23 @@ bool SSLClient::check_host_name(const char *pattern,
auto itr = pattern_components.begin();
for (const auto &h : host_components_) {
auto &p = *itr;
if (p != h && p != "*") {
auto partial_match = (p.size() > 0 && p[p.size() - 1] == '*' &&
!p.compare(0, p.size() - 1, h));
if (!httplib::detail::case_ignore::equal(p, h) && p != "*") {
bool partial_match = false;
if (!p.empty() && p[p.size() - 1] == '*') {
const auto prefix_length = p.size() - 1;
if (prefix_length == 0) {
partial_match = true;
} else if (h.size() >= prefix_length) {
partial_match =
std::equal(p.begin(),
p.begin() + static_cast<std::string::difference_type>(
prefix_length),
h.begin(), [](const char ca, const char cb) {
return httplib::detail::case_ignore::to_lower(ca) ==
httplib::detail::case_ignore::to_lower(cb);
});
}
}
if (!partial_match) { return false; }
}
++itr;
+93 -9
View File
@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.30.1"
#define CPPHTTPLIB_VERSION_NUM "0x001E01"
#define CPPHTTPLIB_VERSION "0.30.2"
#define CPPHTTPLIB_VERSION_NUM "0x001E02"
/*
* Platform compatibility check
@@ -98,6 +98,22 @@
#define CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND 0
#endif
#ifndef CPPHTTPLIB_EXPECT_100_THRESHOLD
#define CPPHTTPLIB_EXPECT_100_THRESHOLD 1024
#endif
#ifndef CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND
#define CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND 1000
#endif
#ifndef CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_THRESHOLD
#define CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_THRESHOLD (1024 * 1024)
#endif
#ifndef CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_TIMEOUT_MSECOND
#define CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_TIMEOUT_MSECOND 50
#endif
#ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND
#define CPPHTTPLIB_IDLE_INTERVAL_SECOND 0
#endif
@@ -286,8 +302,10 @@ using socket_t = int;
#include <atomic>
#include <cassert>
#include <cctype>
#include <chrono>
#include <climits>
#include <condition_variable>
#include <cstdlib>
#include <cstring>
#include <errno.h>
#include <exception>
@@ -305,6 +323,7 @@ using socket_t = int;
#include <sstream>
#include <string>
#include <sys/stat.h>
#include <system_error>
#include <thread>
#include <unordered_map>
#include <unordered_set>
@@ -494,6 +513,69 @@ private:
bool execute_on_destruction;
};
// Simple from_chars implementation for integer and double types (C++17
// substitute)
template <typename T> struct from_chars_result {
const char *ptr;
std::errc ec;
};
template <typename T>
inline from_chars_result<T> from_chars(const char *first, const char *last,
T &value, int base = 10) {
value = 0;
const char *p = first;
bool negative = false;
if (p != last && *p == '-') {
negative = true;
++p;
}
if (p == last) { return {first, std::errc::invalid_argument}; }
T result = 0;
for (; p != last; ++p) {
char c = *p;
int digit = -1;
if ('0' <= c && c <= '9') {
digit = c - '0';
} else if ('a' <= c && c <= 'z') {
digit = c - 'a' + 10;
} else if ('A' <= c && c <= 'Z') {
digit = c - 'A' + 10;
} else {
break;
}
if (digit < 0 || digit >= base) { break; }
if (result > ((std::numeric_limits<T>::max)() - digit) / base) {
return {p, std::errc::result_out_of_range};
}
result = result * base + digit;
}
if (p == first || (negative && p == first + 1)) {
return {first, std::errc::invalid_argument};
}
value = negative ? -result : result;
return {p, std::errc{}};
}
// from_chars for double (simple wrapper for strtod)
inline from_chars_result<double> from_chars(const char *first, const char *last,
double &value) {
std::string s(first, last);
char *endptr = nullptr;
errno = 0;
value = std::strtod(s.c_str(), &endptr);
if (endptr == s.c_str()) { return {first, std::errc::invalid_argument}; }
if (errno == ERANGE) {
return {first + (endptr - s.c_str()), std::errc::result_out_of_range};
}
return {first + (endptr - s.c_str()), std::errc{}};
}
} // namespace detail
enum SSLVerifierResponse {
@@ -1848,10 +1930,11 @@ private:
Result send_(Request &&req);
socket_t create_client_socket(Error &error) const;
bool read_response_line(Stream &strm, const Request &req,
Response &res) const;
bool read_response_line(Stream &strm, const Request &req, Response &res,
bool skip_100_continue = true) const;
bool write_request(Stream &strm, Request &req, bool close_connection,
Error &error);
Error &error, bool skip_body = false);
bool write_request_body(Stream &strm, Request &req, Error &error);
void prepare_default_headers(Request &r, bool for_stream,
const std::string &ct);
bool redirect(Request &req, Response &res, Error &error);
@@ -3243,10 +3326,11 @@ private:
msg.id = value;
} else if (field == "retry") {
// Parse retry interval in milliseconds
try {
retry_ms = std::stoi(value);
} catch (...) {
// Invalid retry value, ignore
{
int v = 0;
auto res =
detail::from_chars(value.data(), value.data() + value.size(), v);
if (res.ec == std::errc{}) { retry_ms = v; }
}
}
// Unknown fields are ignored per SSE spec