Compare commits

..

8 Commits

Author SHA1 Message Date
Adrien Gallouët 84de01a1f1 llama : use LLM_KV for quantization_version & file_type (#24802)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-06-20 20:07:01 +02:00
Xuan-Son Nguyen 75f460ac28 arg: try fixing test-args-parser randomly fails (#24826)
* arg: try fixing test-args-parser randomly fails

* return ref

* try triggering the workflow

* exception wrapper

* wip

* test

* test 2

* arg: guard win32 utf8 argv override

make_utf8_argv rebuilds argv from GetCommandLineW to fix UTF-8 handling of
non-ASCII arguments on Windows. The override runs unconditionally inside
common_params_parse, so it also clobbers a programmatic argv passed by a
caller. test-arg-parser builds a synthetic argv but then sees the real
process command line instead, so the model argument is never parsed and the
assert that expects success aborts via fastfail (0xC0000409). This shows up
as a random failure in the OpenVINO Windows workflow.

Only override argv when its length matches the caller's argc, so the UTF-8
repair still applies to real binaries while a programmatic argv stays intact.

---------

Co-authored-by: Pascal <admin@serveurperso.com>
2026-06-20 19:45:27 +02:00
Muhammad Salem 8452824611 release: add missing link for win opencl adreno arm64 (#24809) 2026-06-20 23:08:59 +08:00
Matti4 e27f308597 server: avoid forwarding auth headers in CORS proxy (#24373)
* server: avoid forwarding auth headers in CORS proxy

* format

* fix test

* fix e2e test

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2026-06-20 15:34:47 +02:00
Aldehir Rojas 67e9fd3b74 docker : prebuild web UI for s390x build [no release] (#24829) 2026-06-20 05:54:42 -05:00
davidrhodus 796f41bedc model : glm-dsa load DSA indexer tensors as optional (#24770)
GLM-5.2 ships the DSA "lightning indexer" on only a subset of layers (the
"full" layers; others omit it), but the GLM_DSA loader created the five
indexer tensors on every layer as required, so loading any GLM-5.2 GGUF
failed with e.g. `missing tensor 'blk.3.indexer.k_norm.weight'`.

GLM_DSA's graph is llama_model_deepseek2::graph (plain MLA) and does not use
the indexer tensors (indexer runtime not yet implemented), so they are
loaded-but-unused. Marking them TENSOR_NOT_REQUIRED lets layers without an
indexer load as nullptr and the model runs as full MLA attention.

DeepSeek-V3.2 (uniform indexer on all layers) is unaffected.
2026-06-20 13:48:24 +03:00
Adrien Gallouët 37a77fb057 ggml : optimize AMX (#24806)
Flatten the partition over n_batch * M so every thread participates in
the quantization

    | CPU                             | Model                         | Test   |   t/s OLD |   t/s NEW |   Speedup |
    |:--------------------------------|:------------------------------|:-------|----------:|----------:|----------:|
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw  | pp512  |    730.71 |    779.86 |      1.07 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw  | tg128  |     87.88 |     86.79 |      0.99 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | pp512  |    725.09 |   1023.31 |      1.41 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | tg128  |     83.64 |     83.62 |      1.00 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0              | pp512  |    820.51 |    924.05 |      1.13 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0              | tg128  |     90.59 |     92.46 |      1.02 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1              | pp512  |    776.88 |    872.79 |      1.12 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1              | tg128  |     89.39 |     90.94 |      1.02 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M            | pp512  |    719.28 |   1009.27 |      1.40 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M            | tg128  |     80.62 |     80.86 |      1.00 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S            | pp512  |    732.29 |   1077.29 |      1.47 |
    | Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S            | tg128  |     86.42 |     83.53 |      0.97 |

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-06-20 13:43:06 +03:00
Sigbjørn Skjæret f4043fec01 convert : more consistent handling of rope_parameters (#24833) 2026-06-20 13:42:36 +03:00
33 changed files with 270 additions and 96 deletions
-16
View File
@@ -4,20 +4,6 @@ ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG NODE_VERSION=24
FROM docker.io/node:$NODE_VERSION AS web
ARG APP_VERSION
WORKDIR /app/tools/ui
COPY tools/ui/package.json tools/ui/package-lock.json ./
RUN npm ci
COPY tools/ui/ ./
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
### Build Llama.cpp stage
FROM docker.io/gcc:${GCC_VERSION} AS build
@@ -34,8 +20,6 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
WORKDIR /app
COPY . .
COPY --from=web /app/tools/ui/dist tools/ui/dist
RUN --mount=type=cache,target=/root/.ccache \
--mount=type=cache,target=/app/build \
cmake -S . -B build -G Ninja \
-1
View File
@@ -11,7 +11,6 @@
build*/
tools/ui/node_modules/
tools/ui/dist/
models/*
+16 -2
View File
@@ -58,6 +58,13 @@ jobs:
git tag ${{ steps.srctag.outputs.name }} || exit 0
git push origin ${{ steps.srctag.outputs.name }} || exit 0
build_ui:
name: Build UI
needs: create_tag
uses: ./.github/workflows/ui-build.yml
with:
hf_ui_version: ${{ needs.create_tag.outputs.source_tag }}
prepare_matrices:
name: Prepare Docker matrices
runs-on: ubuntu-24.04
@@ -79,7 +86,7 @@ jobs:
[
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
@@ -135,7 +142,7 @@ jobs:
push_to_registry:
name: Push Docker image to Docker Registry
needs: [prepare_matrices, create_tag]
needs: [prepare_matrices, create_tag, build_ui]
runs-on: ${{ matrix.config.runs_on }}
strategy:
@@ -150,6 +157,13 @@ jobs:
fetch-depth: 0
ref: ${{ needs.create_tag.outputs.source_tag }}
- name: Download prebuilt UI
if: ${{ matrix.config.prebuilt_ui == true }}
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
with:
name: ui-build
path: tools/ui/dist
- name: Set up QEMU
if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
+1
View File
@@ -1627,6 +1627,7 @@ jobs:
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
- [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip)
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
+3 -3
View File
@@ -924,8 +924,8 @@ static utf8_argv make_utf8_argv() {
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
#ifdef _WIN32
auto utf8 = make_utf8_argv();
if (!utf8.ptrs.empty()) {
argc = static_cast<int>(utf8.buf.size());
// repair argv only when it matches the process command line
if (static_cast<int>(utf8.buf.size()) == argc) {
argv = utf8.ptrs.data();
}
#endif
@@ -2897,7 +2897,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.server_tools = parse_csv_row(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
add_opt(common_arg(
add_opt(common_arg(
{"-ag", "--agent"},
{"-no-ag", "--no-agent"},
"whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
+1 -1
View File
@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
if (rope_dim := hparams.get("head_dim")) is None:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+7 -1
View File
@@ -1119,8 +1119,10 @@ class TextModel(ModelBase):
rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
# Ensure global params are mirrored in rope_parameters
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
if local_rope_theta is not None:
self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
@@ -1128,6 +1130,10 @@ class TextModel(ModelBase):
self.rope_parameters["rope_theta"] = rope_theta
if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
self.rope_parameters["rope_type"] = rope_type
if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings
@classmethod
def __init_subclass__(cls):
+1 -1
View File
@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
rope_dim = self.hparams["attention_dim"]
else:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
self.gguf_writer.add_add_bos_token(False)
rope_freq = 10000
if "rope_ratio" in self.hparams:
+1 -1
View File
@@ -161,7 +161,7 @@ class DeciModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
+3 -3
View File
@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):
assert (hparams["activation_function"] == "silu")
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
rotary_factor = self.rope_parameters.get("partial_rotary_factor")
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
factor = rope_params.get("factor", 16.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
+1 -1
View File
@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
self.gguf_writer.add_head_count_kv(value_arr)
# handle n_rot differently for global vs swa layers
partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
self.gguf_writer.add_rope_dimension_count(n_rot_full)
+2 -2
View File
@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
)
self.gguf_writer.add_rope_dimension_count(
int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
)
# MoE parameters - Use only routed expert count (shared experts handled separately)
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
super().set_gguf_parameters()
rope_dim = self.hparams["qk_rope_head_dim"]
partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))
# NextN/MTP prediction layers
+1 -1
View File
@@ -289,7 +289,7 @@ class LlamaModel(TextModel):
factor = rope_params.get("factor", 8.0)
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
+1 -1
View File
@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
+6 -10
View File
@@ -32,11 +32,9 @@ class MiniCPMModel(TextModel):
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if long_factors or short_factors:
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
@@ -85,13 +83,11 @@ class MiniCPM3Model(TextModel):
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if long_factors or short_factors:
rope_dims = self.hparams["qk_rope_head_dim"]
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+4 -3
View File
@@ -125,17 +125,18 @@ class NemotronModel(TextModel):
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
# * Partial RoPE
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
rot_pct = self.rope_parameters["partial_rotary_factor"]
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
# * RopeScaling for Nemotron
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
if factor is None:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
else:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
self.gguf_writer.add_rope_scaling_factor(factor)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
+9 -11
View File
@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
model_arch = gguf.MODEL_ARCH.PHI2
def set_gguf_parameters(self):
rot_pct = self.find_hparam(["partial_rotary_factor"])
rot_pct = self.rope_parameters["partial_rotary_factor"]
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
rms_eps = self.find_hparam(["rms_norm_eps"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
rope_dims = int(rot_pct * n_embd) // n_head
self.gguf_writer.add_context_length(max_pos_embds)
@@ -174,18 +174,19 @@ class Phi3MiniModel(TextModel):
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
rope_dims = int(rot_pct * n_embd) // n_head
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is None:
long_factors = self.rope_parameters.get('long_factor')
short_factors = self.rope_parameters.get('short_factor')
if not long_factors:
return
scale = max_pos_embds / orig_max_pos_embds
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type')
@@ -198,9 +199,6 @@ class Phi3MiniModel(TextModel):
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+1 -1
View File
@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+1 -1
View File
@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
rotary_factor = self.rope_parameters["partial_rotary_factor"]
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+1 -1
View File
@@ -314,7 +314,7 @@ class Step35Model(TextModel):
factor = float(rope_params.get("factor", 8.0))
low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
+5 -6
View File
@@ -2417,15 +2417,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
parallel_for_ggml(params, n_batch, [&](int begin, int end) {
for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
for (int idx = begin; idx < end; ++idx) {
int batch_idx = idx / M;
int m = idx % M;
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
const float * A_data = (const float *)((const char *)src1->data + src1_offset);
char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
for (int m = 0; m < M; ++m) {
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
}
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
}
});
});
+2 -2
View File
@@ -932,8 +932,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// copy the KV pairs from the input file
gguf_set_kv (ctx_out.get(), ml.metadata);
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION).c_str(), GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_FILE_TYPE).c_str(), ftype);
// Remove split metadata
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
+5 -5
View File
@@ -101,11 +101,11 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
// DSA indexer
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags);
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags);
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags);
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags);
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
if (i < (int) hparams.n_layer_dense_lead) {
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+11 -1
View File
@@ -10,7 +10,7 @@
#undef NDEBUG
#include <cassert>
int main(void) {
static void test(void) {
common_params params;
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
@@ -210,3 +210,13 @@ int main(void) {
printf("test-arg-parser: all tests OK\n\n");
}
// Entry point: runs the test body and turns an escaping exception into a
// message on stderr plus exit code 1, instead of an abnormal termination.
// Returns 0 when test() returns normally.
int main(void) {
    try {
        test();
    } catch (const std::exception & e) {
        // catch by const reference: the handler only reads the message
        fprintf(stderr, "test-arg-parser: exception: %s\n", e.what());
        return 1;
    } catch (...) {
        // anything not derived from std::exception would otherwise reach std::terminate
        fprintf(stderr, "test-arg-parser: unknown exception\n");
        return 1;
    }
    return 0;
}
+19 -3
View File
@@ -7,9 +7,18 @@
#include <unordered_set>
#include <list>
#include <map>
#include <algorithm>
#include <cctype>
#include "server-http.h"
// Returns a copy of `header` in which every byte has been passed through
// std::tolower. Each byte is converted to unsigned char before the call,
// because std::tolower is only defined for values representable as
// unsigned char (or EOF).
static std::string proxy_header_to_lower(std::string header) {
    for (char & ch : header) {
        ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    }
    return header;
}
static server_http_res_ptr proxy_request(const server_http_req & req, std::string method) {
std::string target_url = req.get_param("url");
common_http_url parsed_url = common_http_parse_url(target_url);
@@ -33,11 +42,18 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
std::map<std::string, std::string> headers;
const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
for (auto [key, value] : req.headers) {
auto new_key = key;
if (string_starts_with(new_key, "x-proxy-header-")) {
string_replace_all(new_key, "x-proxy-header-", "");
const std::string lowered_key = proxy_header_to_lower(key);
if (!string_starts_with(lowered_key, proxy_header_prefix)) {
continue;
}
auto new_key = key.substr(proxy_header_prefix.size());
if (new_key.empty()) {
continue;
}
headers[new_key] = value;
}
+45
View File
@@ -1,6 +1,8 @@
import pytest
from openai import OpenAI
from utils import *
import threading
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
server = ServerPreset.tinyllama2()
@@ -105,6 +107,49 @@ def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
assert res.headers[cors_header] == cors_header_value
def test_cors_proxy_only_forwards_explicit_proxy_headers():
    """Check that /cors-proxy forwards only explicitly prefixed headers.

    A throwaway local HTTP server records the headers it receives. The request
    sent through the proxy mixes credential-style headers with two headers
    carrying the ``x-llama-server-proxy-header-`` prefix; only the prefixed
    ones are expected at the target, under their un-prefixed names.
    """

    class HeaderRecorder(BaseHTTPRequestHandler):
        def do_GET(self):
            # stash the received headers on the server object for later inspection
            self.server.captured_headers = dict(self.headers)
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b"ok")

        def log_message(self, format, *args):
            # silence the default per-request logging
            pass

    upstream = ThreadingHTTPServer(("127.0.0.1", 0), HeaderRecorder)
    upstream.captured_headers = {}
    upstream_thread = threading.Thread(target=upstream.serve_forever, daemon=True)
    upstream_thread.start()

    try:
        server = ServerPreset.tinyllama2()
        server.api_key = TEST_API_KEY
        server.ui_mcp_proxy = True
        server.start()

        proxy_path = f"/cors-proxy?url=http://127.0.0.1:{upstream.server_port}/capture"
        outgoing = {
            "Authorization": f"Bearer {TEST_API_KEY}",
            "Proxy-Authorization": "Basic secret",
            "X-Api-Key": TEST_API_KEY,
            "Cookie": "session=secret",
            "x-llama-server-proxy-header-accept": "application/json",
            "x-llama-server-proxy-header-authorization": "Bearer explicit",
        }
        res = server.make_request("GET", proxy_path, headers=outgoing)

        assert res.status_code == 200

        # header names are compared case-insensitively
        seen = {name.lower(): val for name, val in upstream.captured_headers.items()}
        assert seen["accept"] == "application/json"
        assert seen["authorization"] == "Bearer explicit"
        assert "proxy-authorization" not in seen
        assert "x-api-key" not in seen
        assert "cookie" not in seen
    finally:
        upstream.shutdown()
        upstream.server_close()
@pytest.mark.parametrize(
"media_path, image_url, success",
[
+3
View File
@@ -51,6 +51,9 @@ export const EXPECTED_THEMED_ICON_PAIR_COUNT = 2;
/** CORS proxy URL query parameter name */
export const CORS_PROXY_URL_PARAM = 'url';
/**
 * Header prefix for headers that should be forwarded by the CORS proxy.
 * The proxy forwards only headers whose name starts with this prefix, and
 * removes the prefix from the name before sending them to the target.
 */
export const CORS_PROXY_HEADER_PREFIX = 'x-llama-server-proxy-header-';
/** Number of trailing characters to keep visible when partially redacting mcp-session-id */
export const MCP_SESSION_ID_VISIBLE_CHARS = 5;
+17 -6
View File
@@ -16,6 +16,7 @@ import {
DEFAULT_MCP_CONFIG,
DEFAULT_CLIENT_VERSION,
DEFAULT_IMAGE_MIME_TYPE,
CORS_PROXY_HEADER_PREFIX,
MCP_PARTIAL_REDACT_HEADERS,
CORS_PROXY_ENDPOINT
} from '$lib/constants';
@@ -133,6 +134,20 @@ export class MCPService {
return details;
}
/**
 * Copies every entry of `headers` into `requestHeaders`.
 *
 * When `useProxy` is true, a header whose name does not already start with
 * CORS_PROXY_HEADER_PREFIX is stored under the prefixed name; in every other
 * case the name is used unchanged. Existing entries with the same name in
 * `requestHeaders` are overwritten.
 */
private static addRequestHeaders(
	requestHeaders: Headers,
	headers: HeadersInit,
	useProxy: boolean
) {
	for (const [name, value] of new Headers(headers).entries()) {
		let targetName = name;
		if (useProxy && !name.toLowerCase().startsWith(CORS_PROXY_HEADER_PREFIX)) {
			targetName = `${CORS_PROXY_HEADER_PREFIX}${name}`;
		}
		requestHeaders.set(targetName, value);
	}
}
private static summarizeError(error: unknown): Record<string, unknown> {
if (error instanceof Error) {
return {
@@ -271,15 +286,11 @@ export class MCPService {
const requestHeaders = new Headers(baseInit.headers);
if (typeof Request !== 'undefined' && input instanceof Request) {
for (const [key, value] of input.headers.entries()) {
requestHeaders.set(key, value);
}
this.addRequestHeaders(requestHeaders, input.headers, useProxy);
}
if (init?.headers) {
for (const [key, value] of new Headers(init.headers).entries()) {
requestHeaders.set(key, value);
}
this.addRequestHeaders(requestHeaders, init.headers, useProxy);
}
const request = this.createDiagnosticRequestDetails(
+12 -3
View File
@@ -1,5 +1,5 @@
import { config } from '$lib/stores/settings.svelte';
import { REDACTED_HEADERS } from '$lib/constants';
import { CORS_PROXY_HEADER_PREFIX, REDACTED_HEADERS } from '$lib/constants';
import { redactValue } from './redact';
/**
@@ -52,11 +52,20 @@ export function sanitizeHeaders(
for (const [key, value] of normalized.entries()) {
const normalizedKey = key.toLowerCase();
const partialChars = partialRedactHeaders?.get(normalizedKey);
const unproxiedKey = normalizedKey.startsWith(CORS_PROXY_HEADER_PREFIX)
? normalizedKey.slice(CORS_PROXY_HEADER_PREFIX.length)
: normalizedKey;
const partialChars =
partialRedactHeaders?.get(normalizedKey) ?? partialRedactHeaders?.get(unproxiedKey);
if (partialChars !== undefined) {
sanitized[key] = redactValue(value, partialChars);
} else if (REDACTED_HEADERS.has(normalizedKey) || redactedHeaders.has(normalizedKey)) {
} else if (
REDACTED_HEADERS.has(normalizedKey) ||
REDACTED_HEADERS.has(unproxiedKey) ||
redactedHeaders.has(normalizedKey) ||
redactedHeaders.has(unproxiedKey)
) {
sanitized[key] = redactValue(value);
} else {
sanitized[key] = value;
+6 -2
View File
@@ -3,7 +3,11 @@
*/
import { base } from '$app/paths';
import { CORS_PROXY_ENDPOINT, CORS_PROXY_URL_PARAM } from '$lib/constants';
import {
CORS_PROXY_ENDPOINT,
CORS_PROXY_HEADER_PREFIX,
CORS_PROXY_URL_PARAM
} from '$lib/constants';
/**
* Build a proxied URL that routes through llama-server's CORS proxy.
@@ -28,7 +32,7 @@ export function buildProxiedHeaders(headers: Record<string, string>): Record<str
const proxiedHeaders: Record<string, string> = {};
for (const [key, value] of Object.entries(headers)) {
proxiedHeaders[`x-proxy-header-${key}`] = value;
proxiedHeaders[`${CORS_PROXY_HEADER_PREFIX}${key}`] = value;
}
return proxiedHeaders;
+5 -5
View File
@@ -39,8 +39,8 @@ test.describe('PWA Service Worker', () => {
const swContent = await swResponse.text();
// Precache contains SvelteKit content-hashed bundle paths
expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9-]+\.js"/);
expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9-]+\.css"/);
expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
expect(swContent).toMatch(/"manifest\.webmanifest"/);
expect(swContent).toMatch(/"_app\/version\.json"/);
expect(swContent).toMatch(/NavigationRoute/);
@@ -99,8 +99,8 @@ test.describe('PWA Service Worker', () => {
const html = await response.text();
// SvelteKit outputs content-hashed bundle names in _app/immutable/
expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9-]+\.js"/);
expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9-]+\.css"/);
expect(html).toMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9-]+\.js"\)/);
expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/);
expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
expect(html).toMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"\)/);
});
});
+62 -2
View File
@@ -3,6 +3,7 @@ import { Client } from '@modelcontextprotocol/sdk/client';
import { MCPService } from '$lib/services/mcp.service';
import { MCPConnectionPhase, MCPTransportType } from '$lib/enums';
import type { MCPConnectionLog, MCPServerConfig } from '$lib/types';
import { CORS_PROXY_HEADER_PREFIX } from '$lib/constants';
type DiagnosticFetchFactory = (
serverName: string,
@@ -16,11 +17,12 @@ type DiagnosticFetchFactory = (
const createDiagnosticFetch = (
config: MCPServerConfig,
onLog?: (log: MCPConnectionLog) => void,
baseInit: RequestInit = {}
baseInit: RequestInit = {},
useProxy = false
) =>
(
MCPService as unknown as { createDiagnosticFetch: DiagnosticFetchFactory }
).createDiagnosticFetch('test-server', config, baseInit, new URL(config.url), false, onLog);
).createDiagnosticFetch('test-server', config, baseInit, new URL(config.url), useProxy, onLog);
describe('MCPService', () => {
afterEach(() => {
@@ -94,6 +96,64 @@ describe('MCPService', () => {
});
});
// Verifies that, with useProxy enabled, headers supplied per request are sent under the
// CORS proxy prefix while headers supplied through baseInit are sent under their original names.
it('wraps dynamic request headers when using the CORS proxy', async () => {
const logs: MCPConnectionLog[] = [];
// Prefixed header names the test expects to see on the outgoing request.
const proxiedAuthToken = `${CORS_PROXY_HEADER_PREFIX}x-auth-token`;
const proxiedContentType = `${CORS_PROXY_HEADER_PREFIX}content-type`;
const proxiedSessionId = `${CORS_PROXY_HEADER_PREFIX}mcp-session-id`;
const response = new Response('{}', {
status: 200,
headers: { 'content-type': 'application/json' }
});
// Replace the global fetch so the outgoing request can be inspected without network access.
const fetchMock = vi.fn().mockResolvedValue(response);
vi.stubGlobal('fetch', fetchMock);
const config: MCPServerConfig = {
url: 'https://example.com/mcp',
transport: MCPTransportType.STREAMABLE_HTTP,
useProxy: true
};
const controller = createDiagnosticFetch(
config,
(log) => logs.push(log),
{
// baseInit headers: one unprefixed name and one name that already carries the proxy prefix.
headers: {
authorization: 'Bearer llama-server-key',
[proxiedAuthToken]: 'target-token'
}
},
true
);
// Per-request headers are passed unprefixed; the assertions below expect them to be wrapped.
await controller.fetch('http://localhost:8080/cors-proxy?url=https%3A%2F%2Fexample.com%2Fmcp', {
method: 'POST',
headers: {
'content-type': 'application/json',
'mcp-session-id': 'session-request-12345'
},
body: '{}'
});
// Second argument of the first fetch call is the RequestInit that was actually sent.
const sentHeaders = fetchMock.mock.calls[0]?.[1]?.headers as Headers;
// baseInit headers must arrive exactly as given.
expect(sentHeaders.get('authorization')).toBe('Bearer llama-server-key');
expect(sentHeaders.get(proxiedAuthToken)).toBe('target-token');
// Per-request headers must only exist under their prefixed names.
expect(sentHeaders.get(proxiedContentType)).toBe('application/json');
expect(sentHeaders.get(proxiedSessionId)).toBe('session-request-12345');
expect(sentHeaders.has('content-type')).toBe(false);
expect(sentHeaders.has('mcp-session-id')).toBe(false);
// The diagnostic log must redact sensitive values even when the header name is prefixed;
// the session id keeps only its last five characters.
expect(logs[0].details).toMatchObject({
request: {
headers: {
authorization: '[redacted]',
[proxiedAuthToken]: '[redacted]',
[proxiedSessionId]: '....12345'
}
}
});
});
it('partially redacts mcp-session-id in diagnostic request and response logs', async () => {
const logs: MCPConnectionLog[] = [];
const response = new Response('{}', {
@@ -1,5 +1,6 @@
import { describe, expect, it } from 'vitest';
import { sanitizeHeaders } from '$lib/utils/api-headers';
import { CORS_PROXY_HEADER_PREFIX } from '$lib/constants';
describe('sanitizeHeaders', () => {
it('returns empty object for undefined input', () => {
@@ -52,4 +53,21 @@ describe('sanitizeHeaders', () => {
const result = sanitizeHeaders(headers, ['X-CUSTOM-TOKEN']);
expect(result['x-custom-token']).toBe('[redacted]');
});
// Verifies that redaction rules written for unprefixed header names also apply when the
// same header arrives under the CORS proxy prefix.
it('redacts proxied sensitive and custom target headers', () => {
const proxiedAuthorization = `${CORS_PROXY_HEADER_PREFIX}authorization`;
const proxiedSessionId = `${CORS_PROXY_HEADER_PREFIX}mcp-session-id`;
const proxiedVendorKey = `${CORS_PROXY_HEADER_PREFIX}x-vendor-key`;
// All three inputs use only the prefixed form of the header name.
const headers = new Headers({
[proxiedAuthorization]: 'Bearer secret',
[proxiedSessionId]: 'session-12345',
[proxiedVendorKey]: 'vendor-secret'
});
// Both the partial-redaction map and the custom redaction list use unprefixed names.
const partial = new Map([['mcp-session-id', 5]]);
const result = sanitizeHeaders(headers, ['x-vendor-key'], partial);
// Result keys stay prefixed; values are fully redacted, except the session id which
// keeps its last five characters.
expect(result[proxiedAuthorization]).toBe('[redacted]');
expect(result[proxiedSessionId]).toBe('....12345');
expect(result[proxiedVendorKey]).toBe('[redacted]');
});
});