Compare commits

..

5 Commits

Author SHA1 Message Date
Georgi Gerganov 3c6391e748 speculative-simple : free batch on exit (#17985) 2025-12-13 09:48:34 +02:00
Sigbjørn Skjæret 8e4d678528 common : skip model validation when --completion-bash is requested (#17975) 2025-12-13 08:40:50 +01:00
Jeff Bolz 07a10c1090 vulkan: Allow non-pow2 n_experts in topk_moe (#17872) 2025-12-13 08:40:04 +01:00
Sigbjørn Skjæret 2bc94e7928 add llama-completion to completion-bash executables (#17976) 2025-12-13 08:35:50 +01:00
Daniel Bevenius fd1085ffb7 model-conversion : use CONVERTED_MODEL value for converted model [no ci] (#17984)
* model-conversion : use CONVERTED_MODEL value for converted model [no ci]

This commit updates the model verification scripts to use the
CONVERTED_MODEL environment variable instead of using the MODEL_PATH
(the original model path) as the basis for the converted model file
name.

The motivation for this that currently if the converted model file name
differs from the original model directory/name the verification scripts
will look for the wrong .bin files that were generating when running the
models.
For example, the following steps were not possible:
```console
(venv) $ huggingface-cli download google/gemma-3-270m-it --local-dir ggml-org/gemma-3-270m
(venv) $ python3 convert_hf_to_gguf.py ggml-org/gemma-3-270m --outfile test-bf16.gguf --outtype bf16
(venv) $ cd examples/model-conversion/
(venv) $ export MODEL_PATH=../../ggml-org/gemma-3-270m
(venv) $ export CONVERTED_MODEL=../../test-bf16.gguf
(venv) $ make causal-verify-logits
...
Data saved to data/llamacpp-test-bf16.bin
Data saved to data/llamacpp-test-bf16.txt
Error: llama.cpp logits file not found: data/llamacpp-gemma-3-270m.bin
Please run scripts/run-converted-model.sh first to generate this file.
make: *** [Makefile:62: causal-verify-logits] Error 1
```

With the changes in this commit, the above steps will now work as
expected.
2025-12-13 08:34:26 +01:00
10 changed files with 67 additions and 29 deletions
+2 -1
View File
@@ -504,7 +504,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
throw std::invalid_argument("error: --model is required\n");
}
@@ -639,6 +639,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-batched-bench",
"llama-bench",
"llama-cli",
"llama-completion",
"llama-convert-llama2c-to-ggml",
"llama-cvector-generator",
"llama-embedding",
@@ -1,10 +1,13 @@
#!/usr/bin/env python3
import numpy as np
import sys
import os
import numpy as np
from pathlib import Path
# Add utils directory to path for direct script execution
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
from common import get_model_name_from_env_path # type: ignore[import-not-found]
def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE"""
@@ -35,20 +38,13 @@ def quick_logits_check(pytorch_file, llamacpp_file):
return True
def main():
model_path = os.getenv('MODEL_PATH')
if not model_path:
print("Error: MODEL_PATH environment variable not set")
sys.exit(1)
if not os.path.exists(model_path):
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
model_name = os.path.basename(model_path)
model_name = get_model_name_from_env_path('MODEL_PATH')
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
print(f"Using converted model: {llamacpp_model_name}")
llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
if not pytorch_file.exists():
print(f"Error: PyTorch logits file not found: {pytorch_file}")
@@ -5,6 +5,7 @@ import sys
import os
import argparse
from pathlib import Path
from common import get_model_name_from_env_path # type: ignore[import-not-found]
def calculate_nmse(reference, test):
mse = np.mean((test - reference) ** 2)
@@ -67,11 +68,13 @@ def main():
parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
args = parser.parse_args()
model_name = os.path.basename(args.model_path)
model_name = get_model_name_from_env_path('MODEL_PATH')
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"
print(f"Model name: {model_name}")
print(f"PyTorch logits file: {pytorch_file}")
@@ -0,0 +1,20 @@
#!/usr/bin/env python3
import os
import sys
def get_model_name_from_env_path(env_path_name):
model_path = os.getenv(env_path_name)
if not model_path:
print(f"Error: {env_path_name} environment variable not set")
sys.exit(1)
if not os.path.exists(model_path):
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
name = os.path.basename(os.path.normpath(model_path))
if name.endswith(".gguf"):
name = name[:-5]
return name
@@ -255,6 +255,8 @@ int main(int argc, char ** argv) {
LOG_INF("target:\n\n");
common_perf_print(ctx_tgt, smpl);
llama_batch_free(batch_tgt);
common_sampler_free(smpl);
common_speculative_free(spec);
+14 -8
View File
@@ -757,7 +757,8 @@ struct vk_device_struct {
vk_pipeline pipeline_flash_attn_split_k_reduce;
vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT];
// [2] is for whether to take n_experts from spec constant (0) or push constant (1)
vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];
std::vector<vk_pipeline_ref> all_pipelines;
@@ -1149,6 +1150,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
struct vk_op_topk_moe_push_constants {
uint32_t n_rows;
uint32_t n_experts_push;
uint32_t n_expert_used;
float clamp_min;
float clamp_max;
@@ -4204,10 +4206,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true, device->subgroup_size);
for (uint32_t use_push = 0; use_push < 2; ++use_push) {
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0, use_push}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0, use_push}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1, use_push}, 1, true, true, device->subgroup_size);
}
}
for (auto &c : compiles) {
@@ -8554,7 +8558,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
GGML_ASSERT(idx < num_topk_moe_pipelines);
topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
return ctx->device->pipeline_topk_moe[idx][mode];
// use n_experts from push constant if it's not equal to the power of two spec constant
bool use_push = dst->ne[0] != (1u << idx);
return ctx->device->pipeline_topk_moe[idx][mode][use_push];
}
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
@@ -10158,6 +10164,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
vk_op_topk_moe_push_constants pc {};
pc.n_rows = n_rows;
pc.n_experts_push = n_experts;
pc.n_expert_used = n_expert_used;
if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
@@ -12832,8 +12839,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
}
const int n_expert = softmax->ne[0];
// n_expert must be a power of 2
if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) {
if (n_expert > (1 << (num_topk_moe_pipelines-1))) {
return false;
}
@@ -10,6 +10,7 @@
layout (push_constant) uniform parameter
{
uint n_rows;
uint n_experts_push;
uint n_expert_used;
float clamp_min;
float clamp_max;
@@ -18,11 +19,16 @@ layout (push_constant) uniform parameter
layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
layout(constant_id = 0) const uint WARP_SIZE = 32;
layout(constant_id = 1) const uint n_experts = 512;
layout(constant_id = 1) const uint n_experts_spec = 512;
layout(constant_id = 2) const bool with_norm = true;
layout(constant_id = 3) const bool late_softmax = false;
layout(constant_id = 4) const bool nexperts_use_push = false;
const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE);
layout (binding = 0, std430) readonly buffer Logits {float logits[];};
layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
@@ -94,7 +100,7 @@ void main() {
}
if (!late_softmax) {
softmax_warp_inplace(wt, n_experts, lane, false);
softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push);
}
// at this point, each thread holds a portion of softmax,
+1 -1
View File
@@ -1,5 +1,5 @@
{
"extraPaths": ["gguf-py"],
"extraPaths": ["gguf-py", "examples/model-conversion/scripts"],
"pythonVersion": "3.9",
"pythonPlatform": "All",
"reportUnusedImport": "warning",
+4
View File
@@ -7971,8 +7971,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (bool with_norm : {false, true}) {
test_cases.emplace_back(new test_topk_moe({8, 22, 1, 1}, 4, with_norm));
test_cases.emplace_back(new test_topk_moe({31, 22, 1, 1}, 8, with_norm));
test_cases.emplace_back(new test_topk_moe({32, 22, 1, 1}, 8, with_norm));
test_cases.emplace_back(new test_topk_moe({40, 22, 1, 1}, 8, with_norm));
test_cases.emplace_back(new test_topk_moe({71, 22, 1, 1}, 8, with_norm));
test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm));
test_cases.emplace_back(new test_topk_moe({129, 1, 1, 1}, 128, with_norm));
}
test_cases.emplace_back(new test_topk_moe({ 8, 22, 1, 1 }, 4, /*with_norm*/ false, /*delayed_softmax*/ true));