server: fix remote preset handling, add test (#24938 )

* server: add test for remote preset * fix remote preset handling * fix * fix test
vulkan: link ggml-cpu when GGML_VULKAN_CHECK_RESULTS / RUN_TESTS are enabled (#24444 )
2026-06-23 14:17:39 +02:00 · 2026-06-23 13:28:34 +02:00 · 2026-06-23 12:55:46 +02:00 · 2026-06-23 12:03:31 +02:00
18 changed files with 144 additions and 29 deletions
@@ -301,6 +301,8 @@ static handle_model_result common_params_handle_model(struct common_params_model
                                                      const common_download_opts & opts) {
    handle_model_result result;

+    // TODO @ngxson : refactor this into a new common_model_download_context
+
    if (!model.docker_repo.empty()) {
        model.path = common_docker_resolve_model(model.docker_repo);
    } else if (!model.hf_repo.empty()) {
@@ -396,7 +398,7 @@ static bool parse_bool_value(const std::string & value) {
 // CLI argument parsing functions
 //

-bool common_params_handle_models(common_params & params, llama_example curr_ex, common_download_callback * callback) {
+bool common_params_handle_models(common_params & params, llama_example curr_ex, const common_params_handle_models_params & handle_params) {
    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
                                         params.speculative.types.end(),
                                         COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
@@ -407,9 +409,10 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex,
    opts.skip_download   = params.skip_download;
    opts.download_mtp    = spec_type_draft_mtp;
    opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
+    opts.preset_only     = handle_params.preset_only;

-    if (callback) {
-        opts.callback = callback;
+    if (handle_params.callback) {
+        opts.callback = handle_params.callback;
    }

    // sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
@@ -596,7 +599,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

    if (!skip_model_download) {
        // handle model and download
-        common_params_handle_models(params, ctx_arg.ex);
+        common_params_handle_models(params, ctx_arg.ex, {});

        // model is required (except for server)
        // TODO @ngxson : maybe show a list of available models in CLI in this case
@@ -130,6 +130,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

+struct common_params_handle_models_params {
+    common_download_callback * callback = nullptr;
+    bool preset_only = false; // if true, only check & download remote preset (for router mode)
+};
+
 // populate model paths (main model, mmproj, etc) from -hf if necessary
 // return true if the model is ready to use
 // throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
@@ -137,7 +142,7 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
 bool common_params_handle_models(
    common_params & params,
    llama_example curr_ex,
-    common_download_callback * callback = nullptr);
+    const common_params_handle_models_params & handle_params);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
@@ -799,6 +799,7 @@ common_download_model_result common_download_model(const common_params_model  &

    bool download_mmproj = opts.download_mmproj;
    bool download_mtp = opts.download_mtp;
+    bool preset_only = opts.preset_only;
    bool is_hf = !model.hf_repo.empty();

    if (is_hf) {
@@ -806,7 +807,8 @@ common_download_model_result common_download_model(const common_params_model  &
        if (!hf.preset.path.empty()) {
            // if preset.ini exists, only download that file alone
            tasks.push_back({hf.preset.url, hf.preset.local_path});
-        } else {
+        } else if (!preset_only) {
+            // only add other files if we're NOT in preset-only mode (normal run, non-router)
            for (const auto & f : hf.model_files) {
                tasks.push_back({f.url, f.local_path});
            }
@@ -55,6 +55,7 @@ struct common_download_opts {
    bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
    bool download_mmproj = false;
    bool download_mtp = false;
+    bool preset_only = false; // if true, only check & download remote preset (for router mode)
    common_download_callback * callback = nullptr;
 };

@@ -96,6 +96,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "GraniteMoeHybridForCausalLM": "granite",
    "GraniteMoeSharedForCausalLM": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "Grok1ForCausalLM": "grok",
    "GrokForCausalLM": "grok",
    "GroveMoeForCausalLM": "grovemoe",
@@ -261,6 +262,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "GlmasrModel": "ultravox",
    "Granite4VisionForConditionalGeneration": "granite",
    "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
    "HunYuanVLForConditionalGeneration": "hunyuan",
    "Idefics3ForConditionalGeneration": "smolvlm",
    "InternVisionModel": "internvl",
@@ -348,6 +348,34 @@ class GraniteSpeechMmprojModel(MmprojModel):
        yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
+class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
+    """Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation"""
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
+        super().set_gguf_parameters()
+
+        # Add feature_layer if present in encoder config
+        if feature_layers := self.hparams_audio.get("cat_hidden_layers"):
+            self.gguf_writer.add_audio_feature_layers(feature_layers)
+            logger.info(f"gguf: audio feature_layers = {feature_layers}")
+
+            # Validate projector dimension matches concatenated encoder output
+            hidden_dim = self.hparams_audio["hidden_dim"]
+            expected_dim = hidden_dim * (len(feature_layers) + 1)
+            projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
+
+            if projector_dim != expected_dim:
+                raise ValueError(
+                    f"Projector encoder_hidden_size ({projector_dim}) does not match "
+                    f"expected concatenated dimension ({expected_dim}). "
+                    f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
+                )
+
+
@ModelBase.register("Granite4VisionForConditionalGeneration")
 class Granite4VisionMmprojModel(MmprojModel):
    has_vision_encoder = True
@@ -108,6 +108,9 @@ if (Vulkan_FOUND)

    if (GGML_VULKAN_CHECK_RESULTS)
        add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
+        # the result-checking path computes a CPU reference graph via
+        # ggml_graph_compute_with_ctx(), which is defined in ggml-cpu
+        target_link_libraries(ggml-vulkan PRIVATE ggml-cpu)
    endif()

    if (GGML_VULKAN_DEBUG)
@@ -129,6 +132,8 @@ if (Vulkan_FOUND)

    if (GGML_VULKAN_RUN_TESTS)
        add_compile_definitions(GGML_VULKAN_RUN_TESTS)
+        # the test path also calls ggml_graph_compute_with_ctx() (ggml-cpu)
+        target_link_libraries(ggml-vulkan PRIVATE ggml-cpu)
    endif()

    # Set up toolchain for host compilation whether cross-compiling or not
@@ -359,6 +359,7 @@ class Keys:
        CHUNK_SIZE          = "clip.audio.chunk_size"
        CONV_KERNEL_SIZE    = "clip.audio.conv_kernel_size"
        MAX_POS_EMB         = "clip.audio.max_pos_emb"
+        FEATURE_LAYERS      = "clip.audio.feature_layer" # Granite Speech Plus

        class Attention:
            HEAD_COUNT      = "clip.audio.attention.head_count"
@@ -1310,6 +1310,9 @@ class GGUFWriter:
    def add_audio_max_pos_emb(self, value: int) -> None:
        self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)

+    def add_audio_feature_layers(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.ClipAudio.FEATURE_LAYERS, layers)
+
    def add_audio_projector_window_size(self, value: int) -> None:
        self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)

@@ -42,6 +42,7 @@
 #define KEY_N_HEAD              "clip.%s.attention.head_count"
 #define KEY_N_HEAD_KV           "clip.%s.attention.head_count_kv"
 #define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
+#define KEY_FEATURE_LAYERS      "clip.%s.feature_layer"

 // vision-specific
 #define KEY_VISION_PROJ_TYPE        "clip.vision.projector_type" // for models with mixed modalities
@@ -54,7 +55,6 @@
 #define KEY_PATCH_SIZE              "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN              "clip.vision.image_mean"
 #define KEY_IMAGE_STD               "clip.vision.image_std"
-#define KEY_FEATURE_LAYER           "clip.vision.feature_layer"
 #define KEY_PROJ_SCALE_FACTOR       "clip.vision.projector.scale_factor"
 #define KEY_PROJ_SAMPLE_QUERY_SIDE  "clip.vision.projector.query_side"
 #define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side"
@@ -91,7 +91,7 @@ struct clip_hparams {

    float eps = 1e-6;
    float rope_theta = 0.0;
-    std::vector<int32_t> vision_feature_layer;
+    std::vector<int32_t> feature_layers;
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;
    std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
@@ -165,8 +165,8 @@ struct clip_hparams {
        return false;
    }

-    bool is_vision_feature_layer(int32_t layer) const {
-        return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end();
+    bool is_feature_layer(int32_t layer) const {
+        return std::find(feature_layers.begin(), feature_layers.end(), layer) != feature_layers.end();
    }
 };

@@ -1264,12 +1264,10 @@ struct clip_model_loader {
                }
            }

-            // Load the vision feature layer indices if they are explicitly provided;
-            // if multiple vision feature layers are present, the values will be concatenated
-            // to form the final visual features.
+            // Load the vision/audio feature layer indices if they are explicitly provided
            // NOTE: gguf conversions should standardize the values of the vision feature layer to
            // be non-negative, since we use -1 to mark values as unset here.
-            get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false);
+            get_arr_int(string_format(KEY_FEATURE_LAYERS, prefix), hparams.feature_layers, false);

            // model-specific params
            switch (model.proj_type) {
@@ -1651,6 +1649,7 @@ struct clip_model_loader {
                        get_u32(KEY_A_PROJ_WINDOW_SIZE,     hparams.audio_proj_window_size);
                        get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
                        get_u32(KEY_A_PROJ_HEAD_COUNT,      hparams.audio_proj_head_count);
+                        // NOTE: feature layers loaded above in common path
                    } break;
                case PROJECTOR_TYPE_JANUS_PRO:
                    {
@@ -1663,11 +1662,11 @@ struct clip_model_loader {
                        hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
                        hparams.image_resize_pad = PAD_CEIL;

-                        get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer);
+                        // NOTE: feature_layers loaded in common path as optional
                        get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets);
-                        if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) {
-                            throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d",
-                                                                   hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size()));
+                        if (hparams.feature_layers.size() != hparams.proj_spatial_offsets.size()) {
+                            throw std::runtime_error(string_format("%s: feature_layers.size() %d != proj_spatial_offsets.size() %d",
+                                                                   hparams.feature_layers.size(), hparams.proj_spatial_offsets.size()));
                        }

                        get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE,  hparams.downsample_query_side);
@@ -2740,7 +2739,7 @@ struct clip_model_loader {
                    model.image_newline = get_tensor(TN_IMAGE_NEWLINE);

                    // Load separate layerwise and spatial projector tensors
-                    const auto projector_count = hparams.vision_feature_layer.size();
+                    const auto projector_count = hparams.feature_layers.size();
                    model.qf_proj_blocks.resize(projector_count);
                    for (size_t bid = 0; bid < projector_count; ++bid) {
                        auto & b = model.qf_proj_blocks[bid];
@@ -4388,7 +4387,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32

                // Stage 1b only uses block 0's permutations; future stages
                // will upload all blocks.
-                for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) {
+                for (size_t bid = 0; bid < hparams.feature_layers.size(); ++bid) {
                    const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
                    upload(prefix + "win_idx",     make_win_idx(image_side, window_side));
                    upload(prefix + "qwin_idx",    make_win_idx(new_side, query_side));
@@ -1,5 +1,7 @@
 #include "models.h"

+#include <algorithm>
+
 ggml_cgraph * clip_graph_granite_speech::build() {
    const int n_frames     = img.nx();
    const int context_size = hparams.audio_chunk_size;
@@ -11,6 +13,10 @@ ggml_cgraph * clip_graph_granite_speech::build() {
    const int padded_len   = num_blocks * context_size;
    const int remainder    = n_frames % context_size;

+    // Calculate projector input dimension based on feature layers
+    const int proj_input_dim = n_embd * (hparams.feature_layers.size() + 1);
+    const bool use_feature_concat = !hparams.feature_layers.empty();
+
    ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size);
    ggml_set_name(attn_dists, "attn_dists");
    ggml_set_input(attn_dists);
@@ -31,6 +37,15 @@ ggml_cgraph * clip_graph_granite_speech::build() {
    cur = ggml_add(ctx0, cur, model.inp_proj_b);
    cb(cur, "inp_linear", -1);

+    // Capture layer 0 if requested (after input_linear)
+    ggml_tensor * concat_result = nullptr;
+    if (use_feature_concat) {
+        if (std::find(hparams.feature_layers.begin(), hparams.feature_layers.end(), 0) != hparams.feature_layers.end()) {
+            concat_result = cur;
+            cb(concat_result, "feature_layer_0", -1);
+        }
+    }
+
    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];
        auto * residual = cur;
@@ -168,6 +183,18 @@ ggml_cgraph * clip_graph_granite_speech::build() {
                         NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_out", il);

+        // Capture intermediate layer (il + 1) if requested
+        if (use_feature_concat) {
+            if (hparams.is_feature_layer(il + 1)) {
+                if (concat_result == nullptr) {
+                    concat_result = cur;
+                } else {
+                    concat_result = ggml_concat(ctx0, concat_result, cur, 0);
+                }
+                cb(concat_result, string_format("feature_layer_%d", il + 1).c_str(), il);
+            }
+        }
+
        // CTC branch
        if (il + 1 == ctc_layer) {
            auto * mid = build_mm(model.ctc_out_w, cur);
@@ -180,6 +207,13 @@ ggml_cgraph * clip_graph_granite_speech::build() {
        }
    }

+    // Append final output to concatenated features if using feature concatenation
+    if (use_feature_concat && concat_result != nullptr) {
+        concat_result = ggml_concat(ctx0, concat_result, cur, 0);
+        cb(concat_result, "concat_final", -1);
+        cur = concat_result;
+    }
+
    cb(cur, "encoder_out", -1);

    // QFormer projector
@@ -197,7 +231,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
            cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0);
        }

-        ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
+        ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, proj_input_dim, window_size, nblocks_proj);

        ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query,
            model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b,
@@ -304,14 +304,14 @@ ggml_cgraph * clip_graph_granite4_vision::build() {
    }

    // --- Stage 1b/1c: WindowQFormer blocks ---
-    const int projector_count = hparams.vision_feature_layer.size();
+    const int projector_count = hparams.feature_layers.size();
    const float qformer_eps = 1e-12f;

    ggml_tensor * mmproj = nullptr;
    for (int bid = 0; bid < projector_count; ++bid) {
        const auto & blk = model.qf_proj_blocks[bid];

-        int vlayer = hparams.vision_feature_layer[bid];
+        int vlayer = hparams.feature_layers[bid];
        GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
        ggml_tensor * h = layer_outs[vlayer];

@@ -21,7 +21,7 @@ ggml_cgraph * clip_graph_llava::build() {

        // If we set explicit vision feature layers, only go up to the deepest one
        // NOTE: only used by granite-vision models for now
-        for (const auto & feature_layer : hparams.vision_feature_layer) {
+        for (const auto & feature_layer : hparams.feature_layers) {
            if (feature_layer > deepest_feature_layer) {
                deepest_feature_layer = feature_layer;
            }
@@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() {

        // If this is an embedding feature layer, save the output.
        // NOTE: 0 index here refers to the input to the encoder.
-        if (hparams.is_vision_feature_layer(il)) {
+        if (hparams.is_feature_layer(il)) {
            embedding_stack.push_back(cur);
        }

@@ -134,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() {
    // process vision feature layers (used by granite)
    {
        // final layer is a vision feature layer
-        if (hparams.is_vision_feature_layer(max_feature_layer)) {
+        if (hparams.is_feature_layer(max_feature_layer)) {
            embedding_stack.push_back(inpL);
        }

@@ -224,7 +224,7 @@ void server_model_meta::update_caps() {
        });
        params.offline = true;
        // params.skip_download = true; // TODO: ideally, we should validate the model here, but it takes too much time
-        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
+        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
        if (params.mmproj.path.empty()) {
            multimodal = { false, false };
        } else {
@@ -1393,7 +1393,9 @@ struct server_download_state : public common_download_callback {

    bool run(common_params & params) {
        try {
-            common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, this);
+            common_params_handle_models_params p;
+            p.callback = this;
+            common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, p);
            is_ok = true;
        } catch (const std::exception & e) {
            auto model_name = params.model.get_name();
@@ -89,6 +89,17 @@ int llama_server(int argc, char ** argv) {
    llama_backend_init();
    llama_numa_init(params.numa);

+    // note: router mode also accepts -hf remote-preset, so we need to check that first
+    if (!params.model.hf_repo.empty()) {
+        try {
+            common_params_handle_models_params handle_params;
+            handle_params.preset_only = true;
+            common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, handle_params);
+        } catch (const std::exception & e) {
+            // ignored for now
+        }
+    }
+
    // router server never loads a model and must not touch the GPU
    const bool is_router_server = params.model.path.empty()
                               && params.model.hf_repo.empty();
@@ -263,7 +274,7 @@ int llama_server(int argc, char ** argv) {
        return child.run_download(params);
    } else if (!is_router_server) {
        // single-model mode (NOT spawned by router)
-        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER);
+        common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {});
    }

    //
@@ -256,6 +256,25 @@ def test_router_reload_models():
        os.remove(preset_path)


+def test_router_remote_preset():
+    global server
+    server.model_hf_repo = "ggml-org/test-preset-ci"
+    server.model_hf_file = None
+    server.offline = False
+    server.start()
+
+    # Should see preset models in GET /models
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    ids = {item["id"] for item in res.body.get("data", [])}
+    assert "tinygemma3-preset" in ids
+    assert "stories260K-test" in ids
+
+    # Should be able to load a preset model
+    model_id = "tinygemma3-preset"
+    _load_model_and_wait(model_id)
+
+
 MODEL_DOWNLOAD_ID = "ggml-org/test-model-router-download:F16"
 MODEL_DOWNLOAD_TIMEOUT = 30