rpc : code cleanup (#11107 )

Remove duplicated macros, use GGML_LOG_ERROR for errors
SYCL: Use get_multi_ptr instead of deprecated get_pointer in wkv6 (#11087 )
2026-07-01 10:07:44 +02:00 · 2025-01-07 08:37:02 +02:00 · 2025-01-07 14:26:07 +08:00 · 2025-01-06 23:45:28 +01:00 · 2025-01-06 17:52:35 +02:00 · 2025-01-06 16:34:49 +01:00
31 changed files with 153 additions and 136 deletions
@@ -65,12 +65,22 @@ body:
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
+  - type: textarea
+    id: command
+    attributes:
+      label: Compile command
+      description: >
+        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
-          Please copy and paste any relevant log output, including the command that you entered and any generated text.
+          Please copy and paste any relevant log output, including any generated text.
          This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
@@ -52,6 +52,16 @@ body:
        - Other (Please specify in the next section)
    validations:
      required: false
+  - type: textarea
+    id: command
+    attributes:
+      label: Command line
+      description: >
+        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: false
  - type: textarea
    id: info
    attributes:
@@ -74,7 +84,7 @@ body:
    attributes:
      label: Relevant log output
      description: >
-          If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
+          If applicable, please copy and paste any relevant log output, including any generated text.
          This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
@@ -1,5 +1,5 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs

 /ci/ @ggerganov
-/.devops/ @ngxson
+/.devops/*.Dockerfile @ngxson
 /examples/server/ @ngxson
@@ -846,7 +846,7 @@ struct common_init_result common_init_from_params(common_params & params) {
    } else if (!params.model_url.empty()) {
        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
    } else {
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
    }

    if (model == NULL) {
@@ -873,7 +873,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        }

        if (!ok) {
-            llama_free_model(model);
+            llama_model_free(model);

            return iparams;
        }
@@ -884,7 +884,7 @@ struct common_init_result common_init_from_params(common_params & params) {
    llama_context * lctx = llama_new_context_with_model(model, cparams);
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
+        llama_model_free(model);
        return iparams;
    }

@@ -900,7 +900,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        const auto cvec = common_control_vector_load(params.control_vectors);
        if (cvec.n_embd == -1) {
            llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);

            return iparams;
        }
@@ -913,7 +913,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                                             params.control_vector_layer_end);
        if (err) {
            llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);

            return iparams;
        }
@@ -926,7 +926,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        if (lora == nullptr) {
            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
            return iparams;
        }

@@ -1411,7 +1411,7 @@ struct llama_model * common_load_model_from_url(
        }
    }

-    return llama_load_model_from_file(local_path.c_str(), params);
+    return llama_model_load_from_file(local_path.c_str(), params);
 }

 struct llama_model * common_load_model_from_hf(
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
    llama_batch_free(batch);

    llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

    llama_backend_free();

@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
        LOG_ERR("%s: error: unable to load model\n" , __func__);
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {

    llama_sampler_free(smpl);
    llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

    llama_backend_free();

@@ -165,7 +165,7 @@ int main(int argc, char * argv[]) {

    llama_backend_init();

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);

    // create generation context
    llama_context * ctx = llama_new_context_with_model(model, cparams);
@@ -219,7 +219,7 @@ int main(int argc, char * argv[]) {

    llama_sampler_free(smpl);
    llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
    llama_backend_free();

    return 0;
@@ -1526,10 +1526,10 @@ int main(int argc, char ** argv) {
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
-                llama_free_model(lmodel);
+                llama_model_free(lmodel);
            }

-            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
            if (lmodel == NULL) {
                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                return 1;
@@ -1540,7 +1540,7 @@ int main(int argc, char ** argv) {
        llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
-            llama_free_model(lmodel);
+            llama_model_free(lmodel);
            return 1;
        }

@@ -1626,7 +1626,7 @@ int main(int argc, char ** argv) {
        ggml_threadpool_free_fn(threadpool);
    }

-    llama_free_model(lmodel);
+    llama_model_free(lmodel);

    if (p) {
        p->print_footer();
@@ -221,7 +221,7 @@ static struct llama_model * llava_init(common_params * params) {

    llama_model_params model_params = common_model_params_to_llama(*params);

-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
        return NULL;
@@ -265,7 +265,7 @@ static void llava_free(struct llava_context * ctx_llava) {
    }

    llama_free(ctx_llava->ctx_llama);
-    llama_free_model(ctx_llava->model);
+    llama_model_free(ctx_llava->model);
    llama_backend_free();
 }

@@ -323,7 +323,7 @@ int main(int argc, char ** argv) {
        }
    }

-    llama_free_model(model);
+    llama_model_free(model);

    return 0;
 }
@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {

    llama_model_params model_params = common_model_params_to_llama(*params);

-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
        return NULL;
@@ -75,7 +75,7 @@ static void llava_free(struct llava_context * ctx_llava) {
    }

    llama_free(ctx_llava->ctx_llama);
-    llama_free_model(ctx_llava->model);
+    llama_model_free(ctx_llava->model);
    llama_backend_free();
 }

@@ -310,7 +310,7 @@ static struct llama_model * llava_init(common_params * params) {

    llama_model_params model_params = common_model_params_to_llama(*params);

-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
        return NULL;
@@ -354,7 +354,7 @@ static void llava_free(struct llava_context * ctx_llava) {
    }

    llama_free(ctx_llava->ctx_llama);
-    llama_free_model(ctx_llava->model);
+    llama_model_free(ctx_llava->model);
    llama_backend_free();
 }

@@ -575,7 +575,7 @@ int main(int argc, char ** argv) {
        }
    }

-    llama_free_model(model);
+    llama_model_free(model);

    return 0;
 }
@@ -63,7 +63,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
@@ -266,7 +266,7 @@ int main(int argc, char ** argv) {
    llama_batch_free(batch);

    llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

    llama_backend_free();

@@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
        auto mparams = llama_model_default_params();
        mparams.use_mlock  = false;

-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
@@ -323,7 +323,7 @@ int main(int argc, char ** argv) {

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
-            llama_free_model(model);
+            llama_model_free(model);
            return 1;
        }
    }
@@ -347,7 +347,7 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
            llama_free(ctx);
-            llama_free_model(model);
+            llama_model_free(model);
            return 1;
        }
        included_layers++;
@@ -409,7 +409,7 @@ int main(int argc, char ** argv) {


    llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();
@@ -83,6 +83,7 @@ class Opt {
        }

        ctx_params.n_batch        = context_size >= 0 ? context_size : context_size_default;
+        ctx_params.n_ctx          = ctx_params.n_batch;
        model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
        temperature               = temperature >= 0 ? temperature : temperature_default;

@@ -664,7 +665,7 @@ class LlamaData {
            "\r%*s"
            "\rLoading model",
            get_terminal_width(), " ");
-        llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params));
+        llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
        if (!model) {
            printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
        }
@@ -3797,7 +3797,7 @@ int main(int argc, char ** argv) {
        data["input_extra"] = input_extra; // default to empty array if it's not exist

        std::string prompt = json_value(data, "prompt", std::string());
-        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true);
        SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
        data["prompt"] = format_infill(
            ctx_server.ctx,
@@ -18,7 +18,7 @@ def test_infill_without_input_extra():
        "input_suffix": "}\n",
    })
    assert res.status_code == 200
-    assert match_regex("(Ann|small|shiny)+", res.body["content"])
+    assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])


 def test_infill_with_input_extra():
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

-    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
    if (!model) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
    }
    llama_sampler_free(smpl);
    llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

    return 0;
 }
@@ -83,7 +83,7 @@ int main(int argc, char ** argv) {
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

-    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {

    llama_sampler_free(smpl);
    llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

    return 0;
 }
@@ -338,7 +338,7 @@ int main(int raw_argc, char ** raw_argv) {

    llama_model_params model_params = llama_model_default_params();
    model_params.vocab_only = true;
-    llama_model * model = llama_load_model_from_file(model_path, model_params);
+    llama_model * model = llama_model_load_from_file(model_path, model_params);
    if (!model) {
        fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
        return 1;
@@ -408,7 +408,7 @@ int main(int raw_argc, char ** raw_argv) {
    }
    // silence valgrind
    llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

    return 0;
 }
@@ -27,15 +27,6 @@
 #endif
 #include <cstring>

-#define UNUSED GGML_UNUSED
-
-#define GGML_DEBUG 0
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
 #ifdef _WIN32
 typedef SOCKET sockfd_t;
 using ssize_t = __int64;
@@ -411,7 +402,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
        initialized = true;
    }
 #else
-    UNUSED(initialized);
+    GGML_UNUSED(initialized);
 #endif
    auto sock = socket_connect(host.c_str(), port);
    if (sock == nullptr) {
@@ -640,7 +631,7 @@ static void ggml_backend_rpc_free(ggml_backend_t backend) {
 }

 static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
+    GGML_UNUSED(backend);
    // this is no-op because we don't have any async operations
 }

@@ -850,7 +841,7 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
        buffers.insert(buffer);
    } else {
-        GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
+        GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
    }
 }

@@ -872,7 +863,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
    if (buffers.find(buffer) == buffers.end()) {
-        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
        return false;
    }
    void * base = ggml_backend_buffer_get_base(buffer);
@@ -884,7 +875,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
    if (buffers.find(buffer) == buffers.end()) {
-        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
        return false;
    }
    ggml_backend_buffer_free(buffer);
@@ -896,7 +887,7 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
    GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
    ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
    if (buffers.find(buffer) == buffers.end()) {
-        GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
+        GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
        return false;
    }
    ggml_backend_buffer_clear(buffer, request.value);
@@ -952,7 +943,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
    struct ggml_context * ctx = ggml_init(params);
    ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
    if (tensor == nullptr) {
-        GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
        ggml_free(ctx);
        return false;
    }
@@ -1017,7 +1008,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
    struct ggml_context * ctx = ggml_init(params);
    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
    if (tensor == nullptr) {
-        GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
+        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
        ggml_free(ctx);
        return false;
    }
@@ -1051,7 +1042,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
    ggml_tensor * src = deserialize_tensor(ctx, &request.src);
    ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
    if (src == nullptr || dst == nullptr) {
-        GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
+        GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
        ggml_free(ctx);
        return false;
    }
@@ -1385,14 +1376,14 @@ static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t *

    ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);

-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }

 static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
    // TODO: obtain value from the server
    return GGML_BACKEND_DEVICE_TYPE_GPU;

-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }

 static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
@@ -1413,7 +1404,7 @@ static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const

    return ggml_backend_rpc_init(ctx->endpoint.c_str());

-    UNUSED(params);
+    GGML_UNUSED(params);
 }

 static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
@@ -1421,12 +1412,12 @@ static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_b

    return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());

-    UNUSED(dev);
+    GGML_UNUSED(dev);
 }

 static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-    UNUSED(dev);
-    UNUSED(op);
+    GGML_UNUSED(dev);
+    GGML_UNUSED(op);
    //TODO: call the remote backend and cache the results
    return true;
 }
@@ -1463,20 +1454,20 @@ static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
 static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
    return "RPC";

-    UNUSED(reg);
+    GGML_UNUSED(reg);
 }

 static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
    return 0;

-    UNUSED(reg);
+    GGML_UNUSED(reg);
 }

 static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");

-    UNUSED(reg);
-    UNUSED(index);
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
 }

 static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
@@ -1485,7 +1476,7 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
    }
    return NULL;

-    UNUSED(reg);
+    GGML_UNUSED(reg);
 }

 static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
@@ -131,7 +131,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
            [=](sycl::nd_item<3> item_ct1) {
                rwkv_wkv_f32_kernel(
                    B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
-                    item_ct1, shared_mem_acc.get_pointer()
+                    item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
                );
            });
    });
@@ -9,7 +9,7 @@
 #include "llama.h"

 struct llama_model_deleter {
-    void operator()(llama_model * model) { llama_free_model(model); }
+    void operator()(llama_model * model) { llama_model_free(model); }
 };

 struct llama_context_deleter {
@@ -413,12 +413,19 @@ extern "C" {
    // Call once at the end of the program - currently only used for MPI
    LLAMA_API void llama_backend_free(void);

-    LLAMA_API struct llama_model * llama_load_model_from_file(
+    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+                             const char * path_model,
+              struct llama_model_params   params),
+            "use llama_model_load_from_file instead");
+
+    LLAMA_API struct llama_model * llama_model_load_from_file(
                             const char * path_model,
              struct llama_model_params   params);

-    // TODO: rename to llama_model_free
-    LLAMA_API void llama_free_model(struct llama_model * model);
+    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+            "use llama_model_free instead");
+
+    LLAMA_API void llama_model_free(struct llama_model * model);

    // TODO: rename to llama_init_from_model
    LLAMA_API struct llama_context * llama_new_context_with_model(
@@ -119,10 +119,10 @@ bool llama_kv_cache_init(

 struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
           struct llama_kv_cache & cache,
-       const struct llama_ubatch & batch) {
-    const uint32_t n_tokens = batch.n_tokens;
-    const uint32_t n_seqs   = batch.n_seqs;
-    const uint32_t n_seq_tokens = batch.n_seq_tokens;
+       const struct llama_ubatch & ubatch) {
+    const uint32_t n_tokens = ubatch.n_tokens;
+    const uint32_t n_seqs   = ubatch.n_seqs;
+    const uint32_t n_seq_tokens = ubatch.n_seq_tokens;

    if (cache.recurrent) {
        // For recurrent state architectures (like Mamba or RWKV),
@@ -130,16 +130,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
        // A slot should be always be contiguous.

        // can only process batches with an equal number of new tokens in each sequence
-        GGML_ASSERT(batch.equal_seqs);
+        GGML_ASSERT(ubatch.equal_seqs);

        int32_t min = cache.size - 1;
        int32_t max = 0;

        // everything should fit if all seq_ids are smaller than the max
        for (uint32_t s = 0; s < n_seqs; ++s) {
-            const uint32_t n_seq_id = batch.n_seq_id[s];
+            const uint32_t n_seq_id = ubatch.n_seq_id[s];
            for (uint32_t j = 0; j < n_seq_id; ++j) {
-                const llama_seq_id seq_id = batch.seq_id[s][j];
+                const llama_seq_id seq_id = ubatch.seq_id[s][j];

                if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
                    // too big seq_id
@@ -198,7 +198,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(

        // find usable cell range
        for (uint32_t s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = batch.seq_id[s][0];
+            const llama_seq_id seq_id = ubatch.seq_id[s][0];
            llama_kv_cell & seq_meta = cache.cells[seq_id];
            bool has_cell = false;
            if (seq_meta.tail >= 0) {
@@ -237,7 +237,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
        // gather and re-order
        for (uint32_t s = 0; s < n_seqs; ++s) {
            int32_t dst_id = s + min;
-            int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
+            int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
            if (dst_id != src_id) {
                llama_kv_cell & dst_cell = cache.cells[dst_id];
                llama_kv_cell & src_cell = cache.cells[src_id];
@@ -258,7 +258,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(

        // update the pos of the used seqs
        for (uint32_t s = 0; s < n_seqs; ++s) {
-            const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
+            const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
            int32_t cell_id = s + min;
            llama_kv_cell & cell = cache.cells[cell_id];

@@ -266,12 +266,12 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
                // What should happen when the pos backtracks or skips a value?
                // Clearing the state mid-batch would require special-casing which isn't done.
                LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
-                    __func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
+                    __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
            }
            cell.pos = last_pos;
            cell.seq_id.clear();
-            for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
-                const llama_seq_id seq_id = batch.seq_id[s][j];
+            for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
+                const llama_seq_id seq_id = ubatch.seq_id[s][j];
                cell.seq_id.insert(seq_id);
                cache.cells[seq_id].tail = cell_id;
            }
@@ -325,10 +325,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
    for (uint32_t s = 0; s < n_seqs; s++) {
        for (uint32_t i = 0; i < n_seq_tokens; ++i) {
            uint32_t k = s*n_seq_tokens + i;
-            cache.cells[cache.head + k].pos = batch.pos[k];
+            cache.cells[cache.head + k].pos = ubatch.pos[k];

-            for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
-                cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
+            for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
+                cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
            }
        }
    }
@@ -2009,6 +2009,10 @@ struct llama_model_params llama_model_default_params() {
 }

 void llama_free_model(struct llama_model * model) {
+    llama_model_free(model);
+}
+
+void llama_model_free(struct llama_model * model) {
    delete model;
 }

@@ -8,7 +8,6 @@
 #include "llama-kv-cache.h"
 #include "llama-model-loader.h"
 #include "llama-model.h"
-#include "llama-quant.h"

 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -18,12 +17,8 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
-#include <cctype>
 #include <cfloat>
-#include <cinttypes>
-#include <climits>
 #include <cmath>
-#include <cstdarg>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
@@ -31,10 +26,7 @@
 #include <ctime>
 #include <functional>
 #include <initializer_list>
-#include <locale>
 #include <map>
-#include <numeric>
-#include <type_traits>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -2540,21 +2532,21 @@ static struct ggml_tensor * llm_build_inp_embd(
        struct ggml_context * ctx,
       struct llama_context & lctx,
        const llama_hparams & hparams,
-         const llama_ubatch & batch,
+         const llama_ubatch & ubatch,
         struct ggml_tensor * tok_embd,
         const llm_build_cb & cb) {
    const int64_t n_embd = hparams.n_embd;

    struct ggml_tensor * inpL;

-    if (batch.token) {
-        lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
+    if (ubatch.token) {
+        lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens);
        cb(lctx.inp_tokens, "inp_tokens", -1);
        ggml_set_input(lctx.inp_tokens);

        inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
    } else {
-        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
        inpL = lctx.inp_embd;
        ggml_set_input(lctx.inp_embd);
    }
@@ -3149,7 +3141,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
 static struct ggml_tensor * llm_build_mamba(
        struct ggml_context * ctx,
       struct llama_context & lctx,
-         const llama_ubatch & batch,
+         const llama_ubatch & ubatch,
         struct ggml_cgraph * graph,
         struct ggml_tensor * cur,
         struct ggml_tensor * state_copy,
@@ -3165,17 +3157,17 @@ static struct ggml_tensor * llm_build_mamba(
    const int64_t d_inner = hparams.ssm_d_inner;
    const int64_t d_state = hparams.ssm_d_state;
    const int64_t dt_rank = hparams.ssm_dt_rank;
-    const int64_t n_seqs  = batch.n_seqs;
+    const int64_t n_seqs  = ubatch.n_seqs;
    // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
    const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
    // Use the same RMS norm as the final layer norm
    const float norm_rms_eps = hparams.f_norm_rms_eps;

-    const int64_t n_seq_tokens = batch.n_seq_tokens;
+    const int64_t n_seq_tokens = ubatch.n_seq_tokens;

    GGML_ASSERT(n_seqs != 0);
-    GGML_ASSERT(batch.equal_seqs);
-    GGML_ASSERT(batch.n_tokens == n_seq_tokens * n_seqs);
+    GGML_ASSERT(ubatch.equal_seqs);
+    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

    struct ggml_tensor * conv_states_all = kv.k_l[il];
    struct ggml_tensor * ssm_states_all  = kv.v_l[il];
@@ -11519,13 +11511,7 @@ int32_t llama_lora_adapter_set(
            struct llama_context * ctx,
            struct llama_lora_adapter * adapter,
            float scale) {
-    if (ctx->cparams.flash_attn) {
-        LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
-        return -1;
-    }
-
    ctx->lora_adapters[adapter] = scale;
-
    return 0;
 }

@@ -11656,6 +11642,12 @@ int64_t llama_time_us(void) {
 struct llama_model * llama_load_model_from_file(
        const char * path_model,
        struct llama_model_params params) {
+    return llama_model_load_from_file(path_model, params);
+}
+
+struct llama_model * llama_model_load_from_file(
+        const char * path_model,
+        struct llama_model_params params) {
    ggml_time_init();

    llama_model * model = new llama_model;
@@ -11694,7 +11686,7 @@ struct llama_model * llama_load_model_from_file(
        ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
        if (!rpc_reg) {
            LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
-            llama_free_model(model);
+            llama_model_free(model);
            return nullptr;
        }

@@ -11702,7 +11694,7 @@ struct llama_model * llama_load_model_from_file(
        ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
        if (!ggml_backend_rpc_add_device_fn) {
            LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
-            llama_free_model(model);
+            llama_model_free(model);
            return nullptr;
        }

@@ -11712,7 +11704,7 @@ struct llama_model * llama_load_model_from_file(
                model->devices.push_back(dev);
            } else {
                LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
-                llama_free_model(model);
+                llama_model_free(model);
                return nullptr;
            }
        }
@@ -11744,7 +11736,7 @@ struct llama_model * llama_load_model_from_file(
    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
-            llama_free_model(model);
+            llama_model_free(model);
            return nullptr;
        }
        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
@@ -11767,7 +11759,7 @@ struct llama_model * llama_load_model_from_file(
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
        }

-        llama_free_model(model);
+        llama_model_free(model);
        return nullptr;
    }

@@ -12434,16 +12426,16 @@ int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix,
    return 0;
 }

-int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
+int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
    std::string str_split_path(split_path);
    char postfix[32];
    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
    std::string str_postfix(postfix);

-    // check if dest ends with postfix
+    // check if split_prefix ends with postfix
    int size_prefix = str_split_path.size() - str_postfix.size();
    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
-        snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
+        snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
        return size_prefix;
    }

@@ -12452,6 +12444,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int

 const char * llama_print_system_info(void) {
    static std::string s;
+    s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
+

    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        auto * reg = ggml_backend_reg_get(i);
@@ -13,10 +13,10 @@ int main(int argc, char ** argv) {

    std::thread([&model_path]() {
        llama_backend_init();
-        auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
+        auto * model = llama_model_load_from_file(model_path, llama_model_default_params());
        auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
        llama_free(ctx);
-        llama_free_model(model);
+        llama_model_free(model);
        llama_backend_free();
    }).join();

@@ -21,7 +21,7 @@ int main(int argc, char *argv[] ) {
        (void) ctx;
        return progress > 0.50;
    };
-    auto * model = llama_load_model_from_file(model_path, params);
+    auto * model = llama_model_load_from_file(model_path, params);
    llama_backend_free();
    return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
 }
@@ -152,7 +152,7 @@ int main(int argc, char **argv) {

        mparams.vocab_only = true;

-        model = llama_load_model_from_file(fname.c_str(), mparams);
+        model = llama_model_load_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
@@ -165,7 +165,7 @@ int main(int argc, char **argv) {

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
+            llama_model_free(model);
            return 1;
        }
    }
@@ -300,7 +300,7 @@ int main(int argc, char **argv) {
        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
    }

-    llama_free_model(model);
+    llama_model_free(model);
    llama_free(ctx);

    llama_backend_free();
@@ -46,7 +46,7 @@ int main(int argc, char **argv) {

        mparams.vocab_only = true;

-        model = llama_load_model_from_file(fname.c_str(), mparams);
+        model = llama_model_load_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
@@ -59,7 +59,7 @@ int main(int argc, char **argv) {

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
+            llama_model_free(model);
            return 1;
        }
    }
@@ -143,7 +143,7 @@ int main(int argc, char **argv) {
        }
    }

-    llama_free_model(model);
+    llama_model_free(model);
    llama_free(ctx);

    llama_backend_free();
@@ -34,7 +34,7 @@ int main(int argc, char ** argv) {

        mparams.vocab_only = true;

-        model = llama_load_model_from_file(fname.c_str(), mparams);
+        model = llama_model_load_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
@@ -47,7 +47,7 @@ int main(int argc, char ** argv) {

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
+            llama_model_free(model);
            return 1;
        }
    }
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
        }
    }

-    llama_free_model(model);
+    llama_model_free(model);
    llama_free(ctx);

    llama_backend_free();
Author	SHA1	Message	Date
Radoslav Gerganov	a4dd490069	rpc : code cleanup (#11107 ) Remove duplicated macros, use GGML_LOG_ERROR for errors	2025-01-07 08:37:02 +02:00
Akarshan Biswas	c0d6f790d0	SYCL: Use get_multi_ptr instead of deprecated get_pointer in wkv6 (#11087 ) * SYCL: Use get_multi_ptr instead of deprecated get_pointer in wkv6 * Revert "SYCL: Use get_multi_ptr instead of deprecated get_pointer in wkv6" This reverts commit `f62dc45f31`. * Reland: Use get_multi_ptr instead of deprecated get_pointer in wkv6	2025-01-07 14:26:07 +08:00
Eric Curtin	dc7cef9f37	llama-run : fix context size (#11094 ) Set `n_ctx` equal to `n_batch` in `Opt` class. Now context size is a more reasonable 2048. Signed-off-by: Eric Curtin <ecurtin@redhat.com>	2025-01-06 23:45:28 +01:00
Georgi Gerganov	ecebbd292d	llama : remove unused headers (#11109 ) ggml-ci	2025-01-06 17:52:35 +02:00
Xuan Son Nguyen	96be8c3264	github : add cmd line field to bug report (#11090 ) * github : cmd line to bug report * codeowners : (@ngxson) only watch dockerfile * Apply suggestions from code review [no ci] Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * rm cmd in log output [no ci] * rm 2 [no ci] * no need backticks [no ci] --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de>	2025-01-06 16:34:49 +01:00
Georgi Gerganov	e6e7c75d94	server : fix extra BOS in infill endpoint (#11106 ) * server : fix extra BOS in infill endpoing ggml-ci * server : update infill tests	2025-01-06 15:36:08 +02:00
Xuan Son Nguyen	09186fabbe	llama : remove check flash_attn with lora (#11104 )	2025-01-06 13:41:12 +01:00
Asghar Ghorbani	96a1dc27c3	llama : prevent system info string accumulation across calls (#11101 )	2025-01-06 13:21:46 +02:00
Daniel Bevenius	6369f867a4	llama : rename missed batch params/vars to ubatch (#10059 ) This commit renames the `batch` parameter to `ubatch` in the `llama_kv_cache_find_slot`, `llm_build_inp_embd`, and `llm_build_mamba` functions. The motivation for this is that this should have been done as part of Commit `19d900a756` ("llama : rename batch to ubatch (#9950)") but for some reason I missed these functions in that commit and only noticed them now (sorry).	2025-01-06 11:28:17 +02:00
Georgi Gerganov	47182dd03f	llama : update llama_model API names (#11063 ) * llama : deprecate llama_free_model, add llama_model_free ggml-ci * llama : change `llama_load_model_from_file` -> `llama_model_load_from_file` ggml-ci	2025-01-06 10:55:18 +02:00