metal : utilize max shared memory for mul_mat_id (#7935)

llama-bench : fix RPC indication (#7936)
Show "<backend_name>+RPC" when RPC offloading is used
2026-06-16 10:46:43 +02:00 · 2024-06-14 17:14:09 +03:00 · 2024-06-14 16:47:41 +03:00
2 changed files with 8 additions and 7 deletions
@@ -714,7 +714,6 @@ struct test {
    static const bool kompute;
    static const bool metal;
    static const bool sycl;
-    static const bool rpc;
    static const bool gpu_blas;
    static const bool blas;
    static const std::string cpu_info;
@@ -726,6 +725,7 @@ struct test {
    int n_batch;
    int n_ubatch;
    int n_threads;
+    bool has_rpc;
    ggml_type type_k;
    ggml_type type_v;
    int n_gpu_layers;
@@ -751,6 +751,7 @@ struct test {
        n_batch = inst.n_batch;
        n_ubatch = inst.n_ubatch;
        n_threads = inst.n_threads;
+        has_rpc = !inst.rpc_servers.empty();
        type_k = inst.type_k;
        type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
@@ -810,9 +811,6 @@ struct test {
        if (sycl) {
            return GGML_SYCL_NAME;
        }
-        if (rpc) {
-            return "RPC";
-        }
        if (gpu_blas) {
            return "GPU BLAS";
        }
@@ -882,7 +880,7 @@ struct test {
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
@@ -916,7 +914,6 @@ const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
 const bool        test::sycl         = !!ggml_cpu_has_sycl();
-const bool        test::rpc          = !!ggml_cpu_has_rpc();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();

@@ -1182,6 +1179,9 @@ struct markdown_printer : public printer {
                value = buf;
            } else if (field == "backend") {
                value = test::get_backend();
+                if (t.has_rpc) {
+                    value += "+RPC";
+                }
            } else if (field == "test") {
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
@@ -1862,9 +1862,10 @@ static enum ggml_status ggml_metal_graph_compute(
                        // ne21 = n_rows
                        const int dst_rows = ne20*ne21;
                        const int dst_rows_min = n_as;
+                        const int dst_rows_max = (ctx->device.maxThreadgroupMemoryLength - 32 - 8192)/4;

                        // max size of the rowids array in the kernel shared buffer
-                        GGML_ASSERT(dst_rows <= 2048);
+                        GGML_ASSERT(dst_rows <= dst_rows_max);

                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
Author	SHA1	Message	Date
Georgi Gerganov	66ef1ceedf	metal : utilize max shared memory for mul_mat_id (#7935)	2024-06-14 17:14:09 +03:00
Radoslav Gerganov	e65bbf606c	llama-bench : fix RPC indication (#7936) Show "<backend_name>+RPC" when RPC offloading is used	2024-06-14 16:47:41 +03:00