rpc: fix register position (#11424 )

Signed-off-by: thxCode <thxcode0824@gmail.com>
readme : update hot topics
2026-06-30 17:47:40 +02:00 · 2025-01-26 16:20:34 +01:00 · 2025-01-26 14:30:15 +02:00 · 2025-01-26 03:10:03 +01:00 · 2025-01-25 22:29:57 +01:00 · 2025-01-25 21:01:12 +01:00
11 changed files with 94 additions and 45 deletions
@@ -2,6 +2,10 @@ ARG UBUNTU_VERSION=22.04

 FROM ubuntu:$UBUNTU_VERSION AS build

+ARG TARGETARCH
+
+ARG GGML_CPU_ARM_ARCH=armv8-a
+
 RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

@@ -9,7 +13,14 @@ WORKDIR /app

 COPY . .

-RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
+RUN if [ "$TARGETARCH" = "amd64" ]; then \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
+    elif [ "$TARGETARCH" = "arm64" ]; then \
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
+    else \
+        echo "Unsupported architecture"; \
+        exit 1; \
+    fi && \
    cmake --build build -j $(nproc)

 RUN mkdir -p /app/lib && \
@@ -50,6 +50,8 @@ endif()
 if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

 #
@@ -16,6 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

+- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427
 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
@@ -1427,16 +1427,16 @@ struct server_queue {
    int post(server_task task, bool front = false) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        GGML_ASSERT(task.id != -1);
+        // if this is cancel task make sure to clean up pending tasks
+        if (task.type == SERVER_TASK_TYPE_CANCEL) {
+            cleanup_pending_task(task.id_target);
+        }
        QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
        if (front) {
            queue_tasks.push_front(std::move(task));
        } else {
            queue_tasks.push_back(std::move(task));
        }
-        // if this is cancel task make sure to clean up pending tasks
-        if (task.type == SERVER_TASK_TYPE_CANCEL) {
-            cleanup_pending_task(task.id_target);
-        }
        condition_tasks.notify_one();
        return task.id;
    }
@@ -1448,16 +1448,16 @@ struct server_queue {
            if (task.id == -1) {
                task.id = id++;
            }
+            // if this is cancel task make sure to clean up pending tasks
+            if (task.type == SERVER_TASK_TYPE_CANCEL) {
+                cleanup_pending_task(task.id_target);
+            }
            QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
            if (front) {
                queue_tasks.push_front(std::move(task));
            } else {
                queue_tasks.push_back(std::move(task));
            }
-            // if this is cancel task make sure to clean up pending tasks
-            if (task.type == SERVER_TASK_TYPE_CANCEL) {
-                cleanup_pending_task(task.id_target);
-            }
        }
        condition_tasks.notify_one();
        return 0;
@@ -1554,10 +1554,10 @@ struct server_queue {
    }

 private:
-    void cleanup_pending_task(int id_task) {
+    void cleanup_pending_task(int id_target) {
        // no need lock because this is called exclusively by post()
-        auto rm_func = [id_task](const server_task & task) {
-            return task.id_target == id_task;
+        auto rm_func = [id_target](const server_task & task) {
+            return task.id_target == id_target;
        };
        queue_tasks.erase(
            std::remove_if(queue_tasks.begin(),          queue_tasks.end(),          rm_func),
@@ -155,6 +155,7 @@ option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp on

 option(GGML_HIP                             "ggml: use HIP"                                   OFF)
 option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
+option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
@@ -131,6 +131,10 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16

+#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
+#define GGML_USE_VMM
+#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
+
 #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
 #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
@@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
    for (int id = 0; id < info.device_count; ++id) {
        int device_vmm = 0;

-#if !defined(GGML_CUDA_NO_VMM)
+#if defined(GGML_USE_VMM)
        CUdevice device;
        CU_CHECK(cuDeviceGet(&device, id));
        CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
            alloc_prop.location.id = id;
            CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
        }
-#endif // !defined(GGML_CUDA_NO_VMM)
+#endif // defined(GGML_USE_VMM)
        info.devices[id].vmm = !!device_vmm;

        cudaDeviceProp prop;
@@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };

 // pool with virtual memory
-#if !defined(GGML_CUDA_NO_VMM)
+#if defined(GGML_USE_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -408,14 +408,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
    }
 };
-#endif // !defined(GGML_CUDA_NO_VMM)
+#endif // defined(GGML_USE_VMM)

 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_CUDA_NO_VMM)
+#if defined(GGML_USE_VMM)
    if (ggml_cuda_info().devices[device].vmm) {
        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
    }
-#endif // !defined(GGML_CUDA_NO_VMM)
+#endif // defined(GGML_USE_VMM)
    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }

@@ -3250,7 +3250,7 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
        features.push_back({ "FORCE_CUBLAS", "1" });
    #endif

-    #ifdef GGML_CUDA_NO_VMM
+    #ifndef GGML_USE_VMM
        features.push_back({ "NO_VMM", "1" });
    #endif

@@ -96,8 +96,8 @@ if (GGML_HIP_GRAPHS)
    add_compile_definitions(GGML_HIP_GRAPHS)
 endif()

-if (GGML_CUDA_NO_VMM)
-    add_compile_definitions(GGML_CUDA_NO_VMM)
+if (GGML_HIP_NO_VMM)
+    add_compile_definitions(GGML_HIP_NO_VMM)
 endif()

 if (CXX_IS_HIPCC)
@@ -85,6 +85,10 @@ struct vk_pipeline_struct {
    uint32_t parameter_count;
    std::array<uint32_t, 3> wg_denoms;
    uint32_t align;
+    // set to true to request the pipeline is compiled after the dryrun
+    bool needed {};
+    // set to true when the shader has been compiled
+    bool compiled {};
 };

 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -186,8 +190,11 @@ struct vk_device_struct {
    bool mul_mat_id_m;
    bool mul_mat_id_s;

-    vk_matmul_pipeline pipeline_matmul_f32;
-    vk_matmul_pipeline pipeline_matmul_f32_f16;
+    // set to true to indicate that some shaders need to be compiled after the dryrun
+    bool need_compiles {};
+
+    vk_matmul_pipeline pipeline_matmul_f32 {};
+    vk_matmul_pipeline pipeline_matmul_f32_f16 {};
    vk_matmul_pipeline2 pipeline_matmul_f16;
    vk_matmul_pipeline2 pipeline_matmul_f16_f32;
    vk_pipeline pipeline_matmul_split_k_reduce;
@@ -195,7 +202,7 @@ struct vk_device_struct {
    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
    vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];

-    vk_matmul_pipeline pipeline_matmul_id_f32;
+    vk_matmul_pipeline pipeline_matmul_id_f32 {};
    vk_matmul_pipeline2 pipeline_matmul_id_f16;
    vk_matmul_pipeline2 pipeline_matmul_id_f16_f32;

@@ -776,13 +783,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
    GGML_ASSERT(parameter_count > 0);
    GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

-    pipeline = std::make_shared<vk_pipeline_struct>();
-    pipeline->name = name;
-    pipeline->parameter_count = parameter_count;
-    pipeline->push_constant_size = push_constant_size;
-    pipeline->wg_denoms = wg_denoms;
-    pipeline->align = align;
-
    vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
    pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);

@@ -865,6 +865,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
    }

    pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+    pipeline->compiled = true;

    {
        std::lock_guard<std::mutex> guard(device->mutex);
@@ -875,12 +876,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
        std::lock_guard<std::mutex> guard(compile_count_mutex);
        assert(compile_count > 0);
        compile_count--;
-
-        // "Progress bar" for shader compiles
-        static uint32_t total_compile_count = 0;
-        if ((total_compile_count++ % 10) == 0) {
-            std::cerr << ".";
-        }
    }
    compile_count_cond.notify_all();
 }
@@ -906,6 +901,10 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
 static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
    VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
    device->pipeline_descriptor_set_requirements[pipeline->name] += n;
+    if (!pipeline->compiled) {
+        pipeline->needed = true;
+        device->need_compiles = true;
+    }
 }

 static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
@@ -1388,8 +1387,6 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
 static void ggml_vk_load_shaders(vk_device& device) {
    VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");

-    std::cerr << "ggml_vulkan: Compiling shaders";
-
    // some shaders have a minimum subgroup size
    const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
    const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
@@ -1527,15 +1524,33 @@ static void ggml_vk_load_shaders(vk_device& device) {
        }
    }

-    device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
-    device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
-
-    device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    if (!device->pipeline_matmul_f32) {
+        device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    if (!device->pipeline_matmul_f32_f16) {
+        device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
+    }
+    if (!device->pipeline_matmul_id_f32) {
+        device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    }

    std::vector<std::future<void>> compiles;
    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
                                              uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
                                              uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
+
+        if (!pipeline) {
+            pipeline = std::make_shared<vk_pipeline_struct>();
+            pipeline->name = name;
+            pipeline->parameter_count = parameter_count;
+            pipeline->push_constant_size = push_constant_size;
+            pipeline->wg_denoms = wg_denoms;
+            pipeline->align = align;
+        }
+
+        if (!pipeline->needed || pipeline->compiled) {
+            return;
+        }
        {
            // wait until fewer than N compiles are in progress
            uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -2050,7 +2065,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    for (auto &c : compiles) {
        c.wait();
    }
-    std::cerr << "Done!" << std::endl;
+    device->need_compiles = false;
 }

 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
@@ -7656,6 +7671,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
    }
+    if (ctx->device->need_compiles) {
+        ggml_vk_load_shaders(ctx->device);
+    }
    ggml_vk_preallocate_buffers(ctx);
    ggml_pipeline_allocate_descriptor_sets(ctx->device);

@@ -1303,10 +1303,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
            return {cpu_dev, &pimpl->cpu_buft_list};
        }
        const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
        auto * dev = devices.at(layer_gpu);
+        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
        return {dev, &pimpl->gpu_buft_list.at(dev)};
    };

@@ -9405,6 +9405,7 @@ static struct llama_model * llama_model_load_from_file_impl(
            model->devices.push_back(*dev);
        }
    } else {
+        std::vector<ggml_backend_dev_t> rpc_servers;
        // use all available devices
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
@@ -9415,10 +9416,19 @@ static struct llama_model * llama_model_load_from_file_impl(
                    break;

                case GGML_BACKEND_DEVICE_TYPE_GPU:
-                    model->devices.push_back(dev);
+                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
+                        rpc_servers.push_back(dev);
+                    } else {
+                        model->devices.push_back(dev);
+                    }
                    break;
            }
        }
+        // add RPC servers at the front of the list
+        if (!rpc_servers.empty()) {
+            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+        }
    }

    // if using single GPU mode, remove all except the main GPU
Author	SHA1	Message	Date
Frank Mai	1d8ee06000	rpc: fix register position (#11424 ) Signed-off-by: thxCode <thxcode0824@gmail.com>	2025-01-26 16:20:34 +01:00
Georgi Gerganov	2cc9b8c32c	readme : update hot topics	2025-01-26 14:30:15 +02:00
Jeff Bolz	f35726c2fb	build: apply MSVC /bigobj option to c/cpp files only (#11423 )	2025-01-26 03:10:03 +01:00
Jeff Bolz	4a75d19376	vulkan: compile shaders on-demand (#11406 ) Reduce first-run startup time and memory consumption. Should fix #11339.	2025-01-25 22:29:57 +01:00
uvos	26771a1491	Hip: disable VMM on hip as it seams that it dosent work in some configurations (#11420 )	2025-01-25 21:01:12 +01:00
Jeff Bolz	ca6baf76c1	build: add /bigobj to MSVC build (#11407 )	2025-01-25 11:26:37 -06:00
Diego Devesa	6e264a905b	docker : add GGML_CPU_ARM_ARCH arg to select ARM architecture to build for (#11419 )	2025-01-25 17:22:41 +01:00
Xuan Son Nguyen	49b0e3cec4	server : fix cleaning up stream task (#11418 ) * server : fix cleaning up stream task * one more spot	2025-01-25 16:36:44 +01:00
Diego Devesa	20a758155b	docker : fix CPU ARM build (#11403 ) * docker : fix CPU ARM build * add CURL to other builds	2025-01-25 15:22:29 +01:00