vulkan: record actual memory properties during buffer creation (#24326)

Revert "cuda: reset cuda context after reading memory size (#23935)" (#24715)
This reverts commit 0f7fada56b.
2026-06-17 19:17:37 +02:00 · 2026-06-17 11:14:48 +02:00 · 2026-06-17 10:59:35 +02:00
2 changed files with 10 additions and 67 deletions
@@ -622,18 +622,6 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {

 // cuda buffer

-struct ggml_backend_cuda_device_context {
-    int device;
-    std::string name;
-    std::string description;
-    std::string pci_bus_id;
-    int op_offload_min_batch_size;
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    std::mutex device_mutex;
-    int active_count = 0;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-};
-
 struct ggml_backend_cuda_buffer_context {
    int device;
    void * dev_ptr = nullptr;
@@ -651,13 +639,6 @@ struct ggml_backend_cuda_buffer_context {

 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count--;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    delete ctx;
 }

@@ -810,12 +791,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac

    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count++;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }

@@ -1515,12 +1490,6 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
 }

 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count--;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    CUDA_CHECK(cudaFreeHost(buffer->context));
 }

@@ -1529,8 +1498,6 @@ static void * ggml_cuda_host_malloc(size_t size) {
        return nullptr;
    }

-    ggml_cuda_set_device(0); // cudaMallocHost can create the implicit CUDA device context, make sure that this is consistently done on device 0.
-
    void * ptr = nullptr;
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
@@ -1556,12 +1523,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
    buffer->buft = buft;
    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count++;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    return buffer;
 }

@@ -3179,12 +3140,6 @@ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
 static void ggml_backend_cuda_free(ggml_backend_t backend) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) backend->device->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count--;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    delete cuda_ctx;
    delete backend;
 }
@@ -4916,6 +4871,14 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {

 // backend device

+struct ggml_backend_cuda_device_context {
+    int device;
+    std::string name;
+    std::string description;
+    std::string pci_bus_id;
+    int op_offload_min_batch_size;
+};
+
 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
    return ctx->name.c_str();
@@ -5004,11 +4967,6 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k

 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    std::lock_guard<std::mutex> lock(ctx->device_mutex);
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaMemGetInfo(free, total));

@@ -5035,13 +4993,6 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
    }
 #endif // defined(__linux__)

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    // If no backends or buffers are active, the cudaMemGetInfo call above lazily created a CUDA
-    // context that permanently consumes VRAM. Reset the device to free it.
-    if (ctx->active_count == 0) {
-        CUDA_CHECK(cudaDeviceReset());
-    }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 }

 static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
@@ -5745,21 +5696,13 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
        return nullptr;
    }

-    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device);
-
    ggml_backend_t cuda_backend = new ggml_backend {
        /* .guid    = */ ggml_backend_cuda_guid(),
        /* .iface   = */ ggml_backend_cuda_interface,
-        /* .device  = */ dev,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
        /* .context = */ ctx,
    };

-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
-    std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
-    dev_ctx->active_count++;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
-
    return cuda_backend;
 }

@@ -3008,13 +3008,13 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
            if (memory_type_indices.empty()) {
                continue;
            }
-            buf->memory_property_flags = req_flags;

            bool done = false;

            for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
                try {
                    buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
+                    buf->memory_property_flags = mem_props.memoryTypes[*mtype_it].propertyFlags;
                    done = true;
                    break;
                } catch (const vk::SystemError& e) {
Author	SHA1	Message	Date
Winston Ma	558e221b70	vulkan: record actual memory properties during buffer creation (#24326)	2026-06-17 11:14:48 +02:00
Ruben Ortlam	ea21e03955	Revert "cuda: reset cuda context after reading memory size (#23935)" (#24715) This reverts commit `0f7fada56b`.	2026-06-17 10:59:35 +02:00