From 201f69ce6a8f5894b12467be5be6e35248c59387 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Wed, 17 Jun 2026 10:59:35 +0200 Subject: [PATCH] Revert "cuda: reset cuda context after reading memory size (llama/23935)" (llama/24715) This reverts commit 0f7fada56bea32c9e0db3874b801d887ff6c4529. --- ggml/src/ggml-cuda/ggml-cuda.cu | 75 ++++----------------------------- 1 file changed, 9 insertions(+), 66 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 01c29a8c6..34cdbc81c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -622,18 +622,6 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() { // cuda buffer -struct ggml_backend_cuda_device_context { - int device; - std::string name; - std::string description; - std::string pci_bus_id; - int op_offload_min_batch_size; -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - std::mutex device_mutex; - int active_count = 0; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) -}; - struct ggml_backend_cuda_buffer_context { int device; void * dev_ptr = nullptr; @@ -651,13 +639,6 @@ struct ggml_backend_cuda_buffer_context { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count--; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - delete ctx; } @@ -810,12 +791,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr); -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count++; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } @@ -1515,12 +1490,6 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) { } static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count--; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - CUDA_CHECK(cudaFreeHost(buffer->context)); } @@ -1529,8 +1498,6 @@ static void * ggml_cuda_host_malloc(size_t size) { return nullptr; } - ggml_cuda_set_device(0); // cudaMallocHost can create the implicit CUDA device context, make sure that this is consistently done on device 0. - void * ptr = nullptr; cudaError_t err = cudaMallocHost((void **) &ptr, size); if (err != cudaSuccess) { @@ -1556,12 +1523,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm buffer->buft = buft; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count++; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - return buffer; } @@ -3179,12 +3140,6 @@ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) { static void ggml_backend_cuda_free(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) backend->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count--; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - delete cuda_ctx; delete backend; } @@ -4916,6 +4871,14 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) { // backend device +struct ggml_backend_cuda_device_context { + int device; + std::string name; + std::string description; + std::string pci_bus_id; + int op_offload_min_batch_size; +}; + static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; return ctx->name.c_str(); @@ -5004,11 +4967,6 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; - -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - std::lock_guard lock(ctx->device_mutex); -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaMemGetInfo(free, total)); @@ -5035,13 +4993,6 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * } #endif // defined(__linux__) -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - // If no backends or buffers are active, the cudaMemGetInfo call above lazily created a CUDA - // context that permanently consumes VRAM. Reset the device to free it. - if (ctx->active_count == 0) { - CUDA_CHECK(cudaDeviceReset()); - } -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) } static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) { @@ -5745,21 +5696,13 @@ ggml_backend_t ggml_backend_cuda_init(int device) { return nullptr; } - ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device); - ggml_backend_t cuda_backend = new ggml_backend { /* .guid = */ ggml_backend_cuda_guid(), /* .iface = */ ggml_backend_cuda_interface, - /* .device = */ dev, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), /* .context = */ ctx, }; -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count++; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - return cuda_backend; }