From 918e0ad20954beaf4a57675749e0a54f12f4233b Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Fri, 17 Apr 2026 23:24:21 +0800
Subject: [PATCH] CUDA: use LRU based eviction for cuda graphs (llama/21611)

* CUDA: use a ring-buffer for cuda graphs

* bump limit to 128

* use LRU eviction

* better naming

* do periodic clean-up
---
 ggml/src/ggml-cuda/common.cuh | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 66ed02d2..ddf50baf 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1187,6 +1187,7 @@ struct ggml_cuda_graph {
     bool disable_due_to_gpu_arch = false;
     bool warmup_complete = false;
     uint64_t uid = 0;
+    int64_t last_used_time = 0;
     struct node_properties {
         ggml_tensor node;
         void * node_src_data_ptrs[GGML_MAX_SRC];
@@ -1368,12 +1369,28 @@ struct ggml_backend_cuda_context {
     // when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
     std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
+    int64_t last_graph_eviction_sweep = 0;
+
     ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
+        const int64_t time_now = ggml_time_us();
+
+        // sweep every 5s, evicting cuda graphs unused for >=10s
+        if (time_now - last_graph_eviction_sweep >= 5'000'000) {
+            last_graph_eviction_sweep = time_now;
+            for (auto it = cuda_graphs.begin(); it != cuda_graphs.end(); ) {
+                if (time_now - it->second->last_used_time >= 10'000'000) {
+                    it = cuda_graphs.erase(it);
+                } else {
+                    ++it;
+                }
+            }
+        }
+
         auto it = cuda_graphs.find(first_node_ptr);
         if (it == cuda_graphs.end()) {
-            cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
-            return cuda_graphs[first_node_ptr].get();
+            it = cuda_graphs.emplace(first_node_ptr, std::make_unique<ggml_cuda_graph>()).first;
         }
+        it->second->last_used_time = time_now;
         return it->second.get();
     }