From 8d55ccdb8cafe5a5e9b5b8ed5b6ce0e9a6a642af Mon Sep 17 00:00:00 2001
From: agray3 <agray3@users.noreply.github.com>
Date: Wed, 15 May 2024 14:44:49 +0100
Subject: [PATCH] Avoid unnecessarily disabling CUDA graphs (llama/7302)

As discussed in PR #6766, CUDA graphs were being disabled in the presence of long prompts.
This fixes the issue by preventing the consecutive update counter from incrementing unnecessarily
for tokens for which CUDA graphs are disabled due to batch size > 1.
---
 ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 75a2ad48..04b6e528 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2558,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (cuda_graph_update_required) {
+        if (use_cuda_graph && cuda_graph_update_required) {
             cuda_ctx->cuda_graph->number_consecutive_updates++;
         } else {
             cuda_ctx->cuda_graph->number_consecutive_updates = 0;