From bc77933c2de8c5d104bfc234e3358217265fe147 Mon Sep 17 00:00:00 2001
From: Nikhil Jain <nikhil.jain0987@gmail.com>
Date: Mon, 25 May 2026 20:32:49 -0700
Subject: [PATCH] Check batch_compute_passes before sending passes when not
 doing GPU profiling (llama/23457)

* Only run webgpu CI on my fork

* Add webgpu only workflow

* refactor batch_compute_passes to a per-thread variable, and submit individual passes when it is set to false and no GPU profiling is enabled

* restore build.yml
---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 35 +++++++++++++++++-----------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 921c12b41..1561a4e30 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -259,6 +259,7 @@ struct webgpu_context_struct {
     wgpu::Buffer             set_rows_host_error_buf;
     wgpu::CommandEncoder     active_command_encoder;
     wgpu::ComputePassEncoder active_compute_pass;
+    bool                     batch_compute_passes = true;
 
     size_t memset_bytes_per_thread;
 
@@ -590,9 +591,18 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(webgpu_context &
     }
 #else
     for (size_t i = 0; i < dispatches.size(); i++) {
-        ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
-        ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
-        ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
+        if (ctx->batch_compute_passes) {
+            ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
+            ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
+            ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second,
+                                                        1);
+        } else {
+            wgpu::ComputePassEncoder pass = ctx->active_command_encoder.BeginComputePass();
+            pass.SetPipeline(dispatches[i].pipeline.pipeline);
+            pass.SetBindGroup(0, bind_groups[i]);
+            pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
+            pass.End();
+        }
     }
 #endif
 
@@ -1956,10 +1966,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
     std::vector<wgpu::BindGroupEntry> reduce_entries;
     if (use_vec_reduce) {
         const uint32_t reduce_sg_size = ctx->global_ctx->capabilities.max_subgroup_size;
-        const uint32_t reduce_wg_size =
-            std::max(reduce_sg_size, (uint32_t) std::min<uint64_t>(
-                                         (uint64_t) nwg * reduce_sg_size,
-                                         ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
+        const uint32_t reduce_wg_size = std::max(
+            reduce_sg_size,
+            (uint32_t) std::min<uint64_t>((uint64_t) nwg * reduce_sg_size,
+                                          ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
         ggml_webgpu_shader_lib_context reduce_shader_ctx = shader_lib_ctx;
         reduce_shader_ctx.max_wg_size                    = reduce_wg_size;
         reduce_pipeline = ctx->shader_lib->get_flash_attn_vec_reduce_pipeline(reduce_shader_ctx);
@@ -3110,18 +3120,16 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
     uint32_t num_batched_kernels  = 0;
     uint32_t num_inflight_batches = 0;
     bool     contains_set_rows    = false;
-    bool     batch_compute_passes = true;
     int      num_encoded_ops      = 1;
     int      node_idx             = 0;
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
     ctx->profile_timestamp_query_count = 0;
-    batch_compute_passes               = false;
     std::vector<std::string> profile_pipeline_names;
 #endif
 
     ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
-    if (batch_compute_passes) {
+    if (ctx->batch_compute_passes) {
         ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
     }
 
@@ -3148,7 +3156,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
 
             // reset state for next batch
             ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
-            if (batch_compute_passes) {
+            if (ctx->batch_compute_passes) {
                 ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
             }
             ctx->param_arena.reset();
@@ -3548,8 +3556,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
                         const uint32_t kv_tile = decisions.kv_tile;
 
                         const uint32_t vec_nwg_cap = ctx->webgpu_global_ctx->capabilities.min_subgroup_size;
-                        uint32_t       nwg     = 1u;
-                        const uint64_t kv_span = (uint64_t) std::max(1u, kv_tile);
+                        uint32_t       nwg         = 1u;
+                        const uint64_t kv_span     = (uint64_t) std::max(1u, kv_tile);
                         while ((2u * nwg * kv_span) < (uint64_t) K->ne[1] && nwg < vec_nwg_cap) {
                             nwg <<= 1;
                         }
@@ -3839,6 +3847,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
                               wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
+    webgpu_ctx->batch_compute_passes = false;
     ggml_webgpu_create_buffer(
         webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
         wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, "profile_timestamp_dev_buf");