From 15e5d401d18dae5968d98ef54241317d5b8bab33 Mon Sep 17 00:00:00 2001
From: Nikhil Jain <nikhil.jain0987@gmail.com>
Date: Mon, 8 Jun 2026 08:07:31 -0700
Subject: [PATCH] Handle buffer overlap / buffer aliasing for concat operator
 (llama/24000)

* Only run webgpu CI on my fork

* Add webgpu only workflow

* handle buffer overlap case for concat operator

* restore build-webgpu.yml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* Run clang-format

* Update ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-authored-by: Reese Levine <reeselevine1@gmail.com>
---
 .../ggml-webgpu/ggml-webgpu-shader-lib.hpp    | 17 ++++-
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          | 75 ++++++++++++-------
 ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl | 20 ++++-
 3 files changed, 79 insertions(+), 33 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
index a5e7de785..c75a98a8d 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -448,15 +448,19 @@ struct ggml_webgpu_upscale_pipeline_key_hash {
 /** Concat **/
 
 struct ggml_webgpu_concat_pipeline_key {
-    int type;
+    int  type;
+    bool src_overlap;
 
-    bool operator==(const ggml_webgpu_concat_pipeline_key & other) const { return type == other.type; }
+    bool operator==(const ggml_webgpu_concat_pipeline_key & other) const {
+        return type == other.type && src_overlap == other.src_overlap;
+    }
 };
 
 struct ggml_webgpu_concat_pipeline_key_hash {
     size_t operator()(const ggml_webgpu_concat_pipeline_key & key) const {
         size_t seed = 0;
         ggml_webgpu_hash_combine(seed, key.type);
+        ggml_webgpu_hash_combine(seed, key.src_overlap);
         return seed;
     }
 };
@@ -2634,6 +2638,7 @@ class ggml_webgpu_shader_lib {
     webgpu_pipeline get_concat_pipeline(const ggml_webgpu_shader_lib_context & context) {
         ggml_webgpu_concat_pipeline_key key = {};
         key.type                            = context.dst->type;
+        key.src_overlap                     = ggml_webgpu_tensor_overlap(context.src0, context.src1);
 
         auto it = concat_pipelines.find(key);
         if (it != concat_pipelines.end()) {
@@ -2656,11 +2661,17 @@ class ggml_webgpu_shader_lib {
                 GGML_ABORT("Unsupported type for concat shader");
         }
 
+        if (key.src_overlap) {
+            defines.push_back("SRC_OVERLAP");
+            variant += "_src_overlap";
+        }
+
         defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
 
         auto processed           = preprocessor.preprocess(wgsl_concat, defines);
-        auto decisions           = std::make_shared<ggml_webgpu_generic_shader_decisions>();
+        auto decisions           = std::make_shared<ggml_webgpu_binary_shader_decisions>();
         decisions->wg_size       = context.max_wg_size;
+        decisions->src_overlap   = key.src_overlap;
         webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
         pipeline.context         = decisions;
         concat_pipelines[key]    = pipeline;
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 94a108dfa..79d513802 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -2310,33 +2310,6 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx,
     uint32_t ne  = (uint32_t) ggml_nelements(dst);
     uint32_t dim = (uint32_t) dst->op_params[0];
 
-    std::vector<uint32_t> params = {
-        ne,
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
-        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
-        (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
-        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
-        (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
-        (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
-        (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
-        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
-        (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
-        (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
-        (uint32_t) dst->ne[0],
-        (uint32_t) dst->ne[1],
-        (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3],
-        dim,
-        (uint32_t) src0->ne[dim]
-    };
-
-    std::vector<wgpu::BindGroupEntry> entries = {
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0),
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1),
-        ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst),
-    };
-
     ggml_webgpu_shader_lib_context shader_lib_ctx = {};
     shader_lib_ctx.src0                           = src0;
     shader_lib_ctx.src1                           = src1;
@@ -2344,8 +2317,52 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx,
     shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
 
     webgpu_pipeline pipeline  = ctx->shader_lib->get_concat_pipeline(shader_lib_ctx);
-    auto *          decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
-    uint32_t        wg_x      = CEIL_DIV(ne, decisions->wg_size);
+    auto *          decisions = static_cast<ggml_webgpu_binary_shader_decisions *>(pipeline.context.get());
+
+    uint32_t offset_src0   = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type));
+    uint32_t offset_src1   = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type));
+    size_t   merged_offset = 0;
+    size_t   merged_size   = 0;
+    if (decisions->src_overlap) {
+        const ggml_webgpu_merged_binding_range merged_range =
+            ggml_webgpu_tensor_merged_binding_range(ctx, { src0, src1 });
+        merged_offset = merged_range.offset;
+        merged_size   = merged_range.size;
+        offset_src0   = ggml_webgpu_tensor_merged_element_offset(src0, merged_range);
+        offset_src1   = ggml_webgpu_tensor_merged_element_offset(src1, merged_range);
+    }
+
+    std::vector<uint32_t> params = { ne,
+                                     offset_src0,
+                                     offset_src1,
+                                     (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
+                                     (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
+                                     (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
+                                     (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
+                                     (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
+                                     (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
+                                     (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
+                                     (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
+                                     (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
+                                     (uint32_t) dst->ne[0],
+                                     (uint32_t) dst->ne[1],
+                                     (uint32_t) dst->ne[2],
+                                     (uint32_t) dst->ne[3],
+                                     dim,
+                                     (uint32_t) src0->ne[dim] };
+
+    std::vector<wgpu::BindGroupEntry> entries = {};
+    if (decisions->src_overlap) {
+        entries.push_back(
+            ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(src0), merged_offset, merged_size));
+        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst));
+    } else {
+        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0));
+        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1));
+        entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));
+    }
+
+    uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
     return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
 }
 
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl
index a22d245d2..eb901bf05 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl
@@ -31,6 +31,16 @@ struct Params {
 #define DataType i32
 #endif
 
+#ifdef SRC_OVERLAP
+@group(0) @binding(0)
+var<storage, read_write> merged_src: array<DataType>;
+
+@group(0) @binding(1)
+var<storage, read_write> dst: array<DataType>;
+
+@group(0) @binding(2)
+var<uniform> params: Params;
+#else
 @group(0) @binding(0)
 var<storage, read_write> src0: array<DataType>;
 
@@ -42,7 +52,7 @@ var<storage, read_write> dst: array<DataType>;
 
 @group(0) @binding(3)
 var<uniform> params: Params;
-
+#endif
 @compute @workgroup_size(WG_SIZE)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
 
@@ -62,14 +72,22 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
                              ni[1] * params.stride_src0_1 +
                              ni[2] * params.stride_src0_2 +
                              ni[3] * params.stride_src0_3;
+#ifdef SRC_OVERLAP
+            dst[params.offset_dst + gid.x] = merged_src[params.offset_src0 + src_i];
+#else
             dst[params.offset_dst + gid.x] = src0[params.offset_src0 + src_i];
+#endif
         } else {
             ni[params.dim] -= params.src0_nedim;
             let src_i = ni[0] * params.stride_src1_0 +
                              ni[1] * params.stride_src1_1 +
                              ni[2] * params.stride_src1_2 +
                              ni[3] * params.stride_src1_3;
+#ifdef SRC_OVERLAP
+            dst[params.offset_dst + gid.x] = merged_src[params.offset_src1 + src_i];
+#else
             dst[params.offset_dst + gid.x] = src1[params.offset_src1 + src_i];
+#endif
         }
     }
 }