From 15e5d401d18dae5968d98ef54241317d5b8bab33 Mon Sep 17 00:00:00 2001 From: Nikhil Jain Date: Mon, 8 Jun 2026 08:07:31 -0700 Subject: [PATCH] Handle buffer overlap / buffer aliasing for concat operator (llama/24000) * Only run webgpu CI on my fork * Add webgpu only workflow * handle buffer overlap case for concat operator * restore build-webgpu.yml Co-Authored-By: Claude Sonnet 4.6 * Run clang-format * Update ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl --------- Co-authored-by: Claude Sonnet 4.6 Co-authored-by: Reese Levine --- .../ggml-webgpu/ggml-webgpu-shader-lib.hpp | 17 ++++- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 75 ++++++++++++------- ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl | 20 ++++- 3 files changed, 79 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp index a5e7de785..c75a98a8d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp @@ -448,15 +448,19 @@ struct ggml_webgpu_upscale_pipeline_key_hash { /** Concat **/ struct ggml_webgpu_concat_pipeline_key { - int type; + int type; + bool src_overlap; - bool operator==(const ggml_webgpu_concat_pipeline_key & other) const { return type == other.type; } + bool operator==(const ggml_webgpu_concat_pipeline_key & other) const { + return type == other.type && src_overlap == other.src_overlap; + } }; struct ggml_webgpu_concat_pipeline_key_hash { size_t operator()(const ggml_webgpu_concat_pipeline_key & key) const { size_t seed = 0; ggml_webgpu_hash_combine(seed, key.type); + ggml_webgpu_hash_combine(seed, key.src_overlap); return seed; } }; @@ -2634,6 +2638,7 @@ class ggml_webgpu_shader_lib { webgpu_pipeline get_concat_pipeline(const ggml_webgpu_shader_lib_context & context) { ggml_webgpu_concat_pipeline_key key = {}; key.type = context.dst->type; + key.src_overlap = ggml_webgpu_tensor_overlap(context.src0, context.src1); auto it = concat_pipelines.find(key); if (it != concat_pipelines.end()) { @@ -2656,11 +2661,17 @@ class ggml_webgpu_shader_lib { GGML_ABORT("Unsupported type for concat shader"); } + if (key.src_overlap) { + defines.push_back("SRC_OVERLAP"); + variant += "_src_overlap"; + } + defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size)); auto processed = preprocessor.preprocess(wgsl_concat, defines); - auto decisions = std::make_shared(); + auto decisions = std::make_shared(); decisions->wg_size = context.max_wg_size; + decisions->src_overlap = key.src_overlap; webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant); pipeline.context = decisions; concat_pipelines[key] = pipeline; diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 94a108dfa..79d513802 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -2310,33 +2310,6 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx, uint32_t ne = (uint32_t) ggml_nelements(dst); uint32_t dim = (uint32_t) dst->op_params[0]; - std::vector params = { - ne, - (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), - (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), - (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), - (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)), - (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), - (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)), - (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)), - (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)), - (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), - (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), - (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), - (uint32_t) dst->ne[0], - (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], - (uint32_t) dst->ne[3], - dim, - (uint32_t) src0->ne[dim] - }; - - std::vector entries = { - ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0), - ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1), - ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst), - }; - ggml_webgpu_shader_lib_context shader_lib_ctx = {}; shader_lib_ctx.src0 = src0; shader_lib_ctx.src1 = src1; @@ -2344,8 +2317,52 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx, shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup; webgpu_pipeline pipeline = ctx->shader_lib->get_concat_pipeline(shader_lib_ctx); - auto * decisions = static_cast(pipeline.context.get()); - uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); + auto * decisions = static_cast(pipeline.context.get()); + + uint32_t offset_src0 = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)); + uint32_t offset_src1 = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)); + size_t merged_offset = 0; + size_t merged_size = 0; + if (decisions->src_overlap) { + const ggml_webgpu_merged_binding_range merged_range = + ggml_webgpu_tensor_merged_binding_range(ctx, { src0, src1 }); + merged_offset = merged_range.offset; + merged_size = merged_range.size; + offset_src0 = ggml_webgpu_tensor_merged_element_offset(src0, merged_range); + offset_src1 = ggml_webgpu_tensor_merged_element_offset(src1, merged_range); + } + + std::vector params = { ne, + offset_src0, + offset_src1, + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), + (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)), + (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), + (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)), + (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)), + (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)), + (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), + (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), + (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), + (uint32_t) dst->ne[0], + (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3], + dim, + (uint32_t) src0->ne[dim] }; + + std::vector entries = {}; + if (decisions->src_overlap) { + entries.push_back( + ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(src0), merged_offset, merged_size)); + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst)); + } else { + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0)); + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1)); + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst)); + } + + uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl index a22d245d2..eb901bf05 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl @@ -31,6 +31,16 @@ struct Params { #define DataType i32 #endif +#ifdef SRC_OVERLAP +@group(0) @binding(0) +var merged_src: array; + +@group(0) @binding(1) +var dst: array; + +@group(0) @binding(2) +var params: Params; +#else @group(0) @binding(0) var src0: array; @@ -42,7 +52,7 @@ var dst: array; @group(0) @binding(3) var params: Params; - +#endif @compute @workgroup_size(WG_SIZE) fn main(@builtin(global_invocation_id) gid: vec3) { @@ -62,14 +72,22 @@ fn main(@builtin(global_invocation_id) gid: vec3) { ni[1] * params.stride_src0_1 + ni[2] * params.stride_src0_2 + ni[3] * params.stride_src0_3; +#ifdef SRC_OVERLAP + dst[params.offset_dst + gid.x] = merged_src[params.offset_src0 + src_i]; +#else dst[params.offset_dst + gid.x] = src0[params.offset_src0 + src_i]; +#endif } else { ni[params.dim] -= params.src0_nedim; let src_i = ni[0] * params.stride_src1_0 + ni[1] * params.stride_src1_1 + ni[2] * params.stride_src1_2 + ni[3] * params.stride_src1_3; +#ifdef SRC_OVERLAP + dst[params.offset_dst + gid.x] = merged_src[params.offset_src1 + src_i]; +#else dst[params.offset_dst + gid.x] = src1[params.offset_src1 + src_i]; +#endif } } }