From 5cd228494af3973294e90aad95b58c2ede400f43 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 12 May 2026 08:30:00 +0200 Subject: [PATCH] ggml-cuda : add ar_add() to avoid ambiguous operator+ for half/bfloat16 in CUDA 11.8 --- ggml/src/ggml-cuda/allreduce.cu | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/allreduce.cu b/ggml/src/ggml-cuda/allreduce.cu index 434689abd..03d88968c 100644 --- a/ggml/src/ggml-cuda/allreduce.cu +++ b/ggml/src/ggml-cuda/allreduce.cu @@ -105,6 +105,20 @@ static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8; // blocks. Tail elements (the leftover < ELEMS_PER_VEC at the end) are // handled only by block 0 to avoid cross-block writes to the same slots. // --------------------------------------------------------------------------- + +// CUDA 11.8 does not expose operator+ for half/bfloat16 below sm_530, +// so use the explicit intrinsics to avoid ambiguous implicit conversions. +template +static __device__ inline T ar_add(T a, T b) { + if constexpr (std::is_same_v) { + return __hadd(a, b); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b)); + } else { + return a + b; + } +} + template static __global__ void ggml_cuda_ar_kernel( const T_dst * sendbuf, @@ -184,13 +198,13 @@ static __global__ void ggml_cuda_ar_kernel( #pragma unroll for (int k = 0; k < ELEMS_PER_VEC; ++k) { const T_wire d_low = ggml_cuda_cast(sendbuf[off + k]); - recvbuf[off + k] = ggml_cuda_cast(d_low) + ggml_cuda_cast(wire[k]); + recvbuf[off + k] = ar_add(ggml_cuda_cast(d_low), ggml_cuda_cast(wire[k])); } } if (bid == 0 && tid < count - tail) { const T_wire d_low = ggml_cuda_cast(sendbuf[tail + tid]); recvbuf[tail + tid] = - ggml_cuda_cast(d_low) + ggml_cuda_cast(host_other[tail + tid]); + ar_add(ggml_cuda_cast(d_low), ggml_cuda_cast(host_other[tail + tid])); } } } @@ -210,7 +224,7 @@ static __global__ void ggml_cuda_ar_add_kernel( const int nt = gridDim.x * blockDim.x; for (int i = tid; i < count; i += nt) { const T_src d_low = ggml_cuda_cast(dst[i]); - dst[i] = ggml_cuda_cast(d_low) + ggml_cuda_cast(src[i]); + dst[i] = ar_add(ggml_cuda_cast(d_low), ggml_cuda_cast(src[i])); } }