From 5cd228494af3973294e90aad95b58c2ede400f43 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 12 May 2026 08:30:00 +0200
Subject: [PATCH] ggml-cuda : add ar_add() to avoid ambiguous operator+ for
 half/bfloat16 in CUDA 11.8

---
 ggml/src/ggml-cuda/allreduce.cu | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)
diff --git a/ggml/src/ggml-cuda/allreduce.cu b/ggml/src/ggml-cuda/allreduce.cu
index 434689abd..03d88968c 100644
--- a/ggml/src/ggml-cuda/allreduce.cu
+++ b/ggml/src/ggml-cuda/allreduce.cu
@@ -105,6 +105,20 @@ static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8;
 // blocks.  Tail elements (the leftover < ELEMS_PER_VEC at the end) are
 // handled only by block 0 to avoid cross-block writes to the same slots.
 // ---------------------------------------------------------------------------
+
+// CUDA 11.8 does not expose operator+ for half/bfloat16 below sm_530,
+// so use the explicit intrinsics to avoid ambiguous implicit conversions.
+template<typename T>
+static __device__ inline T ar_add(T a, T b) {
+    if constexpr (std::is_same_v<T, half>) {
+        return __hadd(a, b);
+    } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+        return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b));
+    } else {
+        return a + b;
+    }
+}
+
 template <typename T_dst, typename T_wire>
 static __global__ void ggml_cuda_ar_kernel(
         const T_dst  *              sendbuf,
@@ -184,13 +198,13 @@ static __global__ void ggml_cuda_ar_kernel(
             #pragma unroll
             for (int k = 0; k < ELEMS_PER_VEC; ++k) {
                 const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
-                recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
+                recvbuf[off + k] = ar_add(ggml_cuda_cast<T_dst>(d_low), ggml_cuda_cast<T_dst>(wire[k]));
             }
         }
         if (bid == 0 && tid < count - tail) {
             const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
             recvbuf[tail + tid] =
-                ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
+                ar_add(ggml_cuda_cast<T_dst>(d_low), ggml_cuda_cast<T_dst>(host_other[tail + tid]));
         }
     }
 }
@@ -210,7 +224,7 @@ static __global__ void ggml_cuda_ar_add_kernel(
     const int nt  = gridDim.x * blockDim.x;
     for (int i = tid; i < count; i += nt) {
         const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
-        dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
+        dst[i] = ar_add(ggml_cuda_cast<T_dst>(d_low), ggml_cuda_cast<T_dst>(src[i]));
     }
 }