CUDA: fix half2 -> half conversion for HIP (llama/15529)
This commit is contained in:
parent
2f6288c33c
commit
b0d15e1eb6
|
|
@@ -258,7 +258,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
|||
const half val = hexp(sink - kqmax[j0/nwarps]);
|
||||
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
|
||||
if (threadIdx.x == 0) {
|
||||
-kqsum[j0/nwarps].x = __hadd(kqsum[j0/nwarps].x, val);
|
||||
+kqsum[j0/nwarps].x = __hadd(__low2half(kqsum[j0/nwarps]), val);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
|
|
|
|||
Loading…
Reference in New Issue