From 36a83b84bb29651e9802b8d287d3941240fc860b Mon Sep 17 00:00:00 2001 From: leonardHONG <2695316095@qq.com> Date: Mon, 4 May 2026 22:24:05 +0800 Subject: [PATCH] CUDA: use fastdiv for batch index split in get_rows (llama/22650) --- ggml/src/ggml-cuda/getrows.cu | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index e99cba63d..36b840e81 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -6,17 +6,18 @@ template static __global__ void k_get_rows( const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst, const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/ - /*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/ + /*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/ /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) { + for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) { for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) { // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. const int i10 = blockIdx.x; - const int i11 = z / ne12; // TODO fastdiv - const int i12 = z % ne12; + const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv); + const int i11 = dm.x; + const int i12 = dm.y; const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; @@ -42,17 +43,18 @@ template static __global__ void k_get_rows_float( const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst, const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/ - /*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/ + /*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/ /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) { + for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) { for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) { // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. const int i10 = blockIdx.x; - const int i11 = z / ne12; // TODO fastdiv - const int i12 = z % ne12; + const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv); + const int i11 = dm.x; + const int i12 = dm.y; if (i00 >= ne00) { return; @@ -115,10 +117,14 @@ static void get_rows_cuda_q( GGML_ASSERT(ne00 % 2 == 0); + GGML_ASSERT(ne12 > 0); + GGML_ASSERT(ne11 <= std::numeric_limits::max() / ne12); + const uint3 ne12_fdv = init_fastdiv_values(ne12); + k_get_rows<<>>( src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ - /*ne10,*/ ne11, ne12, /*ne13,*/ + /*ne10,*/ ne11, ne12_fdv, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); @@ -146,10 +152,14 @@ static void get_rows_cuda_float( const size_t s12 = nb12 / sizeof(int32_t); // const size_t s13 = nb13 / sizeof(int32_t); + GGML_ASSERT(ne12 > 0); + GGML_ASSERT(ne11 <= std::numeric_limits::max() / ne12); + const uint3 ne12_fdv = init_fastdiv_values(ne12); + k_get_rows_float<<>>( src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ - /*ne10,*/ ne11, ne12, /*ne13,*/ + /*ne10,*/ ne11, ne12_fdv, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/);