CUDA: use fastdiv for batch index split in get_rows (llama/22650)

This commit is contained in:
leonardHONG 2026-05-04 22:24:05 +08:00 committed by Georgi Gerganov
parent 0fffe2cdb8
commit 36a83b84bb
1 changed files with 20 additions and 10 deletions

View File

@ -6,17 +6,18 @@ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
const int i11 = dm.x;
const int i12 = dm.y;
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
@ -42,17 +43,18 @@ template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
const int i11 = dm.x;
const int i12 = dm.y;
if (i00 >= ne00) {
return;
@ -115,10 +117,14 @@ static void get_rows_cuda_q(
GGML_ASSERT(ne00 % 2 == 0);
GGML_ASSERT(ne12 > 0);
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
const uint3 ne12_fdv = init_fastdiv_values(ne12);
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
@ -146,10 +152,14 @@ static void get_rows_cuda_float(
const size_t s12 = nb12 / sizeof(int32_t);
// const size_t s13 = nb13 / sizeof(int32_t);
GGML_ASSERT(ne12 > 0);
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
const uint3 ne12_fdv = init_fastdiv_values(ne12);
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);