From f7dfa39104dbb756fc0d839698edaffaf3c7ddaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 30 Oct 2025 08:56:28 +0100 Subject: [PATCH] cuda : fix argsort with 64k+ rows (llama/16849) --- ggml/src/ggml-cuda/argsort.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index 6e7b90d4..3722cf3a 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -87,7 +87,7 @@ template static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) { // bitonic sort int col = threadIdx.x; - int row = blockIdx.y; + int row = blockIdx.x; if (col >= ncols_pad) { return; @@ -151,7 +151,7 @@ static void argsort_f32_i32_cuda_bitonic(const float * x, const int ncols_pad = next_power_of_2(ncols); const dim3 block_dims(ncols_pad, 1, 1); - const dim3 block_nums(1, nrows, 1); + const dim3 block_nums(nrows, 1, 1); const size_t shared_mem = ncols_pad * sizeof(int); // FIXME: this limit could be raised by ~2-4x on Ampere or newer