From 587f602ea1cc081b83de5c23f70ccf7a170d1392 Mon Sep 17 00:00:00 2001 From: shalinib-ibm Date: Fri, 19 Jun 2026 11:25:38 +0530 Subject: [PATCH] ggml-cpu: support K tails in power10 Q8/Q4 MMA matmul (llama/24753) * ggml-cpu: support K tails in Power10 MMA Q8/Q4 matmul This patch removes the requirement that K be divisible by kc in the tinyBLAS_Q0_PPC tiled matmul path. Process the final K panel using its actual depth and pass the reduced panel size through packing and kernel execution. This allows more workloads to use the MMA kernel and reduces fallback to mnpack. * Apply suggestion from @taronaeo Co-authored-by: Aaron Teo --------- Co-authored-by: Aaron Teo --- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index e13828e3b..0b8323e60 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -2345,7 +2345,7 @@ class tinyBLAS_Q0_PPC { else if (n_aligned % 16 == 0) nc = 16; else nc = 8; } - bool can_use_tiled = n_aligned > 0 && (m % mc == 0) && (k % kc == 0); + bool can_use_tiled = n_aligned > 0 && (m % mc == 0); if (can_use_tiled) { matmul_tiled(m, n_aligned, mc, nc, kc); if (n > n_aligned) { @@ -3063,13 +3063,14 @@ class tinyBLAS_Q0_PPC { int64_t ii = (job / xtiles) * mc; int64_t jj = (job % xtiles) * nc; for (int64_t kk = 0; kk < k; kk += kc) { + int64_t k_cur = MIN(kc, k - kk); if constexpr(is_Ablock_q4) { - packNormal_q4_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack); + packNormal_q4_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack); } else { - packNormal_q8_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack); + packNormal_q8_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack); } - packNormal_q8_fp16(B + jj * ldb + kk, ldb, 
nc, k_cur, (uint8_t *)B_pack); + KERNEL_Q0(ii, jj, mc, nc, k_cur, kk, A_pack, B_pack); } } }