From f5c3ce17d563b7a86561062c1cd82ad7b1ebdd24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Mon, 27 Apr 2026 08:30:55 +0200 Subject: [PATCH] ggml : use 64 bytes aligned tile buffers (llama/21058) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit | Model | Test | t/s OLD | t/s NEW | Speedup | |:---------------------------------|:-------|----------:|----------:|----------:| | qwen35 0.8B BF16 | pp512 | 584.59 | 595.41 | 1.02 | | qwen35 0.8B BF16 | tg128 | 52.23 | 52.82 | 1.01 | | qwen35 0.8B IQ2_M - 2.7 bpw | pp512 | 260.64 | 261.70 | 1.00 | | qwen35 0.8B IQ2_M - 2.7 bpw | tg128 | 81.17 | 80.89 | 1.00 | | qwen35 0.8B IQ2_XXS - 2.0625 bpw | pp512 | 302.36 | 302.56 | 1.00 | | qwen35 0.8B IQ2_XXS - 2.0625 bpw | tg128 | 84.93 | 85.12 | 1.00 | | qwen35 0.8B IQ3_XXS - 3.0625 bpw | pp512 | 263.22 | 260.01 | 0.99 | | qwen35 0.8B IQ3_XXS - 3.0625 bpw | tg128 | 80.29 | 78.94 | 0.98 | | qwen35 0.8B IQ4_NL - 4.5 bpw | pp512 | 728.65 | 742.09 | 1.02 | | qwen35 0.8B IQ4_NL - 4.5 bpw | tg128 | 82.39 | 84.46 | 1.03 | | qwen35 0.8B IQ4_XS - 4.25 bpw | pp512 | 681.33 | 677.06 | 0.99 | | qwen35 0.8B IQ4_XS - 4.25 bpw | tg128 | 80.18 | 79.28 | 0.99 | | qwen35 0.8B Q2_K_M | pp512 | 413.28 | 415.94 | 1.01 | | qwen35 0.8B Q2_K_M | tg128 | 81.90 | 82.78 | 1.01 | | qwen35 0.8B Q3_K_M | pp512 | 493.17 | 495.08 | 1.00 | | qwen35 0.8B Q3_K_M | tg128 | 82.75 | 83.23 | 1.01 | | qwen35 0.8B Q3_K_S | pp512 | 429.35 | 427.64 | 1.00 | | qwen35 0.8B Q3_K_S | tg128 | 86.69 | 87.02 | 1.00 | | qwen35 0.8B Q4_0 | pp512 | 783.46 | 782.32 | 1.00 | | qwen35 0.8B Q4_0 | tg128 | 88.23 | 87.90 | 1.00 | | qwen35 0.8B Q4_1 | pp512 | 741.71 | 729.76 | 0.98 | | qwen35 0.8B Q4_1 | tg128 | 85.44 | 86.01 | 1.01 | | qwen35 0.8B Q4_K_M | pp512 | 676.24 | 681.31 | 1.01 | | qwen35 0.8B Q4_K_M | tg128 | 76.59 | 77.06 | 1.01 | | qwen35 0.8B Q4_K_S | pp512 | 683.12 | 688.81 | 1.01 | | qwen35 0.8B Q4_K_S | tg128 | 80.50 | 81.19 | 1.01 | | qwen35 0.8B Q5_K_M | pp512 | 635.33 | 642.11 | 1.01 | | qwen35 0.8B Q5_K_M | tg128 | 72.07 | 72.49 | 1.01 | | qwen35 0.8B Q5_K_S | pp512 | 660.95 | 658.18 | 1.00 | | qwen35 0.8B Q5_K_S | tg128 | 72.19 | 72.95 | 1.01 | | qwen35 0.8B Q6_K | pp512 | 647.97 | 638.84 | 0.99 | | qwen35 0.8B Q6_K | tg128 | 72.83 | 72.49 | 1.00 | | qwen35 0.8B Q8_0 | pp512 | 805.01 | 785.49 | 0.98 | | qwen35 0.8B Q8_0 | tg128 | 70.10 | 70.13 | 1.00 | Signed-off-by: Adrien Gallouët --- ggml/src/ggml-cpu/amx/mmq.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index 93a6d397..d9383a04 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -2005,12 +2005,12 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v const int lda = KB * sizeof(TA); //const int ldb = KB * sizeof(TB); - static thread_local packed_B_t Tile0[TILE_N * TILE_K]; - static thread_local packed_B_t Tile1[TILE_N * TILE_K]; - static thread_local int8_t Tile23[TILE_M * TILE_K]; + alignas(64) static thread_local packed_B_t Tile0[TILE_N * TILE_K]; + alignas(64) static thread_local packed_B_t Tile1[TILE_N * TILE_K]; + alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K]; - static thread_local int32_t TileC0[TILE_M * TILE_N * 4]; - static thread_local int32_t TileC1[TILE_M * TILE_N * 4]; + alignas(64) static thread_local int32_t TileC0[TILE_M * TILE_N * 4]; + alignas(64) static thread_local int32_t TileC1[TILE_M * TILE_N * 4]; // double buffering C to interleave avx512 and amx int32_t * C_cur = TileC0; @@ -2187,21 +2187,21 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v const int m1 = std::max(M - TILE_M, 0); //const int lda = KB * sizeof(TA); - static thread_local int8_t Tile0[TILE_N * TILE_K]; - static thread_local int8_t Tile1[TILE_N * TILE_K]; - static thread_local int8_t Tile23[TILE_M * TILE_K]; + alignas(64) static thread_local int8_t Tile0[TILE_N * TILE_K]; + alignas(64) static thread_local int8_t Tile1[TILE_N * TILE_K]; + alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K]; // mat mul result for each group - static thread_local int32_t Tile4[TILE_M * TILE_N]; - static thread_local int32_t Tile5[TILE_M * TILE_N]; - static thread_local int32_t Tile6[TILE_M * TILE_N]; - static thread_local int32_t Tile7[TILE_M * TILE_N]; + alignas(64) static thread_local int32_t Tile4[TILE_M * TILE_N]; + alignas(64) static thread_local int32_t Tile5[TILE_M * TILE_N]; + alignas(64) static thread_local int32_t Tile6[TILE_M * TILE_N]; + alignas(64) static thread_local int32_t Tile7[TILE_M * TILE_N]; // sum of each QK_K block, contains 8 groups, int32 - static thread_local int32_t Sumi4[TILE_M * TILE_N]; - static thread_local int32_t Sumi5[TILE_M * TILE_N]; - static thread_local int32_t Sumi6[TILE_M * TILE_N]; - static thread_local int32_t Sumi7[TILE_M * TILE_N]; + alignas(64) static thread_local int32_t Sumi4[TILE_M * TILE_N]; + alignas(64) static thread_local int32_t Sumi5[TILE_M * TILE_N]; + alignas(64) static thread_local int32_t Sumi6[TILE_M * TILE_N]; + alignas(64) static thread_local int32_t Sumi7[TILE_M * TILE_N]; const int k_group_size = std::is_same::value ? 16 : 32; for (int i = 0; i < KB; ++i) {