From 982533fc0c38dabc6f7fa9155b7e33e5f565e223 Mon Sep 17 00:00:00 2001 From: Matt Corallo <649246+TheBlueMatt@users.noreply.github.com> Date: Mon, 1 Jun 2026 09:46:48 +0000 Subject: [PATCH] vulkan: Block-load Q3_K/Q6_K block data and subtract on 32b ints (llama/23056) Q2_K/Q3_K/Q6_K do much better when using MMVQ on Intel BMG even though they're only 2-byte aligned, and Q3_K still wins on NVIDIA as well. mesa isn't all that great at coalescing back-to-back loads from alternating arrays, so we force it instead. Further, we can do subtraction directly on a full int32_t rather than an i8vec4 with bit twiddling because the high bit is always free to start. On Intel BMG on mesa, the switch to MMVQ provides an immediate ~57% perf increase in tg128 for unsloth/Qwen3.5-9B-GGUF:Q3_K and ~78% perf increase in tg128 for unsloth/Qwen3.5-9B-GGUF:Q6_K. The further switch to block loads leads to a ~24% perf increase in tg128 for unsloth/Qwen3.5-9B-GGUF:Q3_K and a ~48% perf increase in tg128 for unsloth/Qwen3.5-9B-GGUF:Q6_K. Finally, Xe2 wins on MMVQ even for small k, so we take the NVIDIA override for K quants on Xe2 as well. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 19 +++- .../vulkan-shaders/mul_mat_vecq_funcs.glsl | 106 +++++++++++------- 2 files changed, 79 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 74104149d..3cf191f20 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -8336,8 +8336,10 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ return false; } - // General performance issue with q3_k and q6_k due to 2-byte alignment - if (src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) { + // q6_k only has 2-byte alignment which makes it somewhat problematic, + // using MMVQ is only a win on Intel.
+ bool mmvq_q6 = device->vendor_id == VK_VENDOR_ID_INTEL; + if (src0_type == GGML_TYPE_Q6_K && !mmvq_q6) { return false; } @@ -8349,7 +8351,7 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ // Quantization overhead is not worth it for small k switch (device->vendor_id) { case VK_VENDOR_ID_NVIDIA: - if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) { + if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_IQ1_S || src0_type == GGML_TYPE_IQ1_M) { return true; } @@ -8376,9 +8378,16 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ return true; } case VK_VENDOR_ID_INTEL: + if (device->architecture == vk_device_architecture::INTEL_XE2) { + if (src0_type == GGML_TYPE_Q2_K || src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q6_K) { + return true; + } + } + if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) { - // Intel Windows proprietary driver MMVQ performance is worse than fp16, see - // https://github.com/ggml-org/llama.cpp/issues/17628 + // Intel Windows proprietary driver MMVQ performance for !Q2/Q3/Q6 is worse than fp16, + // see https://github.com/ggml-org/llama.cpp/issues/17628 and + // https://github.com/ggml-org/llama.cpp/pull/23056 return false; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl index bc580aeeb..73cf9c799 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl @@ -212,28 +212,40 @@ i32vec4 repack4(uint ib, uint iqs) { const uint qs_shift = ((iqs_k % 32) / 8) * 2; const uint hm_shift = iqs_k / 8; - // bitwise OR to add 4 if hmask is set, subtract later - const i8vec2 vals00 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 ] >> qs_shift) & uint16_t(0x0303))) | - 
unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 ] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals01 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 1] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 1] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals10 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 2] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 2] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals11 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 3] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 3] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals20 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 4] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 4] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals21 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 5] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 5] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals30 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 6] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 6] >> hm_shift) & uint16_t(0x0101)) << 2)); - const i8vec2 vals31 = unpack8(int16_t((data_a_packed16[ib_k].qs[qs_idx * 2 + 7] >> qs_shift) & uint16_t(0x0303))) | - unpack8(int16_t(((data_a_packed16[ib_k].hmask[iqs * 2 + 7] >> hm_shift) & uint16_t(0x0101)) << 2)); + const uvec4 qs = uvec4( uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 ]) | + (uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 1]) << 16), + uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 2]) | + (uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 3]) << 16), + uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 4]) | + (uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 
5]) << 16), + uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 6]) | + (uint32_t(data_a_packed16[ib_k].qs[qs_idx * 2 + 7]) << 16)); - return i32vec4(pack32(i8vec4(vals00.x, vals00.y, vals01.x, vals01.y) - int8_t(4)), - pack32(i8vec4(vals10.x, vals10.y, vals11.x, vals11.y) - int8_t(4)), - pack32(i8vec4(vals20.x, vals20.y, vals21.x, vals21.y) - int8_t(4)), - pack32(i8vec4(vals30.x, vals30.y, vals31.x, vals31.y) - int8_t(4))); + const uvec4 hmask = uvec4( uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 ]) | + (uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 1]) << 16), + uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 2]) | + (uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 3]) << 16), + uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 4]) | + (uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 5]) << 16), + uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 6]) | + (uint32_t(data_a_packed16[ib_k].hmask[iqs * 2 + 7]) << 16)); + + // bitwise OR to add 4 if hmask is set, subtract later + const uint vals0 = (( qs.x >> qs_shift) & 0x03030303) | + (((hmask.x >> hm_shift) & 0x01010101) << 2); + const uint vals1 = (( qs.y >> qs_shift) & 0x03030303) | + (((hmask.y >> hm_shift) & 0x01010101) << 2); + const uint vals2 = (( qs.z >> qs_shift) & 0x03030303) | + (((hmask.z >> hm_shift) & 0x01010101) << 2); + const uint vals3 = (( qs.w >> qs_shift) & 0x03030303) | + (((hmask.w >> hm_shift) & 0x01010101) << 2); + + // Subtract 4 by twiddling bits rather than using re-packing as mesa + // compiles repacking poorly. 
+ return i32vec4(int32_t(((vals0 ^ 0x80808080) - 0x04040404) ^ 0x80808080), + int32_t(((vals1 ^ 0x80808080) - 0x04040404) ^ 0x80808080), + int32_t(((vals2 ^ 0x80808080) - 0x04040404) ^ 0x80808080), + int32_t(((vals3 ^ 0x80808080) - 0x04040404) ^ 0x80808080)); } float get_d_scale(uint ib, uint iqs) { @@ -343,27 +355,39 @@ i32vec4 repack4(uint ib, uint iqs) { const uint qh_idx = (iqs_k / 32) * 8 + iqs; const uint qh_shift = ((iqs_k % 32) / 8) * 2; - const i8vec2 vals00 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 ] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 ] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals01 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 1] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 1] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals10 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 2] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 2] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals11 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 3] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 3] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals20 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 4] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 4] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals21 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 5] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 5] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals30 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 6] >> ql_shift) & uint16_t(0x0F0F))) | - 
unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 6] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); - const i8vec2 vals31 = (unpack8(int16_t((data_a_packed16[ib_k].ql[ql_idx * 2 + 7] >> ql_shift) & uint16_t(0x0F0F))) | - unpack8(int16_t(((data_a_packed16[ib_k].qh[qh_idx * 2 + 7] >> qh_shift) & uint16_t(0x0303)) << 4))) - int8_t(32); + const uvec4 ql = uvec4( uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 ]) | + (uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 1]) << 16), + uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 2]) | + (uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 3]) << 16), + uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 4]) | + (uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 5]) << 16), + uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 6]) | + (uint32_t(data_a_packed16[ib_k].ql[ql_idx * 2 + 7]) << 16)); - return i32vec4(pack32(i8vec4(vals00.x, vals00.y, vals01.x, vals01.y)), - pack32(i8vec4(vals10.x, vals10.y, vals11.x, vals11.y)), - pack32(i8vec4(vals20.x, vals20.y, vals21.x, vals21.y)), - pack32(i8vec4(vals30.x, vals30.y, vals31.x, vals31.y))); + const uvec4 qh = uvec4( uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 ]) | + (uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 1]) << 16), + uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 2]) | + (uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 3]) << 16), + uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 4]) | + (uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 5]) << 16), + uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 6]) | + (uint32_t(data_a_packed16[ib_k].qh[qh_idx * 2 + 7]) << 16)); + + const uint vals0 = (( ql.x >> ql_shift) & 0x0F0F0F0F) | + (((qh.x >> qh_shift) & 0x03030303) << 4); + const uint vals1 = (( ql.y >> ql_shift) & 0x0F0F0F0F) | + (((qh.y >> qh_shift) & 0x03030303) << 4); + const uint vals2 = (( ql.z >> ql_shift) & 0x0F0F0F0F) | + (((qh.z >> qh_shift) & 0x03030303) << 4); + const uint vals3 = (( ql.w >> ql_shift) & 0x0F0F0F0F) | + (((qh.w >> qh_shift) & 0x03030303) << 4); + + // 
Subtract 32 by twiddling bits rather than using re-packing as mesa + // compiles repacking poorly. + return i32vec4(int32_t(((vals0 ^ 0x80808080) - 0x20202020) ^ 0x80808080), + int32_t(((vals1 ^ 0x80808080) - 0x20202020) ^ 0x80808080), + int32_t(((vals2 ^ 0x80808080) - 0x20202020) ^ 0x80808080), + int32_t(((vals3 ^ 0x80808080) - 0x20202020) ^ 0x80808080)); } float get_d_scale(uint ib, uint iqs) {