sycl : Optimize Q3_K mul_mat by reorder (llama/23725)

This commit is contained in:
Neo Zhang 2026-06-01 14:50:55 +08:00 committed by Georgi Gerganov
parent 1c0d1f0f7c
commit 687fbcb149
7 changed files with 340 additions and 4 deletions

View File

@ -107,6 +107,19 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
#endif
}
template <typename dst_t>
static void dequantize_row_q3_K_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int64_t nb = k / QK_K;
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
[=](sycl::nd_item<3> item_ct1) {
dequantize_block_q3_K_reorder(vx, y, item_ct1, nb);
});
}
template <typename dst_t>
static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
@ -652,7 +665,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
case GGML_TYPE_Q2_K:
return dequantize_row_q2_K_sycl;
case GGML_TYPE_Q3_K:
return dequantize_row_q3_K_sycl;
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q3_K_sycl_reorder;
} else {
return dequantize_row_q3_K_sycl;
}
case GGML_TYPE_Q4_K:
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q4_K_sycl_reorder;
@ -730,7 +747,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
case GGML_TYPE_Q2_K:
return dequantize_row_q2_K_sycl;
case GGML_TYPE_Q3_K:
return dequantize_row_q3_K_sycl;
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q3_K_sycl_reorder;
} else {
return dequantize_row_q3_K_sycl;
}
case GGML_TYPE_Q4_K:
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {

View File

@ -390,6 +390,63 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
}
template<typename dst_t>
static void dequantize_block_q3_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
#if QK_K == 256
const int64_t i = item_ct1.get_group(2);
if (i >= n_blocks) {
return;
}
const uint8_t * base = static_cast<const uint8_t *>(vx);
const size_t qs_offset = i * (QK_K / 4);
const size_t hmask_offset = n_blocks * (QK_K / 4) + i * (QK_K / 8);
const size_t scales_offset = n_blocks * (QK_K / 4) + n_blocks * (QK_K / 8) + i * 12;
const size_t d_offset = n_blocks * (QK_K / 4) + n_blocks * (QK_K / 8) + n_blocks * 12 +
i * sizeof(ggml_half);
const uint8_t * qs = base + qs_offset;
const uint8_t * hmask = base + hmask_offset;
const uint8_t * scales = base + scales_offset;
const float d_all = static_cast<float>(*reinterpret_cast<const ggml_half *>(base + d_offset));
const int64_t r = item_ct1.get_local_id(2) / 4;
const int64_t tid = r / 2;
const int64_t is0 = r % 2;
const int64_t l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
const int64_t n = tid / 4;
const int64_t j = tid - 4 * n;
const int64_t is = 8 * n + 2 * j + is0;
const int shift = 2 * j;
uint8_t m = 1 << (4 * n + j);
uint8_t us = is < 4
? (scales[is - 0] & 0xF) | (((scales[is + 8] >> 0) & 3) << 4)
: is < 8
? (scales[is - 0] & 0xF) | (((scales[is + 4] >> 2) & 3) << 4)
: is < 12
? (scales[is - 8] >> 4) | (((scales[is + 0] >> 4) & 3) << 4)
: (scales[is - 8] >> 4) | (((scales[is - 4] >> 6) & 3) << 4);
const float dl = d_all * (us - 32);
dst_t * y = yy + i * QK_K + 128 * n + 32 * j;
const uint8_t * q = qs + 32 * n;
const uint8_t * hm = hmask;
for (int l = l0; l < l0 + 4; ++l) {
y[l] = dl * ((int8_t) ((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
}
#else
GGML_UNUSED(vx);
GGML_UNUSED(yy);
GGML_UNUSED(item_ct1);
GGML_UNUSED(n_blocks);
GGML_ABORT("Q3_K reorder dequantize not supported for QK_K != 256");
#endif
}
#if QK_K == 256
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
if (j < 4) {

View File

@ -501,6 +501,103 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
}
}
static void dequantize_mul_mat_vec_q3_k_reorder(const void *__restrict__ vx,
const float *__restrict__ yy,
float *__restrict__ dst,
const int ncols, int nrows,
const sycl::nd_item<3> &item_ct1) {
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);
if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
// SOA base pointers for the reordered layout:
// [qs: nb * (QK_K/4)] [hmask: nb * (QK_K/8)] [scales: nb * 12] [d: nb * sizeof(half)]
const int nb = nrows * num_blocks_per_row;
const uint8_t * qs_base = (const uint8_t *)vx;
const uint8_t * hmask_base = qs_base + (size_t)nb * (QK_K / 4);
const uint8_t * scales_base = hmask_base + (size_t)nb * (QK_K / 8);
const sycl::half * d_base = (const sycl::half *)(scales_base + (size_t)nb * 12);
float tmp = 0; // partial sum for thread in warp
#if QK_K == 256
const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;
const int tid =
item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix =
item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
const int step = 16/K_QUANTS_PER_ITERATION;
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0....15 or 0...7
const uint8_t m = 1 << (4*im);
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
const int q_offset = 32*im + l0;
const int y_offset = 128*im + l0;
uint16_t utmp[4];
const int8_t * s = (const int8_t *)utmp;
const uint16_t s_shift = 4*im;
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
const int bi = ib0 + i;
const float * y = yy + i * QK_K + y_offset;
const uint8_t * q = qs_base + bi * (QK_K / 4) + q_offset;
const uint8_t * h = hmask_base + bi * (QK_K / 8) + l0;
const uint16_t * a = (const uint16_t *)(scales_base + bi * 12);
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
const float d = d_base[bi];
float sum = 0;
for (int l = 0; l < n; ++l) {
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
}
tmp += d * sum;
}
#else
GGML_UNUSED(vx);
GGML_UNUSED(yy);
GGML_UNUSED(ncols);
GGML_UNUSED(item_ct1);
GGML_ABORT("Q3_K reorder DMMV not supported for QK_K != 256");
#endif
// sum up partial sums and write back result
#pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
if (item_ct1.get_local_id(2) == 0) {
dst[row] = tmp;
}
}
/*
DPCT1110:6: The total declared local variable size in device function
dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
@ -1440,6 +1537,22 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
});
}
static void dequantize_mul_mat_vec_q3_K_sycl_reorder(const void *vx, const float *y,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2 / K_QUANTS_PER_ITERATION;
const int block_num_y = (nrows + ny - 1) / ny;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
dequantize_mul_mat_vec_q3_k_reorder(vx, y, dst, ncols, nrows, item_ct1);
});
}
static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
float *dst, const int ncols,
const int nrows,
@ -1581,7 +1694,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q3_K:
dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
dequantize_mul_mat_vec_q3_K_sycl_reorder(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
} else {
dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q4_K:
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&

View File

@ -3549,6 +3549,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
return true;
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
@ -3572,6 +3573,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
@ -3791,6 +3793,54 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
return true;
}
static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
GGML_ASSERT(size % sizeof(block_q3_K) == 0);
GGML_ASSERT(offset % sizeof(block_q3_K) == 0);
const int nblocks = size / sizeof(block_q3_K);
sycl_reorder_temp_buffer tmp(stream, size);
if (!tmp) {
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
return false;
}
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}
auto * qs_ptr = data_device;
auto * hmask_ptr = qs_ptr + (QK_K / 4) * nblocks;
auto * scales_ptr = hmask_ptr + (QK_K / 8) * nblocks;
sycl::half * d_ptr = (sycl::half *) (scales_ptr + 12 * nblocks);
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
const block_q3_K * x = (const block_q3_K *) tmp_buf;
const int ib = i;
for (int j = 0; j < QK_K / 4; ++j) {
qs_ptr[ib * (QK_K / 4) + j] = x[ib].qs[j];
}
for (int j = 0; j < QK_K / 8; ++j) {
hmask_ptr[ib * (QK_K / 8) + j] = x[ib].hmask[j];
}
for (int j = 0; j < 12; ++j) {
scales_ptr[ib * 12 + j] = x[ib].scales[j];
}
d_ptr[ib] = x[ib].d;
});
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
return true;
}
static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
GGML_ASSERT(size % sizeof(block_q5_K) == 0);
GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
@ -3903,6 +3953,8 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
case GGML_TYPE_Q8_0:
return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
case GGML_TYPE_Q3_K:
return reorder_qw_q3_k(data_device, size, 0, stream);
case GGML_TYPE_Q4_K:
return reorder_qw_q4_k(data_device, size, 0, stream);
case GGML_TYPE_Q5_K:

View File

@ -770,6 +770,26 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
}
}
static void reorder_mul_mat_vec_q3_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
// Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel.
constexpr size_t num_subgroups = WARP_SIZE;
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups;
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q3_K>>(vx, vy, dst, ncols, nrows,
nd_item);
});
});
}
static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
@ -1153,7 +1173,15 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
case GGML_TYPE_Q3_K:
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q3_k_q8_1_sycl\n");
reorder_mul_mat_vec_q3_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff,
stream);
} else {
GGML_SYCL_DEBUG("Calling mul_mat_vec_q3_K_q8_1_sycl\n");
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q4_K:
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&

View File

@ -58,6 +58,31 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
};
template <> struct block_q_t<GGML_TYPE_Q3_K> {
struct traits {
static constexpr uint32_t qk = QK_K;
static constexpr uint32_t qi = QI3_K;
static constexpr uint32_t qr = QR3_K;
static constexpr uint32_t vdr_mmvq = 1;
};
// Reordered layout: [qs (QK_K/4 per block)] [hmask (QK_K/8 per block)] [scales] [d]
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
auto qs_offset = block_index * (QK_K / 4);
auto hmask_offset = n_blocks * (QK_K / 4) + block_index * (QK_K / 8);
return { qs_offset, hmask_offset };
}
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
auto nblocks = (nrows * (ncols / QK_K));
auto total_qs_bytes = nblocks * (QK_K / 4) + nblocks * (QK_K / 8);
return { total_qs_bytes + block_index * 12,
total_qs_bytes + nblocks * 12 + block_index * sizeof(ggml_half) };
}
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
};
template <> struct block_q_t<GGML_TYPE_Q4_K> {
struct traits {
static constexpr uint32_t qk = QK_K;

View File

@ -394,6 +394,41 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0> {
}
};
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q3_K> {
static constexpr ggml_type gtype = GGML_TYPE_Q3_K;
using q3_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q3_K>;
using q3_k_traits = typename q3_k_block::traits;
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
const sycl::half2 * q8_1_ds, const int & iqs) {
const uint8_t * base = static_cast<const uint8_t *>(vbq);
const uint8_t * qs = base + ibx_offset.first;
const uint8_t * hmask = base + ibx_offset.second;
const uint8_t * scales = base + d_offset.first;
const ggml_half d = *reinterpret_cast<const ggml_half *>(base + d_offset.second);
const int bq8_offset = QR3_K * (iqs / (QI3_K / 2));
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1 / 2);
const int vl = get_int_from_uint8(qs, iqs);
const int vh = ~get_int_from_uint8(hmask, iqs % (QI3_K / 2)) >> bq8_offset;
int u[QR3_K];
float d8[QR3_K];
#pragma unroll
for (int i = 0; i < QR3_K; ++i) {
const int8_t * quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
u[i] = get_int_from_int8_aligned(quant_base_ptr, iqs % QI8_1);
d8[i] = (*(q8_1_ds + bq8_offset + i))[0];
}
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, scales, scale_offset, static_cast<float>(d), d8);
}
};
static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
const int & iqs) {