opencl: use flat variants of q4_K and q6_K gemv for very large M (llama/24006)
This commit is contained in:
parent
d31cb20b25
commit
f110ff540c
|
|
@ -4950,6 +4950,21 @@ inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backen
|
|||
return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
|
||||
}
|
||||
|
||||
static inline bool use_flat_gemv_for_large_m_q4_K(const ggml_tensor *tensor) {
|
||||
// gemv_noshuffle variant perf drops for large M, use flat variant for large M.
|
||||
// threshold is well above typical hidden/FFN dims, but below typical vocab sizes.
|
||||
// note that this forces large M weights to use LM GEMM.
|
||||
return tensor->ne[1] >= 32768 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
||||
}
|
||||
|
||||
static inline bool use_flat_gemv_for_large_m_q6_K(const ggml_tensor *tensor) {
|
||||
// gemv_noshuffle variant perf drops for large M, use flat variant for large M.
|
||||
// threshold is well above typical hidden/FFN dims, but below typical vocab sizes.
|
||||
// q6_K flat gemv is worse for smaller K; 2048 seems to be a reasonable threshold.
|
||||
// note that this forces large M weights to use LM GEMM.
|
||||
return tensor->ne[1] >= 32768 && tensor->ne[0] >= 2048 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
||||
}
|
||||
|
||||
static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
|
||||
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
|
||||
|
|
@ -6595,7 +6610,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
|
||||
kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
|
||||
}
|
||||
#else
|
||||
|
|
@ -6623,7 +6638,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
|
||||
tensor->extra = extra;
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
|
||||
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
|
|
@ -6923,7 +6938,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
cl_kernel kernel;
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
kernel = backend_ctx->kernel_convert_block_q6_K;
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
|
||||
kernel = backend_ctx->kernel_convert_block_q6_K_noshuffle;
|
||||
}
|
||||
#else
|
||||
|
|
@ -6956,7 +6971,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|||
tensor->extra = extra;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
|
||||
cl_int M = tensor->ne[1]; // ne01
|
||||
cl_int K = tensor->ne[0]; // ne00
|
||||
|
||||
|
|
@ -7599,7 +7614,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
|
||||
|
|
@ -7820,7 +7835,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
|
||||
static ggml_cl_buffer buf_trans_ql;
|
||||
static ggml_cl_buffer buf_trans_qh;
|
||||
static ggml_cl_buffer buf_trans_s;
|
||||
|
|
@ -13213,13 +13228,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|||
}
|
||||
|
||||
// q4_k x fp32
|
||||
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) {
|
||||
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q4_K(src0)) {
|
||||
ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// q6_K x fp32
|
||||
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) {
|
||||
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q6_K(src0)) {
|
||||
ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue