vulkan: reduce iq1 shared memory usage for mul_mm (llama/24287)
This commit is contained in:
parent
686bc802d1
commit
dc794303d8
|
|
@ -3394,7 +3394,9 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
|
|||
switch (src0_type) {
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
lut_size = 2*2048 + 4*2048;
|
||||
// Regular matmul uses the compact uint16_t IQ1 grid; the expanded
|
||||
// uint32_t grid is only enabled for the q8_1/int-dot vector path.
|
||||
lut_size = 2*2048;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
lut_size = 8*256;
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
#extension GL_EXT_integer_dot_product : require
|
||||
|
||||
#define MMQ
|
||||
#define NEEDS_IQ1S_GRID_GPU
|
||||
#define B_TYPE block_q8_1_x4
|
||||
|
||||
#include "mul_mat_vec_base.glsl"
|
||||
|
|
|
|||
|
|
@ -598,9 +598,10 @@ const uint[1024] iq1s_grid_const = {
|
|||
0x55dd55df, 0x55d555d7, 0x5503550c, 0x557f5501, 0x5577557d, 0x55405575, 0x555d555f, 0x55555557
|
||||
};
|
||||
|
||||
#if defined(NEEDS_IQ1S_GRID_GPU)
|
||||
// Same content as iq1s_grid_const except each 2-bit value is expanded to 4-bit
|
||||
// and has 1 added to it (allows packed values to be extracted with & 0x0F0F0F0F
|
||||
// and 0xF0F0F0F0).
|
||||
// and 0xF0F0F0F0). This is only used by the q8_1/int-dot vector path.
|
||||
const uint32_t[2048] iq1s_grid_gpu_const = {
|
||||
0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
|
||||
0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
|
||||
|
|
@ -859,9 +860,12 @@ const uint32_t[2048] iq1s_grid_gpu_const = {
|
|||
0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
|
||||
0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
|
||||
};
|
||||
#endif
|
||||
|
||||
shared uint16_t iq1s_grid[2048];
|
||||
#if defined(NEEDS_IQ1S_GRID_GPU)
|
||||
shared uint32_t iq1s_grid_gpu[2048];
|
||||
#endif
|
||||
|
||||
#define NEEDS_INIT_IQ_SHMEM
|
||||
void init_iq_shmem(uvec3 wgsize)
|
||||
|
|
@ -875,12 +879,14 @@ void init_iq_shmem(uvec3 wgsize)
|
|||
iq1s_grid[2*idx+1] = g.y;
|
||||
}
|
||||
}
|
||||
#if defined(NEEDS_IQ1S_GRID_GPU)
|
||||
[[unroll]] for (uint i = 0; i < iq1s_grid_gpu_const.length(); i += wgsize.x) {
|
||||
uint idx = i + gl_LocalInvocationIndex.x;
|
||||
if (iq1s_grid_gpu_const.length() % wgsize.x == 0 || idx < iq1s_grid_gpu_const.length()) {
|
||||
iq1s_grid_gpu[idx] = iq1s_grid_gpu_const[idx];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
barrier();
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in New Issue