metal: template GLU kernels to support f16/f32 (llama/23882)
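Replaces the hardcoded f32 GLU kernels (reglu, geglu, swiglu, swiglu_oai, geglu_erf, geglu_quick) with one templated kernel per variant, each instantiated for both float and half. Loads and stores now use the tensor's native type (half or float) to save memory bandwidth, while the arithmetic is still carried out in float to avoid half-precision overflow and precision loss in the geglu/swiglu activation math. Also relaxes the GLU check in ggml_metal_device_supports_op so that f16 inputs are accepted in addition to f32.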
This commit is contained in:
parent
71d80aa49e
commit
050b8567a0
|
|
@ -1107,7 +1107,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
|||
case GGML_GLU_OP_SWIGLU_OAI:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
return ggml_is_contiguous_1(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1421,7 +1421,8 @@ template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat
|
|||
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
|
||||
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;
|
||||
|
||||
kernel void kernel_reglu_f32(
|
||||
template<typename T>
|
||||
kernel void kernel_reglu(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
|
|
@ -1429,19 +1430,25 @@ kernel void kernel_reglu_f32(
|
|||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
dst_row[i0] = x0*x1*(x0 > 0.0f);
|
||||
dst_row[i0] = (T)(x0*x1*(x0 > 0.0f));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_f32(
|
||||
typedef decltype(kernel_reglu<float>) kernel_reglu_t;
|
||||
|
||||
template [[host_name("kernel_reglu_f32")]] kernel kernel_reglu_t kernel_reglu<float>;
|
||||
template [[host_name("kernel_reglu_f16")]] kernel kernel_reglu_t kernel_reglu<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_geglu(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
|
|
@ -1449,9 +1456,9 @@ kernel void kernel_geglu_f32(
|
|||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
|
|
@ -1459,11 +1466,17 @@ kernel void kernel_geglu_f32(
|
|||
|
||||
const float gelu = 0.5f*x0*(1.0f + precise::tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0)));
|
||||
|
||||
dst_row[i0] = gelu*x1;
|
||||
dst_row[i0] = (T)(gelu*x1);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_swiglu_f32(
|
||||
typedef decltype(kernel_geglu<float>) kernel_geglu_t;
|
||||
|
||||
template [[host_name("kernel_geglu_f32")]] kernel kernel_geglu_t kernel_geglu<float>;
|
||||
template [[host_name("kernel_geglu_f16")]] kernel kernel_geglu_t kernel_geglu<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_swiglu(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
|
|
@ -1471,9 +1484,9 @@ kernel void kernel_swiglu_f32(
|
|||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
|
|
@ -1481,11 +1494,17 @@ kernel void kernel_swiglu_f32(
|
|||
|
||||
const float silu = x0 / (1.0f + exp(-x0));
|
||||
|
||||
dst_row[i0] = silu*x1;
|
||||
dst_row[i0] = (T)(silu*x1);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_swiglu_oai_f32(
|
||||
typedef decltype(kernel_swiglu<float>) kernel_swiglu_t;
|
||||
|
||||
template [[host_name("kernel_swiglu_f32")]] kernel kernel_swiglu_t kernel_swiglu<float>;
|
||||
template [[host_name("kernel_swiglu_f16")]] kernel kernel_swiglu_t kernel_swiglu<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_swiglu_oai(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
|
|
@ -1493,9 +1512,9 @@ kernel void kernel_swiglu_oai_f32(
|
|||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
float x0 = src0_row[i0];
|
||||
|
|
@ -1507,11 +1526,17 @@ kernel void kernel_swiglu_oai_f32(
|
|||
float out_glu = x0 / (1.0f + exp(-x0 * args.alpha));
|
||||
out_glu = out_glu * (1.0f + x1);
|
||||
|
||||
dst_row[i0] = out_glu;
|
||||
dst_row[i0] = (T)out_glu;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_erf_f32(
|
||||
typedef decltype(kernel_swiglu_oai<float>) kernel_swiglu_oai_t;
|
||||
|
||||
template [[host_name("kernel_swiglu_oai_f32")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai<float>;
|
||||
template [[host_name("kernel_swiglu_oai_f16")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_geglu_erf(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
|
|
@ -1519,9 +1544,9 @@ kernel void kernel_geglu_erf_f32(
|
|||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
|
|
@ -1529,11 +1554,17 @@ kernel void kernel_geglu_erf_f32(
|
|||
|
||||
const float gelu_erf = 0.5f*x0*(1.0f+erf_approx<float>(x0*SQRT_2_INV));
|
||||
|
||||
dst_row[i0] = gelu_erf*x1;
|
||||
dst_row[i0] = (T)(gelu_erf*x1);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_quick_f32(
|
||||
typedef decltype(kernel_geglu_erf<float>) kernel_geglu_erf_t;
|
||||
|
||||
template [[host_name("kernel_geglu_erf_f32")]] kernel kernel_geglu_erf_t kernel_geglu_erf<float>;
|
||||
template [[host_name("kernel_geglu_erf_f16")]] kernel kernel_geglu_erf_t kernel_geglu_erf<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_geglu_quick(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
|
|
@ -1541,9 +1572,9 @@ kernel void kernel_geglu_quick_f32(
|
|||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
|
|
@ -1551,10 +1582,15 @@ kernel void kernel_geglu_quick_f32(
|
|||
|
||||
const float gelu_quick = x0*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x0)));
|
||||
|
||||
dst_row[i0] = gelu_quick*x1;
|
||||
dst_row[i0] = (T)(gelu_quick*x1);
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_geglu_quick<float>) kernel_geglu_quick_t;
|
||||
|
||||
template [[host_name("kernel_geglu_quick_f32")]] kernel kernel_geglu_quick_t kernel_geglu_quick<float>;
|
||||
template [[host_name("kernel_geglu_quick_f16")]] kernel kernel_geglu_quick_t kernel_geglu_quick<half>;
|
||||
|
||||
kernel void kernel_op_sum_f32(
|
||||
constant ggml_metal_kargs_sum & args,
|
||||
device const float * src0,
|
||||
|
|
|
|||
Loading…
Reference in New Issue