diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h index 6fcf5a433..5ad121ae5 100644 --- a/ggml/include/ggml-rpc.h +++ b/ggml/include/ggml-rpc.h @@ -8,10 +8,10 @@ extern "C" { #define RPC_PROTO_MAJOR_VERSION 4 #define RPC_PROTO_MINOR_VERSION 0 -#define RPC_PROTO_PATCH_VERSION 0 +#define RPC_PROTO_PATCH_VERSION 1 #ifdef __cplusplus -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION"); +static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION"); #endif #define GGML_RPC_MAX_SERVERS 16 diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index f67252655..374934aac 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -535,6 +535,7 @@ extern "C" { GGML_OP_IM2COL, GGML_OP_IM2COL_BACK, GGML_OP_IM2COL_3D, + GGML_OP_COL2IM_1D, GGML_OP_CONV_2D, GGML_OP_CONV_3D, GGML_OP_CONV_2D_DW, @@ -2007,6 +2008,16 @@ extern "C" { int d1, // dilation dimension 1 bool is_2D); + // col2im_1d: scatter-add GEMM columns back to 1D signal + // a: [K*OC, T_in] (columns from matmul, K = a->ne[0]/OC) + // result: [T_out, OC] where T_out = (T_in - 1)*s0 + K - 2*p0 + GGML_API struct ggml_tensor * ggml_col2im_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, // columns [K*OC, T_in] + int s0, // stride + int oc, // output channels + int p0); // padding to crop from both sides + GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index cd5c61a81..af7827aec 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1912,6 +1912,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col_3d(params, tensor); } break; + case GGML_OP_COL2IM_1D: + { + ggml_compute_forward_col2im_1d(params, tensor); + } break; case GGML_OP_CONV_2D: { ggml_compute_forward_conv_2d(params, tensor); @@ -2343,6 +2347,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_CONV_2D: case GGML_OP_CONV_3D: case GGML_OP_CONV_2D_DW: + case GGML_OP_COL2IM_1D: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index becac9d6e..86842e554 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6730,6 +6730,78 @@ static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) { return (coord + size) % size; // adding size avoids negative number weirdness } +// ggml_compute_forward_col2im_1d +// +// Scatter-add columns [K*OC, T_in] -> signal [T_out, OC] +// where T_out = (T_in - 1)*s + K - 2*p. Gather approach: each output reads ceil(K/s) inputs. +// Parallelized over the time axis so the split stays balanced whatever OC is. +// Supports F32, F16, BF16 input/output (same type), F32 accumulator. + +template +static void ggml_compute_forward_col2im_1d_impl( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src = dst->src[0]; // [K*OC, T_in] + + GGML_ASSERT(ggml_is_contiguous(src)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t OC = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + + const int64_t K_OC = src->ne[0]; + const int64_t T_in = src->ne[1]; + const int64_t K = K_OC / OC; + const int64_t T_out = dst->ne[0]; + + const elem_t * col_data = (const elem_t *) src->data; + elem_t * dst_data = (elem_t *) dst->data; + + const int ith = params->ith; + const int nth = params->nth; + + // Parallelize over the time axis: the split stays balanced whatever OC is, + // down to OC = 1 for mono audio, and threads read disjoint column bands + const int64_t dr = (T_out + nth - 1) / nth; + const int64_t it0 = dr * ith; + const int64_t it1 = it0 + dr < T_out ? it0 + dr : T_out; + + for (int64_t oc = 0; oc < OC; oc++) { + for (int64_t t_out = it0; t_out < it1; t_out++) { + const int64_t t_abs = t_out + p0; // absolute position in uncropped signal + // Gather: find all (t_in, k) where t_in * s + k == t_abs, 0 <= k < K + int64_t t_in_min = (t_abs - K + 1 + s0 - 1) / s0; // ceil((t_abs-K+1)/s) + if (t_in_min < 0) t_in_min = 0; + int64_t t_in_max = t_abs / s0; + if (t_in_max >= T_in) t_in_max = T_in - 1; + + float sum = 0.0f; + for (int64_t t_in = t_in_min; t_in <= t_in_max; t_in++) { + int64_t k = t_abs - t_in * s0; + if (k >= 0 && k < K) { + // col layout: [K*OC, T_in], element (oc*K+k, t_in) + sum += type_conversion_table::to_f32(col_data[(oc * K + k) + t_in * K_OC]); + } + } + // dst layout: [T_out, OC], element (t_out, oc) + dst_data[t_out + oc * T_out] = type_conversion_table::from_f32(sum); + } + } +} + +void ggml_compute_forward_col2im_1d( + const ggml_compute_params * params, + ggml_tensor * dst) { + switch (dst->src[0]->type) { + case GGML_TYPE_F32: ggml_compute_forward_col2im_1d_impl (params, dst); break; + case GGML_TYPE_F16: ggml_compute_forward_col2im_1d_impl(params, dst); break; + case GGML_TYPE_BF16: ggml_compute_forward_col2im_1d_impl(params, dst); break; + default: GGML_ABORT("col2im_1d: unsupported type %d", dst->src[0]->type); + } +} + // ggml_compute_forward_conv_2d diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 7398e5618..a8e18c716 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -68,6 +68,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_col2im_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8815c67d8..18a5ebd2a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1031,6 +1031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "IM2COL", "IM2COL_BACK", "IM2COL_3D", + "COL2IM_1D", "CONV_2D", "CONV_3D", "CONV_2D_DW", @@ -1080,7 +1081,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1141,6 +1142,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "im2col(x)", "im2col_back(x)", "im2col_3d(x)", + "col2im_1d(x)", "conv_2d(x)", "conv_3d(x)", "conv_2d_dw(x)", @@ -1190,7 +1192,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); +static_assert(GGML_OP_COUNT == 97, "GGML_OP_COUNT != 97"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4541,6 +4543,41 @@ struct ggml_tensor * ggml_conv_1d_dw_ph( return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0); } +// ggml_col2im_1d + +struct ggml_tensor * ggml_col2im_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int s0, + int oc, + int p0) { + GGML_ASSERT(ggml_is_matrix(a)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16); + GGML_ASSERT(s0 > 0); + GGML_ASSERT(oc > 0); + GGML_ASSERT(p0 >= 0); + + const int64_t K_OC = a->ne[0]; + const int64_t T_in = a->ne[1]; + const int64_t K = K_OC / oc; + const int64_t T_out = (T_in - 1) * s0 + K - 2 * p0; + + GGML_ASSERT(K_OC == K * oc); // a->ne[0] must be a whole number of oc blocks + GGML_ASSERT(K > 0 && T_out > 0); + + const int64_t ne[4] = { T_out, oc, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 2, ne); + + int32_t params[] = { s0, (int32_t)oc, (int32_t)p0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_COL2IM_1D; + result->src[0] = a; + + return result; +} + // ggml_conv_transpose_1d static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {