TP: quantized KV cache support (llama/23792)
* TP: quantized KV cache support * fix partial view * remove overly strict assert
This commit is contained in:
parent
982533fc0c
commit
e815b264eb
|
|
@ -381,11 +381,15 @@ extern "C" {
|
|||
// - most tensors have n_segments == 1 and a contiguous slice of the tensor data
|
||||
// - some tensors have an inhomogenenous data layout along the split axis,
|
||||
// those tensors are divided into segments which are each individually split across devices
|
||||
// - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
|
||||
// the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
|
||||
// - ne has one entry per segment and device and that segment repeats nr times,
|
||||
// in total when accounting for repetitions the segments add up to ggml_tensor::ne for that axis,
|
||||
// the outer/inner loops are over segments/devices like [seg0_dev0_r0, seg0_dev1_r0, seg0_dev0_r1, seg0_dev1_r1, seg1_dev0_r0, seg1_dev1_r0],
|
||||
// - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
|
||||
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V
|
||||
// that each need to be split individually across devices so that each device gets a slice of Q, K, and V,
|
||||
// the Q matrix can be larger than the K and V matrices so this can either be expressed as 3 segments or as 2 segments
|
||||
// where the segment for K/V repeats twice
|
||||
int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES];
|
||||
uint32_t nr[16];
|
||||
uint32_t n_segments;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -487,6 +487,9 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
|
|||
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) {
|
||||
// FIXME Currently this function preserves/erases the information in n_segments and nr in an inconsistent way.
|
||||
// Since the operations in question are developed specifically for llama.cpp this currently does not manifest as a bug there.
|
||||
// However, in a broader ggml context with arbitrary ggml graphs this can lead to unexpected results.
|
||||
const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
|
||||
|
|
@ -497,11 +500,11 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
int64_t sum_a = 0;
|
||||
for (size_t s = 0; s < a.n_segments; s++) {
|
||||
sum_a += a.ne[s*n_bufs + j];
|
||||
sum_a += a.ne[s*n_bufs + j] * a.nr[s];
|
||||
}
|
||||
int64_t sum_b = 0;
|
||||
for (size_t s = 0; s < b.n_segments; s++) {
|
||||
sum_b += b.ne[s*n_bufs + j];
|
||||
sum_b += b.ne[s*n_bufs + j] * b.nr[s];
|
||||
}
|
||||
if (sum_a != sum_b) {
|
||||
return false;
|
||||
|
|
@ -511,7 +514,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
};
|
||||
|
||||
auto handle_generic = [&](const std::vector<ggml_backend_meta_split_state> & src_ss, bool scalar_only) -> ggml_backend_meta_split_state {
|
||||
ggml_backend_meta_split_state ret = {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, 1};
|
||||
ggml_backend_meta_split_state ret = {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, {1}, 1};
|
||||
for (size_t i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (tensor->src[i] == nullptr || tensor->src[i] == tensor) {
|
||||
continue;
|
||||
|
|
@ -519,15 +522,15 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
if (ret.axis == GGML_BACKEND_SPLIT_AXIS_NONE) {
|
||||
ret = src_ss[i];
|
||||
} else if (!split_states_equal(src_ss[i], ret)) {
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ret.axis == GGML_BACKEND_SPLIT_AXIS_NONE) {
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
if (scalar_only && ret.axis >= 0 && ret.axis < GGML_MAX_DIMS) {
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
ret = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
GGML_ASSERT(ret.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
|
||||
return ret;
|
||||
|
|
@ -571,42 +574,24 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
|
||||
auto handle_mul_mat = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
return {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, {1}, 1};
|
||||
}
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_1 && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
ggml_backend_meta_split_state ret = src_ss[0];
|
||||
ret.axis = GGML_BACKEND_SPLIT_AXIS_0;
|
||||
ret.nr[0] = 1;
|
||||
ret.n_segments = 1;
|
||||
return ret;
|
||||
}
|
||||
if (src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_1 && src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
ggml_backend_meta_split_state ret = src_ss[1];
|
||||
ret.n_segments = 1;
|
||||
return ret;
|
||||
return src_ss[1];
|
||||
}
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_0 && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_0) {
|
||||
GGML_ASSERT(split_states_equal(src_ss[0], src_ss[1]));
|
||||
return {assume_sync ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_PARTIAL, {0}, 1};
|
||||
return {assume_sync ? GGML_BACKEND_SPLIT_AXIS_MIRRORED : GGML_BACKEND_SPLIT_AXIS_PARTIAL, {0}, {1}, 1};
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
};
|
||||
|
||||
auto handle_cpy = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis >= 0 && src_ss[0].axis < GGML_MAX_DIMS) {
|
||||
int64_t ne_split_src = tensor->src[0]->ne[0];
|
||||
for (int dim = 1; dim <= src_ss[0].axis; dim++) {
|
||||
ne_split_src *= tensor->src[0]->ne[dim];
|
||||
}
|
||||
int64_t ne_split_dst = 1;
|
||||
for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
|
||||
ne_split_dst *= tensor->ne[dim];
|
||||
if (ne_split_dst == ne_split_src) {
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, 1};
|
||||
}
|
||||
}
|
||||
}
|
||||
return handle_generic(src_ss, /*scalar_only =*/ false);
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
};
|
||||
|
||||
auto handle_reshape = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
|
|
@ -615,33 +600,25 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
case GGML_BACKEND_SPLIT_AXIS_1:
|
||||
case GGML_BACKEND_SPLIT_AXIS_2:
|
||||
case GGML_BACKEND_SPLIT_AXIS_3: {
|
||||
GGML_ASSERT(!ggml_is_permuted(tensor) && !ggml_is_permuted(tensor->src[0]));
|
||||
if (src_ss[0].axis == ggml_n_dims(tensor->src[0]) - 1) {
|
||||
return {ggml_backend_meta_split_axis(ggml_n_dims(tensor) - 1), {0}, 1};
|
||||
GGML_ASSERT(src_ss[0].n_segments == 1);
|
||||
if (src_ss[0].axis == ggml_n_dims(tensor->src[0]) - 1 && src_ss[0].nr[0] == 1) {
|
||||
return {ggml_backend_meta_split_axis(ggml_n_dims(tensor) - 1), {0}, {1}, 1};
|
||||
}
|
||||
std::vector<int64_t> base_ne_in;
|
||||
base_ne_in.reserve(GGML_MAX_DIMS - src_ss[0].axis);
|
||||
{
|
||||
base_ne_in.push_back(1);
|
||||
int dim = 0;
|
||||
for (; dim <= src_ss[0].axis; dim++) {
|
||||
base_ne_in[0] *= tensor->src[0]->ne[dim];
|
||||
}
|
||||
for (; dim <= GGML_MAX_DIMS; dim++) {
|
||||
base_ne_in.push_back(base_ne_in.back() * tensor->src[0]->ne[dim]);
|
||||
}
|
||||
int64_t base_ne_in = tensor->src[0]->ne[0];
|
||||
for (int dim = 1; dim <= src_ss[0].axis; dim++) {
|
||||
base_ne_in *= tensor->src[0]->ne[dim];
|
||||
}
|
||||
base_ne_in /= src_ss[0].nr[0];
|
||||
int64_t base_ne_out = 1;
|
||||
for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
|
||||
const int64_t base_ne_out_next = base_ne_out *= tensor->ne[dim];
|
||||
for (const int64_t & bni : base_ne_in) {
|
||||
if (bni == base_ne_out_next) {
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, 1};
|
||||
}
|
||||
if (base_ne_out_next % base_ne_in == 0) {
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, {uint32_t(base_ne_out_next/base_ne_in)}, 1};
|
||||
}
|
||||
if (base_ne_out_next > base_ne_in[0]) {
|
||||
GGML_ASSERT(dim + 1 < GGML_MAX_DIMS);
|
||||
return {ggml_backend_meta_split_axis(dim + 1), {0}, 1};
|
||||
if (base_ne_out_next > base_ne_in) {
|
||||
GGML_ASSERT(src_ss[0].n_segments == 1);
|
||||
GGML_ASSERT(src_ss[0].nr[0] == 1);
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, {1}, 1};
|
||||
}
|
||||
base_ne_out = base_ne_out_next;
|
||||
}
|
||||
|
|
@ -653,11 +630,18 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
}
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto handle_cpy = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis >= 0 && src_ss[0].axis < GGML_MAX_DIMS) {
|
||||
return handle_reshape(src_ss);
|
||||
}
|
||||
return handle_generic(src_ss, /*scalar_only =*/ false);
|
||||
};
|
||||
|
||||
auto handle_view = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (ggml_is_contiguous(tensor) && ggml_is_contiguous(tensor->src[0])) {
|
||||
return handle_reshape(src_ss);
|
||||
|
|
@ -681,7 +665,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
if (!ggml_is_permuted(tensor) && !ggml_is_permuted(tensor->src[0]) && axis >= 0 && axis < GGML_MAX_DIMS-1) {
|
||||
for (int dim = 0; dim < GGML_MAX_DIMS-1; dim++) {
|
||||
if (tensor->nb[dim+1] == tensor->src[0]->nb[axis+1]) {
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, 1};
|
||||
return {ggml_backend_meta_split_axis(dim), {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
|
|
@ -690,7 +674,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
return src_ss[0];
|
||||
}
|
||||
GGML_ABORT("view of permuted tensor not implemented");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
};
|
||||
|
||||
auto handle_permute = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
|
|
@ -699,7 +683,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
case GGML_BACKEND_SPLIT_AXIS_1:
|
||||
case GGML_BACKEND_SPLIT_AXIS_2:
|
||||
case GGML_BACKEND_SPLIT_AXIS_3: {
|
||||
return {ggml_backend_meta_split_axis(tensor->op_params[src_ss[0].axis]), {0}, 1};
|
||||
GGML_ASSERT(src_ss[0].n_segments == 1 || src_ss[0].nr[0] == 1);
|
||||
return {ggml_backend_meta_split_axis(tensor->op_params[src_ss[0].axis]), {0}, {src_ss[0].nr[0]}, 1};
|
||||
}
|
||||
case GGML_BACKEND_SPLIT_AXIS_MIRRORED:
|
||||
case GGML_BACKEND_SPLIT_AXIS_PARTIAL: {
|
||||
|
|
@ -707,7 +692,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
}
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
@ -716,7 +701,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
switch (src_ss[0].axis) {
|
||||
case GGML_BACKEND_SPLIT_AXIS_0:
|
||||
case GGML_BACKEND_SPLIT_AXIS_1: {
|
||||
return {ggml_backend_meta_split_axis(int(src_ss[0].axis) ^ 1), {0}, 1};
|
||||
GGML_ASSERT(src_ss[0].n_segments == 1 || src_ss[0].nr[0] == 1);
|
||||
return {ggml_backend_meta_split_axis(int(src_ss[0].axis) ^ 1), {0}, {src_ss[0].nr[0]}, 1};
|
||||
}
|
||||
case GGML_BACKEND_SPLIT_AXIS_2:
|
||||
case GGML_BACKEND_SPLIT_AXIS_3:
|
||||
|
|
@ -726,7 +712,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
}
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
//return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
@ -764,16 +750,16 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
GGML_ASSERT( src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_2);
|
||||
GGML_ASSERT(tensor->src[4] == nullptr || src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED);
|
||||
GGML_ASSERT(tensor->src[4] == nullptr || src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_0);
|
||||
return {GGML_BACKEND_SPLIT_AXIS_1, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_1, {0}, {1}, 1};
|
||||
};
|
||||
|
||||
auto handle_ssm_conv = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis == src_ss[1].axis) {
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_0) {
|
||||
return {GGML_BACKEND_SPLIT_AXIS_1, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_1, {0}, {1}, 1};
|
||||
}
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_1) {
|
||||
return {GGML_BACKEND_SPLIT_AXIS_0, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_0, {0}, {1}, 1};
|
||||
}
|
||||
}
|
||||
return handle_generic(src_ss, /*scalar_only =*/ false);
|
||||
|
|
@ -781,8 +767,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
|
||||
auto handle_gated_delta_net = [&](const std::vector<ggml_backend_meta_split_state> & src_ss) -> ggml_backend_meta_split_state {
|
||||
if (src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[1].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED &&
|
||||
src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED &&
|
||||
src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
src_ss[2].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[3].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED &&
|
||||
src_ss[4].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED && src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
|
||||
return src_ss[0];
|
||||
}
|
||||
GGML_ASSERT(src_ss[0].axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
|
|
@ -793,12 +779,12 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
// state shape is (S_v*S_v*H, K, n_seqs); the heads dim is nested inside axis 0,
|
||||
// so a head-aligned split on the input cache reshapes to axis 0 here (not axis 2).
|
||||
GGML_ASSERT(src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_2 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_1 || src_ss[5].axis == GGML_BACKEND_SPLIT_AXIS_0);
|
||||
return {GGML_BACKEND_SPLIT_AXIS_0, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_0, {0}, {1}, 1};
|
||||
};
|
||||
|
||||
auto calculate_split_state = [&]() -> ggml_backend_meta_split_state {
|
||||
if (ggml_nelements(tensor) == 0) {
|
||||
return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
return {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
}
|
||||
if (ggml_backend_buffer_get_usage(tensor->buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE && tensor->view_src == nullptr) {
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(ggml_backend_buffer_get_type(tensor->buffer));
|
||||
|
|
@ -807,19 +793,21 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
if (ret.axis >= 0 && ret.axis <= GGML_MAX_DIMS) {
|
||||
const int64_t granularity = ret.axis == GGML_BACKEND_SPLIT_AXIS_0 ? ggml_blck_size(tensor->type) : 1;
|
||||
int64_t ne_sum = 0;
|
||||
for (size_t sj = 0; sj < ret.n_segments*n_bufs; sj++) {
|
||||
GGML_ASSERT(ret.ne[sj] % granularity == 0);
|
||||
ne_sum += ret.ne[sj];
|
||||
for (size_t s = 0; s < ret.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
GGML_ASSERT(ret.ne[s*n_bufs + j] % granularity == 0);
|
||||
ne_sum += ret.ne[s*n_bufs + j] * ret.nr[s];
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(ne_sum == tensor->ne[ret.axis]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::vector<ggml_backend_meta_split_state> src_ss(GGML_MAX_SRC, {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, 1});
|
||||
std::vector<ggml_backend_meta_split_state> src_ss(GGML_MAX_SRC, {GGML_BACKEND_SPLIT_AXIS_NONE, {0}, {1}, 1});
|
||||
for (size_t i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (tensor->src[i] == nullptr || tensor->src[i] == tensor) {
|
||||
src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
continue;
|
||||
}
|
||||
src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true);
|
||||
|
|
@ -829,7 +817,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
ggml_backend_meta_split_state split_state;
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_NONE: {
|
||||
split_state = {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, 1};
|
||||
split_state = {GGML_BACKEND_SPLIT_AXIS_MIRRORED, {0}, {1}, 1};
|
||||
} break;
|
||||
case GGML_OP_DUP: {
|
||||
split_state = handle_generic(src_ss, /*scalar_only =*/ true);
|
||||
|
|
@ -1016,7 +1004,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
} break;
|
||||
default: {
|
||||
GGML_ABORT("ggml op not implemented: %s", ggml_op_name(tensor->op));
|
||||
split_state = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
split_state = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, {1}, 1};
|
||||
} break;
|
||||
}
|
||||
if (split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS) {
|
||||
|
|
@ -1034,23 +1022,25 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
split_state.ne[s*n_bufs + j] = 0;
|
||||
}
|
||||
for (size_t s = 0; s < src_ss[i].n_segments; s++) {
|
||||
split_state.ne[j] += src_ss[i].ne[s*n_bufs + j];
|
||||
split_state.ne[j] += src_ss[i].ne[s*n_bufs + j] * src_ss[i].nr[s];
|
||||
}
|
||||
split_state.ne[j] *= tensor->ne[split_state.axis];
|
||||
if (split_state.ne[j] != 0 || tensor->src[i]->ne[src_ss[i].axis] != 0) {
|
||||
GGML_ASSERT(split_state.ne[j] % tensor->src[i]->ne[src_ss[i].axis] == 0);
|
||||
split_state.ne[j] /= tensor->src[i]->ne[src_ss[i].axis];
|
||||
const int64_t div = tensor->src[i]->ne[src_ss[i].axis] * split_state.nr[0];
|
||||
GGML_ASSERT(split_state.ne[j] % div == 0);
|
||||
split_state.ne[j] /= div;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
// Assert that ratio is consistent:
|
||||
int64_t sum = 0;
|
||||
for (size_t s = 0; s < src_ss[i].n_segments; s++) {
|
||||
sum += src_ss[i].ne[s*n_bufs + j];
|
||||
sum += src_ss[i].ne[s*n_bufs + j] * src_ss[i].nr[s];
|
||||
}
|
||||
// Assert that ratio is consistent:
|
||||
GGML_ASSERT(split_state.ne[j] * tensor->src[i]->ne[src_ss[i].axis]
|
||||
== sum * tensor->ne[split_state.axis]);
|
||||
GGML_ASSERT(split_state.ne[j]*split_state.nr[0] * tensor->src[i]->ne[src_ss[i].axis]
|
||||
== sum * tensor->ne[split_state.axis]);
|
||||
}
|
||||
}
|
||||
first_src_split_by_axis = false;
|
||||
|
|
@ -1080,13 +1070,14 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
srcs_info += ", ";
|
||||
}
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor->src[0], true);
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
const char * axis_name = ggml_backend_meta_split_axis_name(split_state.axis);
|
||||
std::string ne_info;
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
if (!ne_info.empty()) {
|
||||
ne_info += ", ";
|
||||
}
|
||||
ne_info += std::to_string(split_state.ne[j]);
|
||||
ne_info += std::to_string(split_state.ne[j]) + "x" + std::to_string(split_state.nr[0]);
|
||||
}
|
||||
srcs_info += std::string(tensor->src[i]->name) + "[" + ggml_op_name(tensor->src[i]->op) + ", " + axis_name + ", {" + ne_info + "}]";
|
||||
}
|
||||
|
|
@ -1095,7 +1086,8 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
if (!ne_info.empty()) {
|
||||
ne_info += ", ";
|
||||
}
|
||||
ne_info += std::to_string(buf_ctx->split_state_cache[key].first.ne[j]);
|
||||
const ggml_backend_meta_split_state & ss = buf_ctx->split_state_cache[key].first;
|
||||
ne_info += std::to_string(ss.ne[j]) + "x" + std::to_string(ss.nr[0]);
|
||||
}
|
||||
GGML_LOG_DEBUG("SPLIT_STATE: {%s} -> %s[%s, %s, {%s}]\n", srcs_info.c_str(), tensor->name, ggml_op_name(tensor->op),
|
||||
ggml_backend_meta_split_axis_name(buf_ctx->split_state_cache[key].first.axis), ne_info.c_str());
|
||||
|
|
@ -1107,8 +1099,10 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
|||
#ifndef NDEBUG
|
||||
if (ret.axis >= 0 && ret.axis < GGML_MAX_DIMS) {
|
||||
int64_t ne_ret = 0;
|
||||
for (size_t sj = 0; sj < ret.n_segments*n_bufs; sj++) {
|
||||
ne_ret += ret.ne[sj];
|
||||
for (size_t s = 0; s < ret.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ne_ret += ret.ne[s*n_bufs + j] * ret.nr[s];
|
||||
}
|
||||
}
|
||||
assert(ne_ret == tensor->ne[int(ret.axis)]);
|
||||
}
|
||||
|
|
@ -1155,7 +1149,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_m
|
|||
// GGML_ASSERT(ggml_is_contiguously_allocated(tensor));
|
||||
ne[split_dim] = 0;
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
ne[split_dim] += split_state.ne[s*n_simple_bufs + j];
|
||||
ne[split_dim] += split_state.ne[s*n_simple_bufs + j] * split_state.nr[s];
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (tensor->nb[i] > tensor->nb[split_dim]) {
|
||||
|
|
@ -1229,7 +1223,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_m
|
|||
for (size_t j = 0; j < n_simple_bufs; j++) {
|
||||
int64_t ne_sum = 0;
|
||||
for (size_t s = 0; s < split_state_src.n_segments; s++) {
|
||||
ne_sum += split_state_src.ne[s*n_simple_bufs + j];
|
||||
ne_sum += split_state_src.ne[s*n_simple_bufs + j] * split_state_src.nr[s];
|
||||
}
|
||||
if (ne_sum == 0) {
|
||||
simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
|
||||
|
|
@ -1255,8 +1249,9 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
|||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
|
||||
if (split_state.n_segments != 1) {
|
||||
if (split_state.n_segments != 1 || split_state.nr[0] != 1) {
|
||||
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
|
||||
GGML_ASSERT(split_state.nr[0] != 0);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
size_t offset_data = 0;
|
||||
|
|
@ -1267,24 +1262,26 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
|||
const size_t row_stride = tensor->nb[1];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
|
||||
const int64_t row_start = offset / row_stride;
|
||||
const int64_t row_count = size / row_stride;
|
||||
GGML_ASSERT(row_start + row_count <= tensor->ne[1]);
|
||||
|
||||
const int64_t blck_size = ggml_blck_size(tensor->type);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
|
||||
r_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
for (size_t r = 0; r < split_state.nr[s]; r++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + row_start * simple_tensor->nb[1], nbytes,
|
||||
row_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
GGML_ASSERT(offset_data*row_count == size);
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
|
|
@ -1292,22 +1289,24 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
|||
const size_t row_stride = tensor->nb[2];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
|
||||
const int64_t row_start = offset / row_stride;
|
||||
const int64_t row_count = size / row_stride;
|
||||
GGML_ASSERT(row_start + row_count <= tensor->ne[2]);
|
||||
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
|
||||
r_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
for (size_t r = 0; r < split_state.nr[s]; r++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + row_start * simple_tensor->nb[2], nbytes,
|
||||
row_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
GGML_ASSERT(offset_data*row_count == size);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1365,8 +1364,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
|||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
|
||||
if (split_state.n_segments != 1) {
|
||||
if (split_state.n_segments != 1 || split_state.nr[0] != 1) {
|
||||
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
|
||||
GGML_ASSERT(split_state.nr[0] != 0);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
size_t offset_data = 0;
|
||||
|
|
@ -1377,24 +1377,26 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
|||
const size_t row_stride = tensor->nb[1];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
|
||||
const int64_t row_start = offset / row_stride;
|
||||
const int64_t row_count = size / row_stride;
|
||||
GGML_ASSERT(row_start + row_count <= tensor->ne[1]);
|
||||
|
||||
const int64_t blck_size = ggml_blck_size(tensor->type);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
|
||||
r_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
for (size_t r = 0; r < split_state.nr[s]; r++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + row_start * simple_tensor->nb[1], nbytes,
|
||||
row_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
GGML_ASSERT(offset_data*row_count == size);
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
|
|
@ -1402,22 +1404,24 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
|||
const size_t row_stride = tensor->nb[2];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
|
||||
const int64_t row_start = offset / row_stride;
|
||||
const int64_t row_count = size / row_stride;
|
||||
GGML_ASSERT(row_start + row_count <= tensor->ne[2]);
|
||||
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
|
||||
r_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
for (size_t r = 0; r < split_state.nr[s]; r++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + row_start * simple_tensor->nb[2], nbytes,
|
||||
row_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
GGML_ASSERT(offset_data*row_count == size);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1675,6 +1679,7 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
|
|||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
GGML_ASSERT(split_state.nr[0] == 1);
|
||||
|
||||
switch (split_state.axis) {
|
||||
case GGML_BACKEND_SPLIT_AXIS_0:
|
||||
|
|
@ -1719,6 +1724,7 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
|
|||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ false);
|
||||
GGML_ASSERT(split_state.n_segments == 1);
|
||||
GGML_ASSERT(split_state.nr[0] == 1);
|
||||
|
||||
switch (split_state.axis) {
|
||||
case GGML_BACKEND_SPLIT_AXIS_0:
|
||||
|
|
|
|||
Loading…
Reference in New Issue