CUDA: skip fusion for repeating adds in bias (llama/17080)
This commit is contained in:
parent
0caa32c772
commit
522b9bce33
|
|
@@ -124,6 +124,7 @@ if (CUDAToolkit_FOUND)
|
||||||
|
|
||||||
if (GGML_CUDA_DEBUG)
|
if (GGML_CUDA_DEBUG)
|
||||||
list(APPEND CUDA_FLAGS -lineinfo)
|
list(APPEND CUDA_FLAGS -lineinfo)
|
||||||
|
add_compile_definitions(GGML_CUDA_DEBUG)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||||
|
|
|
||||||
|
|
@@ -3152,8 +3152,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||||
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_tensor * node = cgraph->nodes[i];
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
|
|
||||||
#ifdef GGML_CUDA_DEBUG
|
#ifdef GGML_CUDA_DEBUG
|
||||||
const int nodes_fused = i - prev_i - 1;
|
const int nodes_fused = i - prev_i - 1;
|
||||||
prev_i = i;
|
prev_i = i;
|
||||||
|
|
@@ -3302,6 +3300,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// fusion does not support repeating (broadcast) bias adds: skip it unless both operands of each bias ADD have the same shape
|
||||||
|
if (bias_op == GGML_OP_ADD &&
|
||||||
|
(!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
|
||||||
|
!ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
const ggml_tensor * src0 = up_n->src[0];
|
const ggml_tensor * src0 = up_n->src[0];
|
||||||
const ggml_tensor * src1 = up_n->src[1];
|
const ggml_tensor * src1 = up_n->src[1];
|
||||||
const ggml_tensor * ids = up_n->src[2];
|
const ggml_tensor * ids = up_n->src[2];
|
||||||
|
|
@@ -3411,6 +3416,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||||
fusion_data.x_bias = bias_tensor;
|
fusion_data.x_bias = bias_tensor;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue