opencl: batch profiling to improve speed and prevent memory leaks (llama/23495)

This commit is contained in:
shaofeiqi 2026-05-23 23:11:43 -07:00 committed by Georgi Gerganov
parent b84d03487c
commit 1435988ab3
1 changed file with 28 additions and 8 deletions

View File

@ -661,11 +661,10 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mm_iq4_nl_f32_l4_lm;
std::vector<ProfilingInfo> profiling_info;
std::vector<ProfilingInfo> profiling_results;
void write_profiling_info() {
FILE * fperf = fopen("cl_profiling.csv", "w");
if (!fperf) {
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
void flush_profiling_batch() {
if (profiling_info.empty()) {
return;
}
@ -689,6 +688,7 @@ struct ggml_backend_opencl_context {
CL_CHECK(clGetEventProfilingInfo(
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
CL_CHECK(clReleaseEvent(info.evt));
info.evt = nullptr;
char kernel_name[512];
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
@ -706,10 +706,26 @@ struct ggml_backend_opencl_context {
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
}
profiling_results.insert(profiling_results.end(),
std::make_move_iterator(profiling_info.begin()),
std::make_move_iterator(profiling_info.end()));
profiling_info.clear();
}
void write_profiling_info() {
if (profiling_results.empty()) {
return;
}
// Dump a csv
FILE * fperf = fopen("cl_profiling.csv", "w");
if (!fperf) {
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
return;
}
fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n");
for (const ProfilingInfo & info : profiling_info) {
for (const ProfilingInfo & info : profiling_results) {
fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
info.op_name.c_str(), info.kernel_name.c_str(),
info.cmd_duration_ns/1.e6f,
@ -720,14 +736,14 @@ struct ggml_backend_opencl_context {
fclose(fperf);
// Dump a simple chrome trace
FILE* ftrace = fopen("cl_trace.json", "w");
FILE * ftrace = fopen("cl_trace.json", "w");
if (!ftrace) {
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
return;
}
fprintf(ftrace, "[\n");
for (const ProfilingInfo & info : profiling_info) {
for (const ProfilingInfo & info : profiling_results) {
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
info.kernel_name.c_str(), info.cmd_queued/1000);
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
@ -738,6 +754,7 @@ struct ggml_backend_opencl_context {
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
info.kernel_name.c_str(), info.cmd_end/1000);
}
fprintf(ftrace, "]\n");
fclose(ftrace);
}
@ -758,6 +775,9 @@ struct ggml_backend_opencl_context {
profiling_info.emplace_back();
populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
if (profiling_info.size() >= 2048) {
flush_profiling_batch();
}
#else
GGML_UNUSED(tensor);
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
@ -804,7 +824,7 @@ struct ggml_backend_opencl_context {
if (ref_count == 0) {
#ifdef GGML_OPENCL_PROFILING
write_profiling_info();
profiling_info.clear();
profiling_results.clear();
#endif
}
}