[Beignet] [PATCH] Add kernels performance output
Yongjia Zhang
zhang_yong_jia at 126.com
Tue Mar 18 22:24:25 PDT 2014
if environment variable OCL_OUTPUT_KERNEL_PERF is set non-zero,
then after the executable program exits, beignet will output the
time information of each kernel executed.
Signed-off-by:Yongjia Zhang<yongjia.zhang at intel.com>
---
src/CMakeLists.txt | 3 +-
src/cl_api.c | 23 ++++-
src/cl_command_queue.c | 5 +-
src/performance.c | 254 +++++++++++++++++++++++++++++++++++++++++++++++++
src/performance.h | 12 +++
5 files changed, 294 insertions(+), 3 deletions(-)
create mode 100644 src/performance.c
create mode 100644 src/performance.h
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 95ff56f..4c34235 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -47,7 +47,8 @@ set(OPENCL_SRC
intel/intel_batchbuffer.c
intel/intel_driver.c
x11/dricommon.c
- x11/va_dri2.c)
+ x11/va_dri2.c
+ performance.c)
if (EGL_FOUND AND MESA_SOURCE_FOUND)
set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
diff --git a/src/cl_api.c b/src/cl_api.c
index 9638994..f670f13 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -40,6 +40,8 @@
#include <assert.h>
#include <unistd.h>
+#include "performance.h"
+
#ifndef CL_VERSION_1_2
#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
@@ -293,6 +295,7 @@ clCreateContext(const cl_context_properties * properties,
pfn_notify,
user_data,
&err);
+ initialize_env_var();
error:
if (errcode_ret)
*errcode_ret = err;
@@ -1676,6 +1679,10 @@ clEnqueueCopyBuffer(cl_command_queue command_queue,
err = cl_command_queue_flush(command_queue);
}
+
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", command_queue);
+
return 0;
error:
@@ -1777,6 +1784,9 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue,
err = cl_command_queue_flush(command_queue);
}
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", command_queue);
+
error:
return err;
}
@@ -2016,6 +2026,9 @@ clEnqueueCopyImage(cl_command_queue command_queue,
err = cl_command_queue_flush(command_queue);
}
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", command_queue);
+
error:
return err;
}
@@ -2077,6 +2090,9 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
err = cl_command_queue_flush(command_queue);
}
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", command_queue);
+
error:
return err;
}
@@ -2138,6 +2154,9 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue,
err = cl_command_queue_flush(command_queue);
}
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", command_queue);
+
error:
return err;
}
@@ -2526,7 +2545,9 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
err = cl_command_queue_flush(command_queue);
}
-
+
+ if(b_output_kernel_perf)
+ time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue);
error:
return err;
}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4ac2e11..7eff14c 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -28,6 +28,7 @@
#include "cl_alloc.h"
#include "cl_driver.h"
#include "cl_khr_icd.h"
+#include "performance.h"
#include <assert.h>
#include <stdio.h>
@@ -376,6 +377,8 @@ cl_command_queue_ND_range(cl_command_queue queue,
const size_t *global_wk_sz,
const size_t *local_wk_sz)
{
+ if(b_output_kernel_perf)
+ time_start(queue->ctx, cl_kernel_get_name(k), queue);
const int32_t ver = cl_driver_get_ver(queue->ctx->drv);
cl_int err = CL_SUCCESS;
@@ -408,7 +411,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
TRY (cl_fulsim_read_all_surfaces, queue, k);
}
#endif /* USE_FULSIM */
-
+
error:
return err;
}
diff --git a/src/performance.c b/src/performance.c
new file mode 100644
index 0000000..c4f40a6
--- /dev/null
+++ b/src/performance.c
@@ -0,0 +1,254 @@
+#include <performance.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+#define MAX_KERNEL_NAME_LENGTH 100
+#define MAX_KERNEL_EXECUTION_COUNT 100000
+
+typedef struct kernel_storage_node
+{
+ char kernel_name[MAX_KERNEL_NAME_LENGTH];
+ float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+ int current_count;
+ float kernel_sum_time;
+ struct kernel_storage_node *next;
+} kernel_storage_node;
+
+typedef struct context_storage_node
+{
+ uint64_t context_id;
+ kernel_storage_node *kernels_storage;
+ char max_time_kernel_name[MAX_KERNEL_NAME_LENGTH];
+ float kernel_max_time;
+ int kernel_count;
+ struct context_storage_node *next;
+} context_storage_node;
+
+typedef struct storage
+{
+ context_storage_node * context_storage;
+} storage;
+
+
+
+static storage record;
+static int atexit_registered = 0;
+
+
+static context_storage_node * prev_context_pointer = NULL;
+static kernel_storage_node * prev_kernel_pointer = NULL;
+
+static context_storage_node * find_context(cl_context context)
+{
+ if(NULL != prev_context_pointer )
+ {
+ if(prev_context_pointer->context_id == (uint64_t)context)
+ return prev_context_pointer;
+ }
+
+ if(NULL == record.context_storage)
+ {
+ record.context_storage = (context_storage_node *) malloc(sizeof(context_storage_node));
+ record.context_storage->context_id = (uint64_t)context;
+ record.context_storage->kernels_storage = NULL;
+ record.context_storage->kernel_max_time = 0.0f;
+ record.context_storage->next = NULL;
+ record.context_storage->kernel_count = 0;
+ return record.context_storage;
+ }
+
+ context_storage_node *pre = record.context_storage;
+ context_storage_node *cur = record.context_storage;
+ while(NULL !=cur && (uint64_t)context != cur->context_id )
+ {
+ pre = cur;
+ cur = cur->next;
+ }
+ if(NULL != cur)
+ return cur;
+
+ pre->next = (context_storage_node *)malloc(sizeof(context_storage_node));
+ pre = pre->next;
+ pre->context_id = (uint64_t)context;
+ pre->kernels_storage = NULL;
+ pre->kernel_max_time = 0.0f;
+ pre->next = NULL;
+ pre->kernel_count = 0;
+ return pre;
+}
+
+static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name)
+{
+ if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
+ p_context == prev_context_pointer &&
+ !strcmp(kernel_name, prev_kernel_pointer->kernel_name))
+ return prev_kernel_pointer;
+
+ if(NULL == p_context)
+ return NULL;
+
+ if(NULL == p_context->kernels_storage)
+ {
+ p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+ p_context->kernel_count++;
+ strcpy(p_context->kernels_storage->kernel_name,kernel_name);
+ p_context->kernels_storage->current_count = 0;
+ p_context->kernels_storage->kernel_sum_time = 0.0f;
+ p_context->kernels_storage->next = NULL;
+ return p_context->kernels_storage;
+ }
+ kernel_storage_node *pre = p_context->kernels_storage;
+ kernel_storage_node *cur = p_context->kernels_storage;
+ while(NULL != cur && strcmp(cur->kernel_name, kernel_name))
+ {
+ pre = cur;
+ cur = cur->next;
+ }
+ if(NULL != cur)
+ {
+ return cur;
+ }
+ p_context->kernel_count++;
+ pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+ pre = pre->next;
+ pre->current_count = 0;
+ pre->kernel_sum_time = 0.0f;
+ pre->next = NULL;
+ strcpy(pre->kernel_name, kernel_name);
+ return pre;
+}
+
+static void free_storage()
+{
+ context_storage_node *p_context = record.context_storage;
+ while(NULL != p_context)
+ {
+ context_storage_node *p_tmp_context = p_context->next;
+ kernel_storage_node *p_kernel = p_context->kernels_storage;
+ while(NULL != p_kernel)
+ {
+ kernel_storage_node *p_tmp_kernel = p_kernel->next;
+ free(p_kernel);
+ p_kernel = p_tmp_kernel;
+ }
+ free(p_context);
+ p_context = p_tmp_context;
+ }
+}
+
+typedef struct time_element
+{
+ char kernel_name[MAX_KERNEL_NAME_LENGTH];
+ float kernel_sum_time;
+} time_element;
+
+static int cmp(const void *a, const void *b)
+{
+ if(((time_element *)a)->kernel_sum_time < ((time_element *)b)->kernel_sum_time)
+ return 1;
+ else if(((time_element *)a)->kernel_sum_time > ((time_element *)b)->kernel_sum_time)
+ return -1;
+ else
+ return 0;
+}
+
+static void print_time_info()
+{
+ context_storage_node *p_context = record.context_storage;
+ if(NULL == p_context)
+ {
+ printf("Nothing to output !\n");
+ return;
+ }
+
+ int tmp_context_id = 0;
+ while(NULL != p_context)
+ {
+ printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
+ printf(" ->>>> KERNELS TIME SUMMARY <<<<-\n");
+ kernel_storage_node *p_kernel = p_context->kernels_storage;
+ kernel_storage_node *p_tmp_kernel = p_kernel;
+ time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
+ int i = 0;
+ while(NULL != p_tmp_kernel)
+ {
+ strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name);
+ te[i++].kernel_sum_time = p_tmp_kernel->kernel_sum_time;
+ p_tmp_kernel = p_tmp_kernel->next;
+ }
+ float sum_time = 0.0f;
+ qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
+ for(i=0; i<p_context->kernel_count; ++i)
+ {
+ sum_time += te[i].kernel_sum_time;
+ printf(" [Kernel Name : %s Time : %.2f]\n", te[i].kernel_name, te[i].kernel_sum_time);
+ }
+ free(te);
+ printf(" Total : %.2f\n", sum_time);
+ p_tmp_kernel = p_kernel;
+ printf("\n ->>>> KERNELS TIME DETAIL <<<<-\n");
+ while(NULL != p_kernel)
+ {
+ printf(" [Kernel Name : %s Time : %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
+ for(i=0; i!=p_kernel->current_count; ++i)
+ printf(" Execution Round %d : %.2f\n", i+1, p_kernel->kernel_times[i]);
+ p_kernel = p_kernel->next;
+ }
+ printf("[------------ CONTEXT ENDS------------]\n\n");
+ p_context = p_context->next;
+ }
+ free_storage();
+}
+
+
+static void insert(cl_context context, const char *kernel_name, float time)
+{
+ if(!atexit_registered)
+ {
+ atexit_registered = 1;
+ atexit(print_time_info);
+ }
+ context_storage_node *p_context = find_context(context);
+ kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name);
+ prev_context_pointer = p_context;
+ prev_kernel_pointer = p_kernel;
+ p_kernel->kernel_times[p_kernel->current_count++] = time;
+ p_kernel->kernel_sum_time += time;
+ if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
+ {
+ p_context->kernel_max_time = p_kernel->kernel_sum_time;
+ strcpy(p_context->max_time_kernel_name, kernel_name);
+ }
+}
+
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+int b_output_kernel_perf = 0;
+static struct timeval start, end;
+
+void initialize_env_var()
+{
+ char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
+ if(NULL == env || !strcmp(env,"0"))
+ b_output_kernel_perf = 0;
+ else
+ b_output_kernel_perf = 1;
+}
+
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq)
+{
+ pthread_mutex_lock(&mutex);
+ gettimeofday(&start, NULL);
+}
+
+void time_end(cl_context context, const char * kernel_name, cl_command_queue cq)
+{
+ clFinish(cq);
+ gettimeofday(&end, NULL);
+ float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
+ insert(context, kernel_name, t);
+ pthread_mutex_unlock(&mutex);
+}
diff --git a/src/performance.h b/src/performance.h
new file mode 100644
index 0000000..c747743
--- /dev/null
+++ b/src/performance.h
@@ -0,0 +1,12 @@
+#ifndef __PERFORMANCE_H__
+#define __PERFORMANCE_H__
+#include "CL/cl.h"
+
+
+extern int b_output_kernel_perf;
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
+void time_end(cl_context context, const char * kernel_name, cl_command_queue cq);
+void initialize_env_var();
+
+
+#endif
--
1.8.3.2
More information about the Beignet
mailing list