[Beignet] [PATCH] Add kernels performance output

Yongjia Zhang zhang_yong_jia at 126.com
Tue Mar 18 22:24:25 PDT 2014


if environment variable OCL_OUTPUT_KERNEL_PERF is set non-zero,
then after the executable program exits, beignet will output the
time information of each kernel executed.

Signed-off-by:Yongjia Zhang<yongjia.zhang at intel.com>
---
 src/CMakeLists.txt     |   3 +-
 src/cl_api.c           |  23 ++++-
 src/cl_command_queue.c |   5 +-
 src/performance.c      | 254 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/performance.h      |  12 +++
 5 files changed, 294 insertions(+), 3 deletions(-)
 create mode 100644 src/performance.c
 create mode 100644 src/performance.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 95ff56f..4c34235 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -47,7 +47,8 @@ set(OPENCL_SRC
     intel/intel_batchbuffer.c
     intel/intel_driver.c
     x11/dricommon.c 
-    x11/va_dri2.c)
+    x11/va_dri2.c
+    performance.c)
 
 if (EGL_FOUND AND MESA_SOURCE_FOUND)
 set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
diff --git a/src/cl_api.c b/src/cl_api.c
index 9638994..f670f13 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -40,6 +40,8 @@
 #include <assert.h>
 #include <unistd.h>
 
+#include "performance.h"
+
 #ifndef CL_VERSION_1_2
 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
 #define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
@@ -293,6 +295,7 @@ clCreateContext(const cl_context_properties *  properties,
                            pfn_notify,
                            user_data,
                            &err);
+  initialize_env_var();
 error:
   if (errcode_ret)
     *errcode_ret = err;
@@ -1676,6 +1679,10 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
 
     err = cl_command_queue_flush(command_queue);
   }
+
+  if(b_output_kernel_perf)
+	  time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", command_queue);
+
   return 0;
 
 error:
@@ -1777,6 +1784,9 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", command_queue);
+
 error:
   return err;
 }
@@ -2016,6 +2026,9 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", command_queue);
+
 error:
   return err;
 }
@@ -2077,6 +2090,9 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", command_queue);
+
 error:
   return err;
 }
@@ -2138,6 +2154,9 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", command_queue);
+
 error:
   return err;
 }
@@ -2526,7 +2545,9 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
 
     err = cl_command_queue_flush(command_queue);
   }
-
+  
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue);
 error:
   return err;
 }
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4ac2e11..7eff14c 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -28,6 +28,7 @@
 #include "cl_alloc.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
+#include "performance.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -376,6 +377,8 @@ cl_command_queue_ND_range(cl_command_queue queue,
                           const size_t *global_wk_sz,
                           const size_t *local_wk_sz)
 {
+  if(b_output_kernel_perf)
+    time_start(queue->ctx, cl_kernel_get_name(k), queue);
   const int32_t ver = cl_driver_get_ver(queue->ctx->drv);
   cl_int err = CL_SUCCESS;
 
@@ -408,7 +411,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
     TRY (cl_fulsim_read_all_surfaces, queue, k);
   }
 #endif /* USE_FULSIM */
-
+ 
 error:
   return err;
 }
diff --git a/src/performance.c b/src/performance.c
new file mode 100644
index 0000000..c4f40a6
--- /dev/null
+++ b/src/performance.c
@@ -0,0 +1,254 @@
+#include <performance.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+#define MAX_KERNEL_NAME_LENGTH 100
+#define MAX_KERNEL_EXECUTION_COUNT 100000
+
+typedef struct kernel_storage_node
+{
+  char kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+  int current_count;
+  float kernel_sum_time;
+  struct kernel_storage_node *next;
+} kernel_storage_node;
+
+typedef struct context_storage_node
+{
+  uint64_t context_id;
+  kernel_storage_node *kernels_storage;
+  char max_time_kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_max_time;
+  int kernel_count;
+  struct context_storage_node *next;
+} context_storage_node;
+
+typedef struct storage
+{
+  context_storage_node * context_storage;
+} storage;
+
+
+
+static storage record;
+static int atexit_registered = 0;
+
+
+static context_storage_node * prev_context_pointer = NULL;
+static kernel_storage_node * prev_kernel_pointer = NULL;
+
+static context_storage_node * find_context(cl_context context)
+{
+  if(NULL != prev_context_pointer )
+  {
+    if(prev_context_pointer->context_id == (uint64_t)context)
+      return prev_context_pointer;
+  }
+
+  if(NULL == record.context_storage)
+  {
+    record.context_storage = (context_storage_node *) malloc(sizeof(context_storage_node));
+    record.context_storage->context_id = (uint64_t)context;
+    record.context_storage->kernels_storage = NULL;
+    record.context_storage->kernel_max_time = 0.0f;
+    record.context_storage->next = NULL;
+    record.context_storage->kernel_count = 0;
+    return record.context_storage;
+  }
+
+  context_storage_node *pre = record.context_storage;
+  context_storage_node *cur = record.context_storage;
+  while(NULL !=cur && (uint64_t)context != cur->context_id )
+  {
+    pre = cur;
+    cur = cur->next;
+  }
+  if(NULL != cur)
+    return cur;
+
+  pre->next = (context_storage_node *)malloc(sizeof(context_storage_node));
+  pre = pre->next;
+  pre->context_id = (uint64_t)context;
+  pre->kernels_storage = NULL;
+  pre->kernel_max_time = 0.0f;
+  pre->next = NULL;
+  pre->kernel_count = 0;
+  return pre;
+}
+
+static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name)
+{
+  if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
+     p_context == prev_context_pointer && 
+     !strcmp(kernel_name, prev_kernel_pointer->kernel_name))
+    return prev_kernel_pointer;
+
+  if(NULL == p_context)
+    return NULL;
+
+  if(NULL == p_context->kernels_storage)
+  {
+    p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+    p_context->kernel_count++;
+    strcpy(p_context->kernels_storage->kernel_name,kernel_name);
+    p_context->kernels_storage->current_count = 0;
+    p_context->kernels_storage->kernel_sum_time = 0.0f;
+    p_context->kernels_storage->next = NULL;
+    return p_context->kernels_storage;
+  }
+  kernel_storage_node *pre = p_context->kernels_storage;
+  kernel_storage_node *cur = p_context->kernels_storage;
+  while(NULL != cur && strcmp(cur->kernel_name, kernel_name))
+  {
+    pre = cur;
+    cur = cur->next;
+  }
+  if(NULL != cur)
+  {
+    return cur;
+  }
+  p_context->kernel_count++;
+  pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+  pre = pre->next;
+  pre->current_count = 0;
+  pre->kernel_sum_time = 0.0f;
+  pre->next = NULL;
+  strcpy(pre->kernel_name, kernel_name);
+  return pre;
+}
+
+static void free_storage()
+{
+  context_storage_node *p_context = record.context_storage;
+  while(NULL != p_context)
+  {
+    context_storage_node *p_tmp_context = p_context->next;
+    kernel_storage_node *p_kernel = p_context->kernels_storage;
+    while(NULL != p_kernel)
+    {
+      kernel_storage_node *p_tmp_kernel = p_kernel->next;
+      free(p_kernel);
+      p_kernel = p_tmp_kernel;
+    }
+    free(p_context);
+    p_context = p_tmp_context;
+  }
+}
+
+typedef struct time_element
+{
+  char kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_sum_time;
+} time_element;
+
+static int cmp(const void *a, const void *b)
+{
+  if(((time_element *)a)->kernel_sum_time < ((time_element *)b)->kernel_sum_time)
+    return 1;
+  else if(((time_element *)a)->kernel_sum_time > ((time_element *)b)->kernel_sum_time)
+    return -1;
+  else
+    return 0;
+}
+
+static void print_time_info()
+{
+  context_storage_node *p_context = record.context_storage;
+  if(NULL == p_context)
+  {
+    printf("Nothing to output !\n");
+    return;
+  }
+
+  int tmp_context_id = 0;
+  while(NULL != p_context)
+  {
+    printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
+    printf("  ->>>> KERNELS TIME SUMMARY <<<<-\n");
+    kernel_storage_node *p_kernel = p_context->kernels_storage;
+    kernel_storage_node *p_tmp_kernel = p_kernel;
+    time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
+    int i = 0;
+    while(NULL != p_tmp_kernel)
+    {
+      strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name);
+      te[i++].kernel_sum_time = p_tmp_kernel->kernel_sum_time;
+      p_tmp_kernel = p_tmp_kernel->next;
+    }
+    float sum_time = 0.0f;
+    qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
+    for(i=0; i<p_context->kernel_count; ++i)
+    {
+      sum_time += te[i].kernel_sum_time;
+      printf("    [Kernel Name : %s   Time : %.2f]\n", te[i].kernel_name, te[i].kernel_sum_time);
+    }
+    free(te);
+    printf("    Total : %.2f\n", sum_time);
+    p_tmp_kernel = p_kernel;
+    printf("\n  ->>>> KERNELS TIME DETAIL <<<<-\n");
+    while(NULL != p_kernel)
+    {
+      printf("    [Kernel Name : %s   Time : %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
+      for(i=0; i!=p_kernel->current_count; ++i)
+        printf("      Execution Round %d : %.2f\n", i+1, p_kernel->kernel_times[i]);
+        p_kernel = p_kernel->next;
+    }
+    printf("[------------  CONTEXT ENDS------------]\n\n");
+    p_context = p_context->next;
+  }
+  free_storage();
+}
+
+
+static void insert(cl_context context, const char *kernel_name, float time)
+{
+  if(!atexit_registered)
+  {
+    atexit_registered = 1;
+    atexit(print_time_info);
+  }
+  context_storage_node *p_context = find_context(context);
+  kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name);
+  prev_context_pointer = p_context;
+  prev_kernel_pointer = p_kernel;
+  p_kernel->kernel_times[p_kernel->current_count++] = time;
+  p_kernel->kernel_sum_time += time;
+  if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
+  {
+    p_context->kernel_max_time = p_kernel->kernel_sum_time;
+    strcpy(p_context->max_time_kernel_name, kernel_name);
+  }
+}
+
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+int b_output_kernel_perf = 0;
+static struct timeval start, end;
+
+void initialize_env_var()
+{
+  char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
+  if(NULL == env || !strcmp(env,"0"))
+    b_output_kernel_perf = 0;
+  else
+    b_output_kernel_perf = 1;
+}
+
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq)
+{
+  pthread_mutex_lock(&mutex);
+  gettimeofday(&start, NULL);
+}
+
+void time_end(cl_context context, const char * kernel_name, cl_command_queue cq)
+{
+  clFinish(cq);  
+  gettimeofday(&end, NULL);
+  float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
+  insert(context, kernel_name, t);
+  pthread_mutex_unlock(&mutex);
+}
diff --git a/src/performance.h b/src/performance.h
new file mode 100644
index 0000000..c747743
--- /dev/null
+++ b/src/performance.h
@@ -0,0 +1,12 @@
+#ifndef __PERFORMANCE_H__
+#define __PERFORMANCE_H__
+#include "CL/cl.h"
+
+
+extern int b_output_kernel_perf;
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
+void time_end(cl_context context, const char * kernel_name, cl_command_queue cq);
+void initialize_env_var();
+
+
+#endif
-- 
1.8.3.2




More information about the Beignet mailing list