[Beignet] [PATCH v2] Fix the same kernel name issue of OCL_OUTPUT_KERNEL_PERF

Yongjia Zhang zhang_yong_jia at 126.com
Mon Jun 23 08:09:33 PDT 2014


Now it treats kernels with same kernel name and different build
options separately. When OCL_OUTPUT_KERNEL_PERF==1, it outputs the
time summary as before, but if OCL_OUTPUT_KERNEL_PERF==2, it will
output the time details including the kernel build options and
kernels with same kernel name but different build options will
output separately.

v2: use strncmp and strncpy instead of strcmp and strcpy.

Signed-off-by: Yongjia Zhang <yongjia.zhang at intel.com>
---
 src/cl_api.c      |  17 +++++---
 src/performance.c | 113 ++++++++++++++++++++++++++++++++++++++----------------
 src/performance.h |   2 +-
 3 files changed, 92 insertions(+), 40 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c
index 6206e10..8856f5f 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1681,7 +1681,7 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
   }
 
   if(b_output_kernel_perf)
-	  time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", command_queue);
+	  time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "\0", command_queue);
 
   return 0;
 
@@ -1785,7 +1785,7 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
   }
 
   if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", command_queue);
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "\0", command_queue);
 
 error:
   return err;
@@ -2027,7 +2027,7 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
   }
 
   if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", command_queue);
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "\0", command_queue);
 
 error:
   return err;
@@ -2091,7 +2091,7 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
   }
 
   if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", command_queue);
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "\0", command_queue);
 
 error:
   return err;
@@ -2155,7 +2155,7 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
   }
 
   if(b_output_kernel_perf)
-    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", command_queue);
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "\0", command_queue);
 
 error:
   return err;
@@ -2556,7 +2556,12 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
   }
 
   if(b_output_kernel_perf)
-    time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue);
+  {
+    if(kernel->program->build_opts != NULL)
+      time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
+    else
+      time_end(command_queue->ctx, cl_kernel_get_name(kernel), "\0", command_queue);
+  }
 error:
   return err;
 }
diff --git a/src/performance.c b/src/performance.c
index a785460..85cd481 100644
--- a/src/performance.c
+++ b/src/performance.c
@@ -8,11 +8,12 @@
 
 #define MAX_KERNEL_NAME_LENGTH 100
 #define MAX_KERNEL_EXECUTION_COUNT 100000
-
+#define MAX_KERNEL_BUILD_OPT 1000
 typedef struct kernel_storage_node
 {
   char kernel_name[MAX_KERNEL_NAME_LENGTH];
   float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+  char build_option[MAX_KERNEL_BUILD_OPT];
   int current_count;
   float kernel_sum_time;
   struct kernel_storage_node *next;
@@ -81,11 +82,12 @@ static context_storage_node * find_context(cl_context context)
   return pre;
 }
 
-static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name)
+static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name, const char *build_opt)
 {
   if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
      p_context == prev_context_pointer &&
-     !strcmp(kernel_name, prev_kernel_pointer->kernel_name))
+     !strncmp(kernel_name, prev_kernel_pointer->kernel_name, MAX_KERNEL_NAME_LENGTH) &&
+     !strncmp(build_opt, prev_kernel_pointer->build_option, MAX_KERNEL_BUILD_OPT))
     return prev_kernel_pointer;
 
   if(NULL == p_context)
@@ -95,30 +97,38 @@ static kernel_storage_node * find_kernel(context_storage_node *p_context, const
   {
     p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
     p_context->kernel_count++;
-    strcpy(p_context->kernels_storage->kernel_name,kernel_name);
+    strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH);
+    p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+    strncpy(p_context->kernels_storage->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+    p_context->kernels_storage->build_option[MAX_KERNEL_BUILD_OPT - 1] = '\0';
     p_context->kernels_storage->current_count = 0;
     p_context->kernels_storage->kernel_sum_time = 0.0f;
     p_context->kernels_storage->next = NULL;
     return p_context->kernels_storage;
   }
+
   kernel_storage_node *pre = p_context->kernels_storage;
   kernel_storage_node *cur = p_context->kernels_storage;
-  while(NULL != cur && strcmp(cur->kernel_name, kernel_name))
+  while(NULL != cur &&
+        (strncmp(cur->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH) ||
+         strncmp(cur->build_option, build_opt, MAX_KERNEL_BUILD_OPT)))
   {
     pre = cur;
     cur = cur->next;
   }
   if(NULL != cur)
-  {
     return cur;
-  }
+
   p_context->kernel_count++;
   pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
   pre = pre->next;
   pre->current_count = 0;
   pre->kernel_sum_time = 0.0f;
   pre->next = NULL;
-  strcpy(pre->kernel_name, kernel_name);
+  strncpy(pre->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+  pre->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+  strncpy(pre->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+  pre->build_option[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
   return pre;
 }
 
@@ -146,6 +156,8 @@ typedef struct time_element
   float kernel_sum_time;
   int kernel_execute_count;
   double dev;
+  float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+  uint32_t time_index;
 } time_element;
 
 static int cmp(const void *a, const void *b)
@@ -172,40 +184,62 @@ static void print_time_info()
   {
     printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
     printf("  ->>>> KERNELS TIME SUMMARY <<<<-\n");
+
     kernel_storage_node *p_kernel = p_context->kernels_storage;
     kernel_storage_node *p_tmp_kernel = p_kernel;
     time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
-    int i = 0, j = 0;
+    memset(te, 0, sizeof(time_element)*p_context->kernel_count);
+    int i = -1, j = 0, k = 0;
     while(NULL != p_tmp_kernel)
     {
-      te[i].kernel_execute_count = p_tmp_kernel->current_count;
-      strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name);
-      te[i].kernel_sum_time = p_tmp_kernel->kernel_sum_time;
-      float average = p_tmp_kernel->kernel_sum_time / p_tmp_kernel->current_count;
-      float sumsquare = 0.0f;
+      for(k=0; k<=i; k++)
+      {
+        if(!strncmp(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH))
+          break;
+      }
+      if(k == i+1)
+      {
+        i++;
+        k = i;
+      }
+      te[k].kernel_execute_count += p_tmp_kernel->current_count;
+      strncpy(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH);
+      te[k].kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+      te[k].kernel_sum_time += p_tmp_kernel->kernel_sum_time;
       for(j=0; j != p_tmp_kernel->current_count; ++j)
-        sumsquare += pow((p_tmp_kernel->kernel_times[j] - average), 2.0 );
-      te[i++].dev = sqrt(sumsquare/p_tmp_kernel->current_count);
+        te[k].kernel_times[te[k].time_index++] = p_tmp_kernel->kernel_times[j];
       p_tmp_kernel = p_tmp_kernel->next;
     }
+
+    for(k=0; k<=i; k++)
+    {
+      float average = te[k].kernel_sum_time / te[k].kernel_execute_count;
+      double sumsquare = 0.0;
+      for(j=0; j<te[k].time_index; ++j)
+        sumsquare += pow((te[k].kernel_times[j] - average), 2.0);
+      te[k].dev = sqrt(sumsquare / te[k].kernel_execute_count);
+    }
+
     float sum_time = 0.0f;
     qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
-    for(i=0; i<p_context->kernel_count; ++i)
-      sum_time += te[i].kernel_sum_time;
-    for(i=0; i<p_context->kernel_count; ++i)
+    for(j=0; j<=i; ++j)
+      sum_time += te[j].kernel_sum_time;
+
+    for(j=0; j<=i; ++j)
     {
-      printf("    [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f  Count: %-7d  Ave(ms): %7.2f  Dev: %.1lf%%] \n",
-             te[i].kernel_name,
-             te[i].kernel_sum_time / sum_time * 100,
-             te[i].kernel_sum_time,
-             te[i].kernel_execute_count,
-             te[i].kernel_sum_time / te[i].kernel_execute_count,
-             te[i].dev / te[i].kernel_sum_time * te[i].kernel_execute_count * 100);
+      printf("    [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f  Count: %-7d  Ave(ms): %7.2f  Dev: %.1lf%%]\n",
+             te[j].kernel_name,
+             te[j].kernel_sum_time / sum_time * 100,
+             te[j].kernel_sum_time,
+             te[j].kernel_execute_count,
+             te[j].kernel_sum_time / te[j].kernel_execute_count,
+             te[j].dev / te[j].kernel_sum_time * te[j].kernel_execute_count * 100);
     }
     free(te);
     printf("    Total : %.2f\n", sum_time);
     if(2 != b_output_kernel_perf)
     {
+      printf("[------------  CONTEXT ENDS------------]\n\n");
       p_context = p_context->next;
       continue;
     }
@@ -214,6 +248,18 @@ static void print_time_info()
     while(NULL != p_kernel)
     {
       printf("    [Kernel Name : %30s   Time(ms): %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
+      if(*p_kernel->build_option != '\0')
+      {
+        int count = 0;
+        printf("      ->Build Options : ");
+        while(p_kernel->build_option[count] != '\0' )
+        {
+          printf("%c", p_kernel->build_option[count++]);
+          if(count % 100 == 0)
+            printf("\n                         ");
+        }
+        printf("\n");
+      }
       for(i=0; i!=p_kernel->current_count; ++i)
         printf("      Execution Round%5d : %.2f (ms)\n", i+1, p_kernel->kernel_times[i]);
       p_kernel = p_kernel->next;
@@ -225,7 +271,7 @@ static void print_time_info()
 }
 
 
-static void insert(cl_context context, const char *kernel_name, float time)
+static void insert(cl_context context, const char *kernel_name, const char *build_opt, float time)
 {
   if(!atexit_registered)
   {
@@ -233,7 +279,7 @@ static void insert(cl_context context, const char *kernel_name, float time)
     atexit(print_time_info);
   }
   context_storage_node *p_context = find_context(context);
-  kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name);
+  kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, build_opt);
   prev_context_pointer = p_context;
   prev_kernel_pointer = p_kernel;
   p_kernel->kernel_times[p_kernel->current_count++] = time;
@@ -241,7 +287,8 @@ static void insert(cl_context context, const char *kernel_name, float time)
   if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
   {
     p_context->kernel_max_time = p_kernel->kernel_sum_time;
-    strcpy(p_context->max_time_kernel_name, kernel_name);
+    strncpy(p_context->max_time_kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+    p_context->max_time_kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
   }
 }
 
@@ -253,9 +300,9 @@ static struct timeval start, end;
 void initialize_env_var()
 {
   char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
-  if(NULL == env || !strcmp(env,"0"))
+  if(NULL == env || !strncmp(env,"0", 1))
     b_output_kernel_perf = 0;
-  else if(!strcmp(env,"1"))
+  else if(!strncmp(env,"1", 1))
     b_output_kernel_perf = 1;
   else
     b_output_kernel_perf = 2;
@@ -267,11 +314,11 @@ void time_start(cl_context context, const char * kernel_name, cl_command_queue c
   gettimeofday(&start, NULL);
 }
 
-void time_end(cl_context context, const char * kernel_name, cl_command_queue cq)
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq)
 {
   clFinish(cq);
   gettimeofday(&end, NULL);
   float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
-  insert(context, kernel_name, t);
+  insert(context, kernel_name, build_opt, t);
   pthread_mutex_unlock(&mutex);
 }
diff --git a/src/performance.h b/src/performance.h
index c747743..1e75054 100644
--- a/src/performance.h
+++ b/src/performance.h
@@ -5,7 +5,7 @@
 
 extern int b_output_kernel_perf;
 void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
-void time_end(cl_context context, const char * kernel_name, cl_command_queue cq);
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq);
 void initialize_env_var();
 
 
-- 
1.8.3.2




More information about the Beignet mailing list