[Beignet] [PATCH v2] Fix the same kernel name issue of OCL_OUTPUT_KERNEL_PERF

Zhigang Gong zhigang.gong at linux.intel.com
Thu Jun 12 19:33:32 PDT 2014


LGTM, pushed with slight change.
Thanks.

On Mon, Jun 23, 2014 at 11:09:33PM +0800, Yongjia Zhang wrote:
> Now it treats kernels with same kernel name and different build
> options separately. When OCL_OUTPUT_KERNEL_PERF==1, it outputs the
> time summary as before, but if OCL_OUTPUT_KERNEL_PERF==2, it will
> output the time details including the kernel build options and
> kernels with same kernel name but different build options will
> output separately.
> 
> v2: use strncmp and strncpy instead of strcmp and strcpy.
> 
> Signed-off-by: Yongjia Zhang <yongjia.zhang at intel.com>
> ---
>  src/cl_api.c      |  17 +++++---
>  src/performance.c | 113 ++++++++++++++++++++++++++++++++++++++----------------
>  src/performance.h |   2 +-
>  3 files changed, 92 insertions(+), 40 deletions(-)
> 
> diff --git a/src/cl_api.c b/src/cl_api.c
> index 6206e10..8856f5f 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -1681,7 +1681,7 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
>    }
>  
>    if(b_output_kernel_perf)
> -	  time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", command_queue);
> +	  time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "\0", command_queue);
>  
>    return 0;
>  
> @@ -1785,7 +1785,7 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
>    }
>  
>    if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", command_queue);
> +    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "\0", command_queue);
>  
>  error:
>    return err;
> @@ -2027,7 +2027,7 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
>    }
>  
>    if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", command_queue);
> +    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "\0", command_queue);
>  
>  error:
>    return err;
> @@ -2091,7 +2091,7 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
>    }
>  
>    if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", command_queue);
> +    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "\0", command_queue);
>  
>  error:
>    return err;
> @@ -2155,7 +2155,7 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
>    }
>  
>    if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", command_queue);
> +    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "\0", command_queue);
>  
>  error:
>    return err;
> @@ -2556,7 +2556,12 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
>    }
>  
>    if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue);
> +  {
> +    if(kernel->program->build_opts != NULL)
> +      time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
> +    else
> +      time_end(command_queue->ctx, cl_kernel_get_name(kernel), "\0", command_queue);
> +  }
>  error:
>    return err;
>  }
> diff --git a/src/performance.c b/src/performance.c
> index a785460..85cd481 100644
> --- a/src/performance.c
> +++ b/src/performance.c
> @@ -8,11 +8,12 @@
>  
>  #define MAX_KERNEL_NAME_LENGTH 100
>  #define MAX_KERNEL_EXECUTION_COUNT 100000
> -
> +#define MAX_KERNEL_BUILD_OPT 1000
>  typedef struct kernel_storage_node
>  {
>    char kernel_name[MAX_KERNEL_NAME_LENGTH];
>    float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
> +  char build_option[MAX_KERNEL_BUILD_OPT];
>    int current_count;
>    float kernel_sum_time;
>    struct kernel_storage_node *next;
> @@ -81,11 +82,12 @@ static context_storage_node * find_context(cl_context context)
>    return pre;
>  }
>  
> -static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name)
> +static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name, const char *build_opt)
>  {
>    if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
>       p_context == prev_context_pointer &&
> -     !strcmp(kernel_name, prev_kernel_pointer->kernel_name))
> +     !strncmp(kernel_name, prev_kernel_pointer->kernel_name, MAX_KERNEL_NAME_LENGTH) &&
> +     !strncmp(build_opt, prev_kernel_pointer->build_option, MAX_KERNEL_BUILD_OPT))
>      return prev_kernel_pointer;
>  
>    if(NULL == p_context)
> @@ -95,30 +97,38 @@ static kernel_storage_node * find_kernel(context_storage_node *p_context, const
>    {
>      p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
>      p_context->kernel_count++;
> -    strcpy(p_context->kernels_storage->kernel_name,kernel_name);
> +    strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH);
> +    p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
> +    strncpy(p_context->kernels_storage->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
> +    p_context->kernels_storage->build_option[MAX_KERNEL_BUILD_OPT - 1] = '\0';
>      p_context->kernels_storage->current_count = 0;
>      p_context->kernels_storage->kernel_sum_time = 0.0f;
>      p_context->kernels_storage->next = NULL;
>      return p_context->kernels_storage;
>    }
> +
>    kernel_storage_node *pre = p_context->kernels_storage;
>    kernel_storage_node *cur = p_context->kernels_storage;
> -  while(NULL != cur && strcmp(cur->kernel_name, kernel_name))
> +  while(NULL != cur &&
> +        (strncmp(cur->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH) ||
> +         strncmp(cur->build_option, build_opt, MAX_KERNEL_BUILD_OPT)))
>    {
>      pre = cur;
>      cur = cur->next;
>    }
>    if(NULL != cur)
> -  {
>      return cur;
> -  }
> +
>    p_context->kernel_count++;
>    pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
>    pre = pre->next;
>    pre->current_count = 0;
>    pre->kernel_sum_time = 0.0f;
>    pre->next = NULL;
> -  strcpy(pre->kernel_name, kernel_name);
> +  strncpy(pre->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
> +  pre->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
> +  strncpy(pre->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
> +  pre->build_option[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
>    return pre;
>  }
>  
> @@ -146,6 +156,8 @@ typedef struct time_element
>    float kernel_sum_time;
>    int kernel_execute_count;
>    double dev;
> +  float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
> +  uint32_t time_index;
>  } time_element;
>  
>  static int cmp(const void *a, const void *b)
> @@ -172,40 +184,62 @@ static void print_time_info()
>    {
>      printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
>      printf("  ->>>> KERNELS TIME SUMMARY <<<<-\n");
> +
>      kernel_storage_node *p_kernel = p_context->kernels_storage;
>      kernel_storage_node *p_tmp_kernel = p_kernel;
>      time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
> -    int i = 0, j = 0;
> +    memset(te, 0, sizeof(time_element)*p_context->kernel_count);
> +    int i = -1, j = 0, k = 0;
>      while(NULL != p_tmp_kernel)
>      {
> -      te[i].kernel_execute_count = p_tmp_kernel->current_count;
> -      strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name);
> -      te[i].kernel_sum_time = p_tmp_kernel->kernel_sum_time;
> -      float average = p_tmp_kernel->kernel_sum_time / p_tmp_kernel->current_count;
> -      float sumsquare = 0.0f;
> +      for(k=0; k<=i; k++)
> +      {
> +        if(!strncmp(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH))
> +          break;
> +      }
> +      if(k == i+1)
> +      {
> +        i++;
> +        k = i;
> +      }
> +      te[k].kernel_execute_count += p_tmp_kernel->current_count;
> +      strncpy(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH);
> +      te[k].kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
> +      te[k].kernel_sum_time += p_tmp_kernel->kernel_sum_time;
>        for(j=0; j != p_tmp_kernel->current_count; ++j)
> -        sumsquare += pow((p_tmp_kernel->kernel_times[j] - average), 2.0 );
> -      te[i++].dev = sqrt(sumsquare/p_tmp_kernel->current_count);
> +        te[k].kernel_times[te[k].time_index++] = p_tmp_kernel->kernel_times[j];
>        p_tmp_kernel = p_tmp_kernel->next;
>      }
> +
> +    for(k=0; k<=i; k++)
> +    {
> +      float average = te[k].kernel_sum_time / te[k].kernel_execute_count;
> +      double sumsquare = 0.0;
> +      for(j=0; j<te[k].time_index; ++j)
> +        sumsquare += pow((te[k].kernel_times[j] - average), 2.0);
> +      te[k].dev = sqrt(sumsquare / te[k].kernel_execute_count);
> +    }
> +
>      float sum_time = 0.0f;
>      qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
> -    for(i=0; i<p_context->kernel_count; ++i)
> -      sum_time += te[i].kernel_sum_time;
> -    for(i=0; i<p_context->kernel_count; ++i)
> +    for(j=0; j<=i; ++j)
> +      sum_time += te[j].kernel_sum_time;
> +
> +    for(j=0; j<=i; ++j)
>      {
> -      printf("    [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f  Count: %-7d  Ave(ms): %7.2f  Dev: %.1lf%%] \n",
> -             te[i].kernel_name,
> -             te[i].kernel_sum_time / sum_time * 100,
> -             te[i].kernel_sum_time,
> -             te[i].kernel_execute_count,
> -             te[i].kernel_sum_time / te[i].kernel_execute_count,
> -             te[i].dev / te[i].kernel_sum_time * te[i].kernel_execute_count * 100);
> +      printf("    [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f  Count: %-7d  Ave(ms): %7.2f  Dev: %.1lf%%]\n",
> +             te[j].kernel_name,
> +             te[j].kernel_sum_time / sum_time * 100,
> +             te[j].kernel_sum_time,
> +             te[j].kernel_execute_count,
> +             te[j].kernel_sum_time / te[j].kernel_execute_count,
> +             te[j].dev / te[j].kernel_sum_time * te[j].kernel_execute_count * 100);
>      }
>      free(te);
>      printf("    Total : %.2f\n", sum_time);
>      if(2 != b_output_kernel_perf)
>      {
> +      printf("[------------  CONTEXT ENDS------------]\n\n");
>        p_context = p_context->next;
>        continue;
>      }
> @@ -214,6 +248,18 @@ static void print_time_info()
>      while(NULL != p_kernel)
>      {
>        printf("    [Kernel Name : %30s   Time(ms): %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
> +      if(*p_kernel->build_option != '\0')
> +      {
> +        int count = 0;
> +        printf("      ->Build Options : ");
> +        while(p_kernel->build_option[count] != '\0' )
> +        {
> +          printf("%c", p_kernel->build_option[count++]);
> +          if(count % 100 == 0)
> +            printf("\n                         ");
> +        }
> +        printf("\n");
> +      }
>        for(i=0; i!=p_kernel->current_count; ++i)
>          printf("      Execution Round%5d : %.2f (ms)\n", i+1, p_kernel->kernel_times[i]);
>        p_kernel = p_kernel->next;
> @@ -225,7 +271,7 @@ static void print_time_info()
>  }
>  
>  
> -static void insert(cl_context context, const char *kernel_name, float time)
> +static void insert(cl_context context, const char *kernel_name, const char *build_opt, float time)
>  {
>    if(!atexit_registered)
>    {
> @@ -233,7 +279,7 @@ static void insert(cl_context context, const char *kernel_name, float time)
>      atexit(print_time_info);
>    }
>    context_storage_node *p_context = find_context(context);
> -  kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name);
> +  kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, build_opt);
>    prev_context_pointer = p_context;
>    prev_kernel_pointer = p_kernel;
>    p_kernel->kernel_times[p_kernel->current_count++] = time;
> @@ -241,7 +287,8 @@ static void insert(cl_context context, const char *kernel_name, float time)
>    if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
>    {
>      p_context->kernel_max_time = p_kernel->kernel_sum_time;
> -    strcpy(p_context->max_time_kernel_name, kernel_name);
> +    strncpy(p_context->max_time_kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
> +    p_context->max_time_kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
>    }
>  }
>  
> @@ -253,9 +300,9 @@ static struct timeval start, end;
>  void initialize_env_var()
>  {
>    char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
> -  if(NULL == env || !strcmp(env,"0"))
> +  if(NULL == env || !strncmp(env,"0", 1))
>      b_output_kernel_perf = 0;
> -  else if(!strcmp(env,"1"))
> +  else if(!strncmp(env,"1", 1))
>      b_output_kernel_perf = 1;
>    else
>      b_output_kernel_perf = 2;
> @@ -267,11 +314,11 @@ void time_start(cl_context context, const char * kernel_name, cl_command_queue c
>    gettimeofday(&start, NULL);
>  }
>  
> -void time_end(cl_context context, const char * kernel_name, cl_command_queue cq)
> +void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq)
>  {
>    clFinish(cq);
>    gettimeofday(&end, NULL);
>    float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
> -  insert(context, kernel_name, t);
> +  insert(context, kernel_name, build_opt, t);
>    pthread_mutex_unlock(&mutex);
>  }
> diff --git a/src/performance.h b/src/performance.h
> index c747743..1e75054 100644
> --- a/src/performance.h
> +++ b/src/performance.h
> @@ -5,7 +5,7 @@
>  
>  extern int b_output_kernel_perf;
>  void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
> -void time_end(cl_context context, const char * kernel_name, cl_command_queue cq);
> +void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq);
>  void initialize_env_var();
>  
>  
> -- 
> 1.8.3.2
> 
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list