[Beignet] [PATCH v2] Fix the same kernel name issue of OCL_OUTPUT_KERNEL_PERF
Zhigang Gong
zhigang.gong at linux.intel.com
Thu Jun 12 19:33:32 PDT 2014
LGTM, pushed with slight change.
Thanks.
On Mon, Jun 23, 2014 at 11:09:33PM +0800, Yongjia Zhang wrote:
> Now it treats kernels with same kernel name and different build
> options separately. When OCL_OUTPUT_KERNEL_PERF==1, it outputs the
> time summary as before, but if OCL_OUTPUT_KERNEL_PERF==2, it will
> output the time details including the kernel build options and
> kernels with same kernel name but different build options will
> output separately.
>
> v2: use strncmp and strncpy instead of strcmp and strcpy.
>
> Signed-off-by: Yongjia Zhang <yongjia.zhang at intel.com>
> ---
> src/cl_api.c | 17 +++++---
> src/performance.c | 113 ++++++++++++++++++++++++++++++++++++++----------------
> src/performance.h | 2 +-
> 3 files changed, 92 insertions(+), 40 deletions(-)
>
> diff --git a/src/cl_api.c b/src/cl_api.c
> index 6206e10..8856f5f 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -1681,7 +1681,7 @@ clEnqueueCopyBuffer(cl_command_queue command_queue,
> }
>
> if(b_output_kernel_perf)
> - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", command_queue);
> + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "\0", command_queue);
>
> return 0;
>
> @@ -1785,7 +1785,7 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue,
> }
>
> if(b_output_kernel_perf)
> - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", command_queue);
> + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "\0", command_queue);
>
> error:
> return err;
> @@ -2027,7 +2027,7 @@ clEnqueueCopyImage(cl_command_queue command_queue,
> }
>
> if(b_output_kernel_perf)
> - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", command_queue);
> + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "\0", command_queue);
>
> error:
> return err;
> @@ -2091,7 +2091,7 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
> }
>
> if(b_output_kernel_perf)
> - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", command_queue);
> + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "\0", command_queue);
>
> error:
> return err;
> @@ -2155,7 +2155,7 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue,
> }
>
> if(b_output_kernel_perf)
> - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", command_queue);
> + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "\0", command_queue);
>
> error:
> return err;
> @@ -2556,7 +2556,12 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
> }
>
> if(b_output_kernel_perf)
> - time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue);
> + {
> + if(kernel->program->build_opts != NULL)
> + time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
> + else
> + time_end(command_queue->ctx, cl_kernel_get_name(kernel), "\0", command_queue);
> + }
> error:
> return err;
> }
> diff --git a/src/performance.c b/src/performance.c
> index a785460..85cd481 100644
> --- a/src/performance.c
> +++ b/src/performance.c
> @@ -8,11 +8,12 @@
>
> #define MAX_KERNEL_NAME_LENGTH 100
> #define MAX_KERNEL_EXECUTION_COUNT 100000
> -
> +#define MAX_KERNEL_BUILD_OPT 1000
> typedef struct kernel_storage_node
> {
> char kernel_name[MAX_KERNEL_NAME_LENGTH];
> float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
> + char build_option[MAX_KERNEL_BUILD_OPT];
> int current_count;
> float kernel_sum_time;
> struct kernel_storage_node *next;
> @@ -81,11 +82,12 @@ static context_storage_node * find_context(cl_context context)
> return pre;
> }
>
> -static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name)
> +static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name, const char *build_opt)
> {
> if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
> p_context == prev_context_pointer &&
> - !strcmp(kernel_name, prev_kernel_pointer->kernel_name))
> + !strncmp(kernel_name, prev_kernel_pointer->kernel_name, MAX_KERNEL_NAME_LENGTH) &&
> + !strncmp(build_opt, prev_kernel_pointer->build_option, MAX_KERNEL_BUILD_OPT))
> return prev_kernel_pointer;
>
> if(NULL == p_context)
> @@ -95,30 +97,38 @@ static kernel_storage_node * find_kernel(context_storage_node *p_context, const
> {
> p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
> p_context->kernel_count++;
> - strcpy(p_context->kernels_storage->kernel_name,kernel_name);
> + strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH);
> + p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
> + strncpy(p_context->kernels_storage->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
> + p_context->kernels_storage->build_option[MAX_KERNEL_BUILD_OPT - 1] = '\0';
> p_context->kernels_storage->current_count = 0;
> p_context->kernels_storage->kernel_sum_time = 0.0f;
> p_context->kernels_storage->next = NULL;
> return p_context->kernels_storage;
> }
> +
> kernel_storage_node *pre = p_context->kernels_storage;
> kernel_storage_node *cur = p_context->kernels_storage;
> - while(NULL != cur && strcmp(cur->kernel_name, kernel_name))
> + while(NULL != cur &&
> + (strncmp(cur->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH) ||
> + strncmp(cur->build_option, build_opt, MAX_KERNEL_BUILD_OPT)))
> {
> pre = cur;
> cur = cur->next;
> }
> if(NULL != cur)
> - {
> return cur;
> - }
> +
> p_context->kernel_count++;
> pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
> pre = pre->next;
> pre->current_count = 0;
> pre->kernel_sum_time = 0.0f;
> pre->next = NULL;
> - strcpy(pre->kernel_name, kernel_name);
> + strncpy(pre->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
> + pre->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
> + strncpy(pre->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
> + pre->build_option[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
> return pre;
> }
>
> @@ -146,6 +156,8 @@ typedef struct time_element
> float kernel_sum_time;
> int kernel_execute_count;
> double dev;
> + float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
> + uint32_t time_index;
> } time_element;
>
> static int cmp(const void *a, const void *b)
> @@ -172,40 +184,62 @@ static void print_time_info()
> {
> printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
> printf(" ->>>> KERNELS TIME SUMMARY <<<<-\n");
> +
> kernel_storage_node *p_kernel = p_context->kernels_storage;
> kernel_storage_node *p_tmp_kernel = p_kernel;
> time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
> - int i = 0, j = 0;
> + memset(te, 0, sizeof(time_element)*p_context->kernel_count);
> + int i = -1, j = 0, k = 0;
> while(NULL != p_tmp_kernel)
> {
> - te[i].kernel_execute_count = p_tmp_kernel->current_count;
> - strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name);
> - te[i].kernel_sum_time = p_tmp_kernel->kernel_sum_time;
> - float average = p_tmp_kernel->kernel_sum_time / p_tmp_kernel->current_count;
> - float sumsquare = 0.0f;
> + for(k=0; k<=i; k++)
> + {
> + if(!strncmp(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH))
> + break;
> + }
> + if(k == i+1)
> + {
> + i++;
> + k = i;
> + }
> + te[k].kernel_execute_count += p_tmp_kernel->current_count;
> + strncpy(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH);
> + te[k].kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
> + te[k].kernel_sum_time += p_tmp_kernel->kernel_sum_time;
> for(j=0; j != p_tmp_kernel->current_count; ++j)
> - sumsquare += pow((p_tmp_kernel->kernel_times[j] - average), 2.0 );
> - te[i++].dev = sqrt(sumsquare/p_tmp_kernel->current_count);
> + te[k].kernel_times[te[k].time_index++] = p_tmp_kernel->kernel_times[j];
> p_tmp_kernel = p_tmp_kernel->next;
> }
> +
> + for(k=0; k<=i; k++)
> + {
> + float average = te[k].kernel_sum_time / te[k].kernel_execute_count;
> + double sumsquare = 0.0;
> + for(j=0; j<te[k].time_index; ++j)
> + sumsquare += pow((te[k].kernel_times[j] - average), 2.0);
> + te[k].dev = sqrt(sumsquare / te[k].kernel_execute_count);
> + }
> +
> float sum_time = 0.0f;
> qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
> - for(i=0; i<p_context->kernel_count; ++i)
> - sum_time += te[i].kernel_sum_time;
> - for(i=0; i<p_context->kernel_count; ++i)
> + for(j=0; j<=i; ++j)
> + sum_time += te[j].kernel_sum_time;
> +
> + for(j=0; j<=i; ++j)
> {
> - printf(" [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f Count: %-7d Ave(ms): %7.2f Dev: %.1lf%%] \n",
> - te[i].kernel_name,
> - te[i].kernel_sum_time / sum_time * 100,
> - te[i].kernel_sum_time,
> - te[i].kernel_execute_count,
> - te[i].kernel_sum_time / te[i].kernel_execute_count,
> - te[i].dev / te[i].kernel_sum_time * te[i].kernel_execute_count * 100);
> + printf(" [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f Count: %-7d Ave(ms): %7.2f Dev: %.1lf%%]\n",
> + te[j].kernel_name,
> + te[j].kernel_sum_time / sum_time * 100,
> + te[j].kernel_sum_time,
> + te[j].kernel_execute_count,
> + te[j].kernel_sum_time / te[j].kernel_execute_count,
> + te[j].dev / te[j].kernel_sum_time * te[j].kernel_execute_count * 100);
> }
> free(te);
> printf(" Total : %.2f\n", sum_time);
> if(2 != b_output_kernel_perf)
> {
> + printf("[------------ CONTEXT ENDS------------]\n\n");
> p_context = p_context->next;
> continue;
> }
> @@ -214,6 +248,18 @@ static void print_time_info()
> while(NULL != p_kernel)
> {
> printf(" [Kernel Name : %30s Time(ms): %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
> + if(*p_kernel->build_option != '\0')
> + {
> + int count = 0;
> + printf(" ->Build Options : ");
> + while(p_kernel->build_option[count] != '\0' )
> + {
> + printf("%c", p_kernel->build_option[count++]);
> + if(count % 100 == 0)
> + printf("\n ");
> + }
> + printf("\n");
> + }
> for(i=0; i!=p_kernel->current_count; ++i)
> printf(" Execution Round%5d : %.2f (ms)\n", i+1, p_kernel->kernel_times[i]);
> p_kernel = p_kernel->next;
> @@ -225,7 +271,7 @@ static void print_time_info()
> }
>
>
> -static void insert(cl_context context, const char *kernel_name, float time)
> +static void insert(cl_context context, const char *kernel_name, const char *build_opt, float time)
> {
> if(!atexit_registered)
> {
> @@ -233,7 +279,7 @@ static void insert(cl_context context, const char *kernel_name, float time)
> atexit(print_time_info);
> }
> context_storage_node *p_context = find_context(context);
> - kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name);
> + kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, build_opt);
> prev_context_pointer = p_context;
> prev_kernel_pointer = p_kernel;
> p_kernel->kernel_times[p_kernel->current_count++] = time;
> @@ -241,7 +287,8 @@ static void insert(cl_context context, const char *kernel_name, float time)
> if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
> {
> p_context->kernel_max_time = p_kernel->kernel_sum_time;
> - strcpy(p_context->max_time_kernel_name, kernel_name);
> + strncpy(p_context->max_time_kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
> + p_context->max_time_kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
> }
> }
>
> @@ -253,9 +300,9 @@ static struct timeval start, end;
> void initialize_env_var()
> {
> char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
> - if(NULL == env || !strcmp(env,"0"))
> + if(NULL == env || !strncmp(env,"0", 1))
> b_output_kernel_perf = 0;
> - else if(!strcmp(env,"1"))
> + else if(!strncmp(env,"1", 1))
> b_output_kernel_perf = 1;
> else
> b_output_kernel_perf = 2;
> @@ -267,11 +314,11 @@ void time_start(cl_context context, const char * kernel_name, cl_command_queue c
> gettimeofday(&start, NULL);
> }
>
> -void time_end(cl_context context, const char * kernel_name, cl_command_queue cq)
> +void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq)
> {
> clFinish(cq);
> gettimeofday(&end, NULL);
> float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
> - insert(context, kernel_name, t);
> + insert(context, kernel_name, build_opt, t);
> pthread_mutex_unlock(&mutex);
> }
> diff --git a/src/performance.h b/src/performance.h
> index c747743..1e75054 100644
> --- a/src/performance.h
> +++ b/src/performance.h
> @@ -5,7 +5,7 @@
>
> extern int b_output_kernel_perf;
> void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
> -void time_end(cl_context context, const char * kernel_name, cl_command_queue cq);
> +void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq);
> void initialize_env_var();
>
>
> --
> 1.8.3.2
>
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list