[Beignet] [PATCH] Runtime: Fix broken OCL_OUTPUT_KERNEL_PERF

Xiuli Pan xiuli.pan at intel.com
Wed Apr 5 06:54:10 UTC 2017


From: Pan Xiuli <xiuli.pan at intel.com>

After the runtime refinement, OCL_OUTPUT_KERNEL_PERF no longer works.
Fix it so that it can be used for performance tuning again.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 src/cl_api_context.c        | 2 ++
 src/cl_command_queue_gen7.c | 1 +
 src/cl_enqueue.c            | 3 +++
 src/cl_enqueue.h            | 1 +
 src/performance.c           | 1 -
 5 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/cl_api_context.c b/src/cl_api_context.c
index e8184b1..7028f8d 100644
--- a/src/cl_api_context.c
+++ b/src/cl_api_context.c
@@ -19,6 +19,7 @@
 #include "cl_context.h"
 #include "cl_device_id.h"
 #include "cl_alloc.h"
+#include "performance.h"
 
 cl_context
 clCreateContext(const cl_context_properties *properties,
@@ -55,6 +56,7 @@ clCreateContext(const cl_context_properties *properties,
     context = cl_create_context(properties, num_devices, devices, pfn_notify, user_data, &err);
   } while (0);
 
+  initialize_env_var();
   if (errcode_ret)
     *errcode_ret = err;
   return context;
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index dd82a44..6f85148 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -492,6 +492,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   event->exec_data.queue = queue;
   event->exec_data.gpgpu = gpgpu;
   event->exec_data.type = EnqueueNDRangeKernel;
+  event->exec_data.name = kernel.name;
 
   return CL_SUCCESS;
 
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 8350089..166dc55 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -25,6 +25,7 @@
 #include "cl_utils.h"
 #include "cl_alloc.h"
 #include "cl_device_enqueue.h"
+#include "performance.h"
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
@@ -579,6 +580,8 @@ cl_enqueue_ndrange(enqueue_data *data, cl_int status)
     void *batch_buf = cl_gpgpu_ref_batch_buf(data->gpgpu);
     cl_gpgpu_sync(batch_buf);
     cl_gpgpu_unref_batch_buf(batch_buf);
+    if(b_output_kernel_perf)
+      time_end(data->queue->ctx, data->name, "", data->queue);
   }
 
   return err;
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index 50a54fc..1532a59 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -81,6 +81,7 @@ typedef struct _enqueue_data {
   cl_bool mid_event_of_enq;  /* For non-uniform ndrange, one enqueue have a sequence event, the
                                 last event need to parse device enqueue information.
                                 0 : last event; 1: non-last event */
+  const char* name;          /* enqueue name */
 } enqueue_data;
 
 /* Do real enqueue commands */
diff --git a/src/performance.c b/src/performance.c
index 1e676c3..b8cdcc6 100644
--- a/src/performance.c
+++ b/src/performance.c
@@ -325,7 +325,6 @@ void time_start(cl_context context, const char * kernel_name, cl_command_queue c
 
 void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq)
 {
-  clFinish(cq);
   gettimeofday(&end, NULL);
   float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
   insert(context, kernel_name, build_opt, t);
-- 
2.7.4



More information about the Beignet mailing list