[Beignet] [PATCH V3] Complete the feature of clGetEventProfilingInfo API

junyan.he at inbox.com junyan.he at inbox.com
Thu Nov 28 18:55:54 PST 2013


From: Junyan He <junyan.he at linux.intel.com>

The profiling feature is now all supported. We use
drm_intel_reg_read to get the current time of GPU
when the event is queued and submitted, and use
PIPI_CONTROL cmd to get the executing time of the
GPU for kernel start and end.
One trivial problem is that:
The GPU timer counter is 36 bits with resolution of
80ns, so 2^36*80 = 5500s, about half an hour.
Some test may last about 2~5 min and if it starts at
about half an hour, this may cause a wrap back problem
and cause the case fail.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 src/cl_api.c            | 78 +++++++++++++++++++++++++++++++++++++++----------
 src/cl_driver.h         |  8 +++--
 src/cl_driver_defs.c    |  3 +-
 src/cl_enqueue.c        | 19 ++++++++----
 src/cl_enqueue.h        |  2 +-
 src/cl_event.c          | 27 ++++++++++-------
 src/cl_event.h          |  3 +-
 src/intel/intel_gpgpu.c | 36 +++++++++++++++++++++--
 8 files changed, 137 insertions(+), 39 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c
index 0978129..35d0984 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -70,6 +70,13 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
   cl_event e;
   if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
     e = cl_event_new(queue->ctx, queue, type, event!=NULL);
+
+    /* if need profiling, add the submit timestamp here. */
+    if (e->type != CL_COMMAND_USER &&
+	    e->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+	cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED);
+    }
+
     if(event != NULL)
       *event = e;
     if(status == CL_ENQUEUE_EXECUTE_DEFER) {
@@ -1270,16 +1277,25 @@ clGetEventProfilingInfo(cl_event             event,
     goto error;
   }
 
-  if ((param_name != CL_PROFILING_COMMAND_QUEUED &&
-          param_name != CL_PROFILING_COMMAND_SUBMIT &&
-          param_name != CL_PROFILING_COMMAND_START &&
-          param_name != CL_PROFILING_COMMAND_END) ||
-          (param_value && param_value_size < sizeof(cl_ulong))) {
+  if (param_value && param_value_size < sizeof(cl_ulong)) {
     err = CL_INVALID_VALUE;
     goto error;
   }
 
-  err = cl_event_profiling(event, param_name, &ret_val);
+  if (param_name == CL_PROFILING_COMMAND_QUEUED) {
+    ret_val = event->timestamp[0];
+  } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) {
+    ret_val = event->timestamp[1];
+  } else if (param_name == CL_PROFILING_COMMAND_START) {
+    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_START);
+    ret_val = event->timestamp[2];
+  } else if (param_name == CL_PROFILING_COMMAND_END) {
+    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_END);
+    ret_val = event->timestamp[3];
+  } else {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
 
   if (err == CL_SUCCESS) {
     if (param_value)
@@ -1354,7 +1370,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1437,7 +1453,7 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1487,7 +1503,7 @@ clEnqueueWriteBuffer(cl_command_queue    command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1570,7 +1586,7 @@ clEnqueueWriteBufferRect(cl_command_queue     command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1649,6 +1665,11 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
   return 0;
@@ -1740,6 +1761,11 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -1818,7 +1844,7 @@ clEnqueueReadImage(cl_command_queue      command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1897,7 +1923,7 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1974,6 +2000,11 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -2030,6 +2061,11 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -2086,6 +2122,11 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -2217,7 +2258,7 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -2313,7 +2354,7 @@ clEnqueueMapImage(cl_command_queue   command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -2350,7 +2391,7 @@ clEnqueueUnmapMemObject(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -2456,6 +2497,11 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -2535,7 +2581,7 @@ clEnqueueNativeKernel(cl_command_queue   command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 8efe1e7..a34c22e 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -193,8 +193,12 @@ typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
 extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
 
 /* Get a event time stamp */
-typedef void (cl_gpgpu_event_get_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
-extern cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp;
+typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
+extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
+
+/* Get current GPU time stamp */
+typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*);
+extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp;
 
 /* Will spawn all threads */
 typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 54fa62e..b46799a 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -80,5 +80,6 @@ LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
 LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
 LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
 LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
-LOCAL cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp = NULL;
+LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL;
+LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
 
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 070fd98..330d230 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -16,16 +16,18 @@
  *
  * Author: Rong Yang <rong.r.yang at intel.com>
  */
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
 
 #include "cl_enqueue.h"
 #include "cl_image.h"
 #include "cl_driver.h"
+#include "cl_event.h"
+#include "cl_command_queue.h"
 #include "cl_utils.h"
 
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <pthread.h>
 
 cl_int cl_enqueue_read_buffer(enqueue_data* data)
 {
@@ -376,8 +378,15 @@ cl_int cl_enqueue_native_kernel(enqueue_data *data)
 error:
   return err;
 }
-cl_int cl_enqueue_handle(enqueue_data* data)
+
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
 {
+  /* if need profiling, add the submit timestamp here. */
+  if (event && event->type != CL_COMMAND_USER
+           && event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+    cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT);
+  }
+
   switch(data->type) {
     case EnqueueReadBuffer:
       return cl_enqueue_read_buffer(data);
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index b412d58..1d3ae5f 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -64,5 +64,5 @@ typedef struct _enqueue_data {
 } enqueue_data;
 
 /* Do real enqueue commands */
-cl_int cl_enqueue_handle(enqueue_data* data);
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data);
 #endif /* __CL_ENQUEUE_H__ */
diff --git a/src/cl_event.c b/src/cl_event.c
index 028dfb6..f838a3a 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -380,7 +380,7 @@ void cl_event_set_status(cl_event event, cl_int status)
 
   if(status <= CL_COMPLETE) {
     if(event->enqueue_cb) {
-      cl_enqueue_handle(&event->enqueue_cb->data);
+      cl_enqueue_handle(event, &event->enqueue_cb->data);
       if(event->gpgpu_event)
         cl_gpgpu_event_update_status(event->gpgpu_event, 1);  //now set complet, need refine
       event->status = status;  //Change the event status after enqueue and befor unlock
@@ -496,22 +496,29 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
   return CL_SUCCESS;
 }
 
-cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val)
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
 {
+  cl_ulong ret_val = 0;
+  GET_QUEUE_THREAD_GPGPU(event->queue);
+
   if (!event->gpgpu_event) {
-    /* Some event like read buffer do not need GPU involved, so
-       we just return all the profiling to 0 now. */
-    *ret_val = 0;
+    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
     return CL_SUCCESS;
   }
 
-  if(param_name == CL_PROFILING_COMMAND_START ||
-     param_name == CL_PROFILING_COMMAND_QUEUED ||
-     param_name == CL_PROFILING_COMMAND_SUBMIT) {
-    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 0, ret_val);
+  if(param_name == CL_PROFILING_COMMAND_SUBMIT ||
+         param_name == CL_PROFILING_COMMAND_QUEUED) {
+    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+    return CL_SUCCESS;
+  } else if(param_name == CL_PROFILING_COMMAND_START) {
+    cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 0, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
     return CL_SUCCESS;
   } else if (param_name == CL_PROFILING_COMMAND_END) {
-    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 1, ret_val);
+    cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 1, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
     return CL_SUCCESS;
   } else {
     return CL_INVALID_VALUE;
diff --git a/src/cl_event.h b/src/cl_event.h
index 722486a..3c61110 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -68,6 +68,7 @@ struct _cl_event {
   enqueue_callback*  enqueue_cb;  /* This event's enqueue */
   enqueue_callback*  waits_head;  /* The head of enqueues list wait on this event */
   cl_bool            emplict;     /* Identify this event whether created by api emplict*/
+  cl_ulong           timestamp[4];/* The time stamps for profiling. */
 };
 
 /* Create a new event object */
@@ -91,6 +92,6 @@ void cl_event_update_status(cl_event);
 /* Create the marker event */
 cl_int cl_event_marker(cl_command_queue, cl_event*);
 /* Do the event profiling */
-cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val);
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
 #endif /* __CL_EVENT_H__ */
 
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 7be9059..b1597ac 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -51,6 +51,8 @@
 #define MO_RETAIN_BIT         (1 << 28)
 #define SAMPLER_STATE_SIZE    (16)
 
+#define TIMESTAMP_ADDR        0x2358
+
 /* Stores both binding tables and surface states */
 typedef struct surface_heap {
   uint32_t binding_table[256];
@@ -1041,15 +1043,42 @@ intel_gpgpu_event_delete(intel_event_t *event)
   cl_free(event);
 }
 
+/* We want to get the current time of GPU. */
+static void
+intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
+{
+  uint64_t result = 0;
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+
+  drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+  result = result & 0xFFFFFFFFF0000000;
+  result = result >> 28;
+  result *= 80;
+
+  *ret_ts = result;
+  return;
+}
+
+/* Get the GPU execute time. */
 static void
-intel_gpgpu_event_get_timestamp(intel_event_t *event, int index, uint64_t* ret_ts)
+intel_gpgpu_event_get_exec_timestamp(intel_event_t *event,
+                                int index, uint64_t* ret_ts)
 {
+  uint64_t result = 0;
+
   assert(event->ts_buf != NULL);
   assert(index == 0 || index == 1);
   drm_intel_gem_bo_map_gtt(event->ts_buf);
   uint64_t* ptr = event->ts_buf->virtual;
+  result = ptr[index];
+
+  /* According to BSpec, the timestamp counter should be 36 bits,
+     but comparing to the timestamp counter from IO control reading,
+     we find the first 4 bits seems to be fake. In order to keep the
+     timestamp counter conformable, we just skip the first 4 bits. */
+  result = ((result & 0x0FFFFFFFF) << 4) * 80; //convert to nanoseconds
+  *ret_ts = result;
 
-  *ret_ts = ptr[index] * 80; //convert to nanoseconds
   drm_intel_gem_bo_unmap_gtt(event->ts_buf);
 }
 
@@ -1080,6 +1109,7 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
   cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
   cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
-  cl_gpgpu_event_get_timestamp = (cl_gpgpu_event_get_timestamp_cb *)intel_gpgpu_event_get_timestamp;
+  cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp;
+  cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
 }
 
-- 
1.8.3.2



More information about the Beignet mailing list