[Beignet] [PATCH V3] Add profiling feature based on new event implementation.

junyan.he at inbox.com junyan.he at inbox.com
Fri Dec 16 14:29:21 UTC 2016


From: Junyan He <junyan.he at intel.com>

TODO:
In opencl 2.0, a new profiling item called CL_PROFILING_COMMAND_COMPLETE
is imported. It means that we need to record the time stamp of all the
child events created by the "Kernel enqueing kernels" feature finish.
This should be done after the "Kernel enqueing kernels" feature enabled.

V2:
Update event time stamp before inserting to queue thread, avoid MT issue.

V3:
Fixup overflow problem.

Signed-off-by: Junyan He <junyan.he at intel.com>
---
 src/cl_api.c                   |  51 ------------------
 src/cl_api_event.c             |  41 ++++++++++++++
 src/cl_api_mem.c               |   9 ++++
 src/cl_command_queue_enqueue.c |   2 +
 src/cl_driver.h                |   4 +-
 src/cl_enqueue.c               |   9 ----
 src/cl_event.c                 | 118 +++++++++++++++++++++++++++++++----------
 src/cl_event.h                 |  10 ++--
 src/intel/intel_gpgpu.c        |  16 +++---
 9 files changed, 155 insertions(+), 105 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c
index d7b5434..6a4f4ec 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1312,57 +1312,6 @@ error:
   return err;
 }
 
-
-cl_int
-clGetEventProfilingInfo(cl_event             event,
-                        cl_profiling_info    param_name,
-                        size_t               param_value_size,
-                        void *               param_value,
-                        size_t *             param_value_size_ret)
-{
-  cl_int err = CL_SUCCESS;
-  cl_ulong ret_val;
-
-  CHECK_EVENT(event);
-  //cl_event_update_status(event, 0);
-
-  if (event->event_type == CL_COMMAND_USER ||
-      !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
-          event->status != CL_COMPLETE) {
-    err = CL_PROFILING_INFO_NOT_AVAILABLE;
-    goto error;
-  }
-
-  if (param_value && param_value_size < sizeof(cl_ulong)) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-
-  if (param_name == CL_PROFILING_COMMAND_QUEUED) {
-    ret_val = event->queued_timestamp;
-  } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) {
-    ret_val= event->queued_timestamp + cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[1]);
-  } else if (param_name == CL_PROFILING_COMMAND_START) {
-    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_START);
-    ret_val = event->queued_timestamp + cl_event_get_start_timestamp(event);
-  } else if (param_name == CL_PROFILING_COMMAND_END) {
-    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_END);
-    ret_val =  event->queued_timestamp + cl_event_get_end_timestamp(event);
-  } else {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-
-  if (err == CL_SUCCESS) {
-    if (param_value)
-      *(cl_ulong*)param_value = ret_val;
-    if (param_value_size_ret)
-      *param_value_size_ret = sizeof(cl_ulong);
-  }
-error:
-  return err;
-}
-
 cl_mem clCreatePipe (cl_context context,
                      cl_mem_flags flags,
                      cl_uint pipe_packet_size,
diff --git a/src/cl_api_event.c b/src/cl_api_event.c
index 8f2b8e0..af1442a 100644
--- a/src/cl_api_event.c
+++ b/src/cl_api_event.c
@@ -290,3 +290,44 @@ clGetEventInfo(cl_event event,
   return cl_get_info_helper(src_ptr, src_size,
                             param_value, param_value_size, param_value_size_ret);
 }
+
+cl_int
+clGetEventProfilingInfo(cl_event event,
+                        cl_profiling_info param_name,
+                        size_t param_value_size,
+                        void *param_value,
+                        size_t *param_value_size_ret)
+{
+  cl_ulong ret_val;
+
+  if (!CL_OBJECT_IS_EVENT(event)) {
+    return CL_INVALID_EVENT;
+  }
+
+  assert(event->event_type == CL_COMMAND_USER || event->queue != NULL);
+  if (event->event_type == CL_COMMAND_USER ||
+      !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
+      cl_event_get_status(event) != CL_COMPLETE) {
+    return CL_PROFILING_INFO_NOT_AVAILABLE;
+  }
+
+  if (param_value && param_value_size < sizeof(cl_ulong)) {
+    return CL_INVALID_VALUE;
+  }
+
+  if (param_name < CL_PROFILING_COMMAND_QUEUED ||
+      param_name > CL_PROFILING_COMMAND_COMPLETE) {
+    return CL_INVALID_VALUE;
+  }
+
+  ret_val = event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED];
+  if (ret_val == CL_EVENT_INVALID_TIMESTAMP) {
+    return CL_INVALID_VALUE;
+  }
+
+  if (param_value)
+    *(cl_ulong *)param_value = ret_val;
+  if (param_value_size_ret)
+    *param_value_size_ret = sizeof(cl_ulong);
+  return CL_SUCCESS;
+}
diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 07be706..de18684 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -318,6 +318,7 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
 
       ptr = data->ptr;
       e->status = CL_COMPLETE; // Just set the status, no notify. No one depend on us now.
+      cl_event_update_timestamp(e, CL_QUEUED, CL_COMPLETE);
     } else {
       err = cl_enqueue_handle(data, CL_SUBMITTED); // Submit to get the address.
       if (err != CL_SUCCESS) {
@@ -410,6 +411,7 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
       }
 
       e->status = CL_COMPLETE;
+      cl_event_update_timestamp(e, CL_QUEUED, CL_COMPLETE);
     } else { // May need to wait some event to complete.
       cl_command_queue_enqueue_event(command_queue, e);
     }
@@ -513,6 +515,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
       }
 
       e->status = CL_COMPLETE; // Just set the status, no notify. No one depend on us now.
+      cl_event_update_timestamp(e, CL_QUEUED, CL_COMPLETE);
     } else {
       cl_command_queue_enqueue_event(command_queue, e);
     }
@@ -616,6 +619,7 @@ clEnqueueWriteBuffer(cl_command_queue command_queue,
       }
 
       e->status = CL_COMPLETE; // Just set the status, no notify. No one depend on us now.
+      cl_event_update_timestamp(e, CL_QUEUED, CL_COMPLETE);
     } else {
       cl_command_queue_enqueue_event(command_queue, e);
     }
@@ -765,6 +769,7 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
       }
 
       e->status = CL_COMPLETE; // Just set the status, no notify. No one depend on us now.
+      cl_event_update_timestamp(e, CL_QUEUED, CL_COMPLETE);
     } else {
       cl_command_queue_enqueue_event(command_queue, e);
     }
@@ -916,6 +921,7 @@ clEnqueueWriteBufferRect(cl_command_queue command_queue,
       }
 
       e->status = CL_COMPLETE; // Just set the status, no notify. No one depend on us now.
+      cl_event_update_timestamp(e, CL_QUEUED, CL_COMPLETE);
     } else {
       cl_command_queue_enqueue_event(command_queue, e);
     }
@@ -1601,6 +1607,7 @@ clEnqueueMapImage(cl_command_queue command_queue,
 
       ptr = data->ptr;
       e->status = CL_COMPLETE; // Just set the status, no notify. No one depend on us now.
+      cl_event_update_timestamp(e, CL_QUEUED, CL_COMPLETE);
     } else {
       err = cl_enqueue_handle(data, CL_SUBMITTED); // Submit to get the address.
       if (err != CL_SUCCESS) {
@@ -1798,6 +1805,7 @@ clEnqueueReadImage(cl_command_queue command_queue,
       }
 
       e->status = CL_COMPLETE; // Just set the status, no notify. No one depend on us now.
+      cl_event_update_timestamp(e, CL_QUEUED, CL_COMPLETE);
     } else {
       cl_command_queue_enqueue_event(command_queue, e);
     }
@@ -1950,6 +1958,7 @@ clEnqueueWriteImage(cl_command_queue command_queue,
       }
 
       e->status = CL_COMPLETE; // Just set the status, no notify. No one depend on us now.
+      cl_event_update_timestamp(e, CL_QUEUED, CL_COMPLETE);
     } else {
       cl_command_queue_enqueue_event(command_queue, e);
     }
diff --git a/src/cl_command_queue_enqueue.c b/src/cl_command_queue_enqueue.c
index 32545b3..cf9ee3f 100644
--- a/src/cl_command_queue_enqueue.c
+++ b/src/cl_command_queue_enqueue.c
@@ -135,6 +135,8 @@ LOCAL void
 cl_command_queue_enqueue_event(cl_command_queue queue, cl_event event)
 {
   CL_OBJECT_INC_REF(event);
+  cl_event_update_timestamp(event, CL_QUEUED, event->status);
+
   assert(CL_OBJECT_IS_COMMAND_QUEUE(queue));
   CL_OBJECT_LOCK(queue);
   assert(queue->worker.quit == CL_FALSE);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index b45e2fb..9b2fd32 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -262,11 +262,11 @@ typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
 extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
 
 /* Get a event time stamp */
-typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, cl_gpgpu_event, int, uint64_t*);
+typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, int, uint64_t*);
 extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
 
 /* Get current GPU time stamp */
-typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*);
+typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_driver, uint64_t*);
 extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp;
 
 /* Get current batch buffer handle */
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index fbcd7b6..59605f9 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -572,9 +572,6 @@ cl_enqueue_ndrange(enqueue_data *data, cl_int status)
     void *batch_buf = cl_gpgpu_ref_batch_buf(data->gpgpu);
     cl_gpgpu_sync(batch_buf);
     cl_gpgpu_unref_batch_buf(batch_buf);
-    /* Finished, we can release the gpgpu now. */
-    cl_gpgpu_delete(data->gpgpu);
-    data->gpgpu = NULL;
   }
 
   return err;
@@ -626,12 +623,6 @@ cl_enqueue_delete(enqueue_data *data)
 LOCAL cl_int
 cl_enqueue_handle(enqueue_data *data, cl_int status)
 {
-  /* if need profiling, add the submit timestamp here. */
-  //  if (event && event->event_type != CL_COMMAND_USER &&
-  //      event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
-  //    cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT);
-  //  }
-
   switch (data->type) {
   case EnqueueReturnSuccesss:
     return CL_SUCCESS;
diff --git a/src/cl_event.c b/src/cl_event.c
index 0804dbd..ee2787e 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -23,46 +23,98 @@
 #include <string.h>
 #include <stdio.h>
 
-LOCAL cl_int
-cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
+// TODO: Need to move it to some device related file later.
+static void
+cl_event_update_timestamp_gen(cl_event event, cl_int status)
 {
-  // TODO:
-  return CL_INVALID_VALUE;
+  cl_ulong ts = 0;
+
+  if ((event->exec_data.type == EnqueueCopyBufferRect) ||
+      (event->exec_data.type == EnqueueCopyBuffer) ||
+      (event->exec_data.type == EnqueueCopyImage) ||
+      (event->exec_data.type == EnqueueCopyBufferToImage) ||
+      (event->exec_data.type == EnqueueCopyImageToBuffer) ||
+      (event->exec_data.type == EnqueueNDRangeKernel) ||
+      (event->exec_data.type == EnqueueFillBuffer) ||
+      (event->exec_data.type == EnqueueFillImage)) {
+
+    if (status == CL_QUEUED || status == CL_SUBMITTED) {
+      cl_gpgpu_event_get_gpu_cur_timestamp(event->queue->ctx->drv, &ts);
+      event->timestamp[CL_QUEUED - status] = ts;
+      return;
+    } else if (status == CL_RUNNING) {
+      assert(event->exec_data.gpgpu);
+      return; // Wait for the event complete and get run and complete then.
+    } else {
+      assert(event->exec_data.gpgpu);
+      cl_gpgpu_event_get_exec_timestamp(event->exec_data.gpgpu, 0, &ts);
+      event->timestamp[2] = ts;
+      cl_gpgpu_event_get_exec_timestamp(event->exec_data.gpgpu, 1, &ts);
+      event->timestamp[3] = ts;
+      return;
+    }
+  } else {
+    cl_gpgpu_event_get_gpu_cur_timestamp(event->queue->ctx->drv, &ts);
+    event->timestamp[CL_QUEUED - status] = ts;
+    return;
+  }
 }
 
-LOCAL cl_ulong
-cl_event_get_timestamp_delta(cl_ulong start_timestamp, cl_ulong end_timestamp)
+LOCAL void
+cl_event_update_timestamp(cl_event event, cl_int from, cl_int to)
 {
-  cl_ulong ret_val;
+  int i;
+  cl_bool re_cal = CL_FALSE;
+  cl_ulong ts[4];
 
-  if (end_timestamp > start_timestamp) {
-    ret_val = end_timestamp - start_timestamp;
-  } else {
-    /*if start time stamp is greater than end timstamp then set ret value to max*/
-    ret_val = ((cl_ulong)1 << 32);
-  }
+  assert(from >= to);
+  assert(from >= CL_COMPLETE || from <= CL_QUEUED);
+  assert(to >= CL_COMPLETE || to <= CL_QUEUED);
 
-  return ret_val;
-}
+  if (event->event_type == CL_COMMAND_USER)
+    return;
 
-LOCAL cl_ulong
-cl_event_get_start_timestamp(cl_event event)
-{
-  cl_ulong ret_val;
+  assert(event->queue);
+  if ((event->queue->props & CL_QUEUE_PROFILING_ENABLE) == 0)
+    return;
 
-  ret_val = cl_event_get_timestamp_delta(event->timestamp[0], event->timestamp[2]);
+  i = CL_QUEUED - from;
+  if (event->timestamp[i] == CL_EVENT_INVALID_TIMESTAMP)
+    cl_event_update_timestamp_gen(event, from);
+  i++;
 
-  return ret_val;
-}
+  for (; i <= CL_QUEUED - to; i++) {
+    cl_event_update_timestamp_gen(event, CL_QUEUED - i);
+  }
 
-LOCAL cl_ulong
-cl_event_get_end_timestamp(cl_event event)
-{
-  cl_ulong ret_val;
+  if (to == CL_COMPLETE) {
+    // TODO: Need to set the CL_PROFILING_COMMAND_COMPLETE when enable child enqueue.
+    // Just a duplicate of event complete time now.
+    event->timestamp[4] = event->timestamp[3];
+
+    /* If timestamp overflow, set queued time to 0 and re-calculate. */
+    for (i = 0; i < 4; i++) {
+      if (event->timestamp[i + 1] < event->timestamp[i]) {
+        re_cal = CL_TRUE;
+        break;
+      }
+    }
 
-  ret_val = cl_event_get_timestamp_delta(event->timestamp[0], event->timestamp[3]);
+    if (re_cal) {
+      for (i = 3; i >= 0; i--) {
+        if (event->timestamp[i + 1] < event->timestamp[i]) { //overflow
+          ts[i] = event->timestamp[i + 1] + (CL_EVENT_INVALID_TIMESTAMP - event->timestamp[i]);
+        } else {
+          ts[i] = event->timestamp[i + 1] - event->timestamp[i];
+        }
+      }
 
-  return ret_val;
+      event->timestamp[0] = 0;
+      for (i = 1; i < 5; i++) {
+        event->timestamp[i] = event->timestamp[i - 1] + ts[i - 1];
+      }
+    }
+  }
 }
 
 LOCAL void
@@ -88,6 +140,7 @@ static cl_event
 cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type,
              cl_uint num_events, cl_event *event_list)
 {
+  int i;
   cl_event e = cl_calloc(1, sizeof(_cl_event));
   if (e == NULL)
     return NULL;
@@ -115,6 +168,10 @@ cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type,
 
   e->depend_events = event_list;
   e->depend_event_num = num_events;
+  for (i = 0; i < 4; i++) {
+    e->timestamp[i] = CL_EVENT_INVALID_TIMESTAMP;
+  }
+
   return e;
 }
 
@@ -297,6 +354,7 @@ cl_event_set_status(cl_event event, cl_int status)
   list_head *pos;
   cl_bool notify_queue = CL_FALSE;
   cl_event_user_callback cb;
+  cl_int old_status;
 
   assert(event);
 
@@ -317,6 +375,7 @@ cl_event_set_status(cl_event event, cl_int status)
     return CL_INVALID_OPERATION;
   }
 
+  old_status = event->status;
   event->status = status;
 
   /* Call all the callbacks. */
@@ -360,6 +419,9 @@ cl_event_set_status(cl_event event, cl_int status)
     notify_queue = CL_TRUE;
   }
 
+  if (status >= CL_COMPLETE)
+    cl_event_update_timestamp(event, old_status, status);
+
   CL_OBJECT_UNLOCK(event);
 
   /* Need to notify all the command queue within the same context. */
diff --git a/src/cl_event.h b/src/cl_event.h
index f67299c..9df5ab6 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -48,8 +48,7 @@ typedef struct _cl_event {
   cl_uint depend_event_num;   /* The depend events number. */
   list_head callbacks;        /* The events The event callback functions */
   list_head enqueue_node;     /* The node in the enqueue list. */
-  cl_ulong timestamp[4];      /* The time stamps for profiling. */
-  cl_ulong queued_timestamp;
+  cl_ulong timestamp[5];      /* The time stamps for profiling. */
   enqueue_data exec_data; /* Context for execute this event. */
 } _cl_event;
 
@@ -62,6 +61,8 @@ typedef struct _cl_event {
 #define CL_EVENT_IS_BARRIER(E) (E->event_type == CL_COMMAND_BARRIER)
 #define CL_EVENT_IS_USER(E) (E->event_type == CL_COMMAND_USER)
 
+#define CL_EVENT_INVALID_TIMESTAMP 0xFFFFFFFFFFFFFFFF
+
 /* Create a new event object */
 extern cl_event cl_event_create(cl_context ctx, cl_command_queue queue, cl_uint num_events,
                                 const cl_event *event_list, cl_command_type type, cl_int *errcode_ret);
@@ -78,11 +79,8 @@ extern cl_int cl_event_set_callback(cl_event event, cl_int exec_type,
                                     cl_event_notify_cb pfn_notify, void *user_data);
 extern cl_int cl_event_wait_for_events_list(cl_uint num_events, const cl_event *event_list);
 extern cl_int cl_event_wait_for_event_ready(cl_event event);
-extern cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp, cl_ulong end_timestamp);
-extern cl_ulong cl_event_get_start_timestamp(cl_event event);
-extern cl_ulong cl_event_get_end_timestamp(cl_event event);
-extern cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
 extern cl_event cl_event_create_marker_or_barrier(cl_command_queue queue, cl_uint num_events_in_wait_list,
                                                   const cl_event *event_wait_list, cl_bool is_barrier,
                                                   cl_int* error);
+extern void cl_event_update_timestamp(cl_event event, cl_int from_status, cl_int to_status);
 #endif /* __CL_EVENT_H__ */
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index d56d35d..e35b9d1 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -2277,10 +2277,10 @@ intel_gpgpu_read_ts_reg_baytrail(drm_intel_bufmgr *bufmgr)
 
 /* We want to get the current time of GPU. */
 static void
-intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
+intel_gpgpu_event_get_gpu_cur_timestamp(intel_driver_t* gen_driver, uint64_t* ret_ts)
 {
   uint64_t result = 0;
-  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+  drm_intel_bufmgr *bufmgr = gen_driver->bufmgr;
 
   /* Get the ts that match the bspec */
   result = intel_gpgpu_read_ts_reg(bufmgr);
@@ -2292,15 +2292,13 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
 
 /* Get the GPU execute time. */
 static void
-intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
-				     int index, uint64_t* ret_ts)
+intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, int index, uint64_t* ret_ts)
 {
   uint64_t result = 0;
-
-  assert(event->ts_buf != NULL);
+  assert(gpgpu->time_stamp_b.bo);
   assert(index == 0 || index == 1);
-  drm_intel_gem_bo_map_gtt(event->ts_buf);
-  uint64_t* ptr = event->ts_buf->virtual;
+  drm_intel_gem_bo_map_gtt(gpgpu->time_stamp_b.bo);
+  uint64_t* ptr = gpgpu->time_stamp_b.bo->virtual;
   result = ptr[index];
 
   /* According to BSpec, the timestamp counter should be 36 bits,
@@ -2311,7 +2309,7 @@ intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
   result = (result & 0x0FFFFFFFF) * 80; //convert to nanoseconds
   *ret_ts = result;
 
-  drm_intel_gem_bo_unmap_gtt(event->ts_buf);
+  drm_intel_gem_bo_unmap_gtt(gpgpu->time_stamp_b.bo);
 }
 
 static int
-- 
2.7.4





More information about the Beignet mailing list