[Beignet] [PATCH 2/2] Using the PIPE_CONTROL to implement get time stamp in gen backend

Tue Oct 15 23:42:51 PDT 2013

From: Junyan He <junyan.he at linux.intel.com>

We use PIPE_CONTROL to get the time stamps from the GPU just after batch
start and just before batch flush. We use the first one to calculate the
CL_PROFILING_COMMAND_START time and the second one to calculate
the CL_PROFILING_COMMAND_END time.
There are 2 limitations here:
1. The end time stamp is taken just before the FLUSH, so the flush time
   is not included, which loses some accuracy. Because
   we do not know which event will be used for profiling
   when it is created, adding another flush for the end time stamp may
   add some overhead.
2. The CPU and GPU clocks cannot be synchronized correctly yet. So the
   CL_PROFILING_COMMAND_QUEUED and CL_PROFILING_COMMAND_SUBMIT times,
   which are taken on the CPU side, cannot be calculated correctly against
   the same time base as the GPU. So we simply set them to
   CL_PROFILING_COMMAND_START for now. For events not involving GPU
   operations, such as ReadBuffer, all the times are 0 for now.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 src/cl_command_queue_gen7.c |    5 +++-
 src/cl_driver.h             |    5 +++-
 src/cl_driver_defs.c        |    1 +
 src/cl_event.c              |   23 +++++++++++++++++
 src/cl_event.h              |    2 ++
 src/intel/intel_defines.h   |    4 +++
 src/intel/intel_gpgpu.c     |   60 ++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 97 insertions(+), 3 deletions(-)

diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index be7bcef..65f8e17 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -287,7 +287,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   }
 
   /* Setup the kernel */
-  cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
+  if (queue->props & CL_QUEUE_PROFILING_ENABLE)
+    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
+  else
+    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);
 
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 100b38d..5ed4fb1 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -129,7 +129,7 @@ typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
 extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
 
 /* Configure internal state */
-typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
+typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry, int profiling);
 extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
 
 /* Set the buffer object where to report performance counters */
@@ -191,6 +191,9 @@ extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume;
 typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
 extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
 
+/* Get an event time stamp */
+typedef void (cl_gpgpu_event_get_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
+extern cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp;
 
 /* Will spawn all threads */
 typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index ac4ff7a..fe38ba2 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -78,4 +78,5 @@ LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
 LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
 LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
 LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
+LOCAL cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp = NULL;
 
diff --git a/src/cl_event.c b/src/cl_event.c
index 918e245..ef0c82b 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -490,3 +490,26 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
   cl_event_set_status(*event, CL_COMPLETE);
   return CL_SUCCESS;
 }
+
+cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val)
+{
+  /* An event without a GPU-side event (e.g. read buffer) has no time
+     stamps to read, so every profiling query reports 0 for now. */
+  if (!event->gpgpu_event) {
+    *ret_val = 0;
+    return CL_SUCCESS;
+  }
+  switch (param_name) {
+  case CL_PROFILING_COMMAND_QUEUED:  /* QUEUED and SUBMIT report the */
+  case CL_PROFILING_COMMAND_SUBMIT:  /* same value as START (slot 0). */
+  case CL_PROFILING_COMMAND_START:
+    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 0, ret_val);
+    return CL_SUCCESS;
+  case CL_PROFILING_COMMAND_END:     /* END is read from slot 1. */
+    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 1, ret_val);
+    return CL_SUCCESS;
+  default:                           /* unknown profiling query */
+    return CL_INVALID_VALUE;
+  }
+}
+
diff --git a/src/cl_event.h b/src/cl_event.h
index 7dde24b..722486a 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -90,5 +90,7 @@ void cl_event_set_status(cl_event, cl_int);
 void cl_event_update_status(cl_event);
 /* Create the marker event */
 cl_int cl_event_marker(cl_command_queue, cl_event*);
+/* Do the event profiling */
+cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val);
 #endif /* __CL_EVENT_H__ */
 
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
index 19bdbed..e5015ec 100644
--- a/src/intel/intel_defines.h
+++ b/src/intel/intel_defines.h
@@ -62,6 +62,7 @@
 #define CMD_MEDIA_GATEWAY_STATE                 CMD(2, 0, 3)
 #define CMD_MEDIA_STATE_FLUSH                   CMD(2, 0, 4)
 #define CMD_GPGPU_WALKER                        CMD(2, 1, 5)
+#define CMD_PIPE_CONTROL                        CMD(3, 2, 0)
 
 #define CMD_LOAD_REGISTER_IMM                   (0x22 << 23)
 
@@ -300,6 +301,9 @@
 #define GEN7_PIPE_CONTROL_INSTRUCTION_GFX 0x3
 #define GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL 0x2
 #define GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL 0x0
+#define GEN7_PIPE_CONTROL_WRITE_TIMESTAMP        (3 << 14)
+#define GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE       (1 << 2)
+
 
 #define GEN_MAPFILTER_NEAREST        0x0
 #define GEN_MAPFILTER_LINEAR         0x1
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 21cf09b..f3de186 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -60,6 +60,7 @@ typedef struct surface_heap {
 typedef struct intel_event {
   intel_batchbuffer_t *batch;
   drm_intel_bo* buffer;
+  drm_intel_bo* ts_buf;  /* time stamp bo taken from the gpgpu; the event holds its own reference */
   int status;
 } intel_event_t;
 
@@ -98,6 +99,7 @@ struct intel_gpgpu
   struct { drm_intel_bo *bo; } perf_b;
   struct { drm_intel_bo *bo; } scratch_b;
   struct { drm_intel_bo *bo; } constant_b;
+  struct { drm_intel_bo *bo; } time_stamp_b;  /* time stamp buffer */
 
   uint32_t per_thread_scratch;
   struct {
@@ -123,6 +125,8 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
 {
   if (gpgpu == NULL)
     return;
+  if(gpgpu->time_stamp_b.bo)
+    drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
   if (gpgpu->surface_heap_b.bo)
     drm_intel_bo_unreference(gpgpu->surface_heap_b.bo);
   if (gpgpu->idrt_b.bo)
@@ -280,6 +284,21 @@ static const uint32_t gpgpu_l3_config_reg2[] = {
   0x00204080, 0x00244890, 0x00284490, 0x002444A0
 };
 
+/* Emit a PIPE_CONTROL writing the GPU time stamp to 64-bit slot idx (0: start, 1: end) of time_stamp_b.bo. */
+static void
+intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx)
+{
+  BEGIN_BATCH(gpgpu->batch, 5);  /* the command below is 5 dwords long */
+  OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5-2));
+  OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP);
+  OUT_RELOC(gpgpu->batch, gpgpu->time_stamp_b.bo,
+          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+          GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t));  /* byte offset of slot idx */
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  ADVANCE_BATCH(gpgpu->batch);  /* pass the batch, as the other ADVANCE_BATCH call site does */
+}
+
 static void
 intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
 {
@@ -345,11 +364,19 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
     OUT_BATCH(gpgpu->batch, 0);
     ADVANCE_BATCH(gpgpu->batch);
   }
+
+  /* Insert PIPE_CONTROL for time stamp of start*/
+  if (gpgpu->time_stamp_b.bo)
+    intel_gpgpu_write_timestamp(gpgpu, 0);
 }
 
 static void
 intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
 {
+  /* Insert PIPE_CONTROL for time stamp of end*/
+  if (gpgpu->time_stamp_b.bo)
+    intel_gpgpu_write_timestamp(gpgpu, 1);
+
   /* Insert the performance counter command */
   if (gpgpu->perf_b.bo) {
     BEGIN_BATCH(gpgpu->batch, 3);
@@ -394,7 +421,8 @@ intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
 static void
 intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
                        uint32_t max_threads,
-                       uint32_t size_cs_entry)
+                       uint32_t size_cs_entry,
+                       int profiling)
 {
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   drm_intel_bo *bo;
@@ -410,6 +438,16 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->urb.size_cs_entry = size_cs_entry;
   gpgpu->max_threads = max_threads;
 
+  /* Set the profile buffer*/
+  if(gpgpu->time_stamp_b.bo)
+    dri_bo_unreference(gpgpu->time_stamp_b.bo);
+  gpgpu->time_stamp_b.bo = NULL;
+  if (profiling) {
+    bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096);
+    assert(bo);
+    gpgpu->time_stamp_b.bo = bo;
+  }
+
   /* Constant URB  buffer */
   if(gpgpu->curbe_b.bo)
     dri_bo_unreference(gpgpu->curbe_b.bo);
@@ -926,6 +964,11 @@ intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
   if(event->buffer != NULL)
     drm_intel_bo_reference(event->buffer);
 
+  if(gpgpu->time_stamp_b.bo) {
+    event->ts_buf = gpgpu->time_stamp_b.bo;
+    drm_intel_bo_reference(event->ts_buf);
+  }
+
 exit:
   return event;
 error:
@@ -988,9 +1031,23 @@ intel_gpgpu_event_delete(intel_event_t *event)
   assert(event->batch == NULL);   //This command must have been flushed.
   if(event->buffer)
     drm_intel_bo_unreference(event->buffer);
+  if(event->ts_buf)
+    drm_intel_bo_unreference(event->ts_buf);
   cl_free(event);
 }
 
+static void
+intel_gpgpu_event_get_timestamp(intel_event_t *event, int index, uint64_t* ret_ts)
+{
+  const uint64_t *stamps;  /* slot 0: batch start stamp, slot 1: batch end stamp */
+  assert(event->ts_buf != NULL);
+  assert(index == 0 || index == 1);
+  drm_intel_gem_bo_map_gtt(event->ts_buf);
+  stamps = event->ts_buf->virtual;
+  *ret_ts = stamps[index] * 80;  /* scale the raw stamp to nanoseconds */
+  drm_intel_gem_bo_unmap_gtt(event->ts_buf);
+}
+
 LOCAL void
 intel_set_gpgpu_callbacks(void)
 {
@@ -1018,5 +1075,6 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
   cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
   cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
+  cl_gpgpu_event_get_timestamp = (cl_gpgpu_event_get_timestamp_cb *)intel_gpgpu_event_get_timestamp;
 }
 
-- 
1.7.9.5