[Beignet] [PATCH V2 1/2] Using the PIPE_CONTROL to implement get time stamp in gen backend

Thu Oct 17 21:19:08 PDT 2013

Pushed, thanks.

On Fri, Oct 18, 2013 at 10:19:51AM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> We use PIPE_CONTROL to get the time stamps from the GPU just after batch
> start and just before batch flush. The first one is used to calculate the
> CL_PROFILING_COMMAND_START time and the second one to calculate
> the CL_PROFILING_COMMAND_END time.
> There are 2 limitations here:
> 1. The end time stamp is taken just before the FLUSH, so the flush time
>    is not included, which costs some accuracy. Because
>    we do not know which event will be used for profiling
>    when it is created, adding another flush for the end time stamp may
>    add some overhead.
> 2. The CPU and GPU clocks cannot be synchronized correctly yet. So the
>    times of CL_PROFILING_COMMAND_QUEUED and CL_PROFILING_COMMAND_SUBMIT,
>    which happen on the CPU side, cannot be calculated correctly against the
>    same time base as the GPU. So for now we simply set them to
>    CL_PROFILING_COMMAND_START. For events not involving GPU
>    operations, such as ReadBuffer, all the times are 0 for now.
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  src/cl_command_queue_gen7.c |    5 +++-
>  src/cl_driver.h             |    5 +++-
>  src/cl_driver_defs.c        |    1 +
>  src/cl_event.c              |   22 ++++++++++++++++
>  src/cl_event.h              |    2 ++
>  src/intel/intel_defines.h   |    4 +++
>  src/intel/intel_gpgpu.c     |   60 ++++++++++++++++++++++++++++++++++++++++++-
>  7 files changed, 96 insertions(+), 3 deletions(-)
> 
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index be7bcef..65f8e17 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -287,7 +287,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>    }
>  
>    /* Setup the kernel */
> -  cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
> +  if (queue->props & CL_QUEUE_PROFILING_ENABLE)
> +    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
> +  else
> +    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);
>  
>    /* Bind user buffers */
>    cl_command_queue_bind_surface(queue, ker);
> diff --git a/src/cl_driver.h b/src/cl_driver.h
> index 100b38d..5ed4fb1 100644
> --- a/src/cl_driver.h
> +++ b/src/cl_driver.h
> @@ -129,7 +129,7 @@ typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
>  extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
>  
>  /* Configure internal state */
> -typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
> +typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry, int profiling);
>  extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
>  
>  /* Set the buffer object where to report performance counters */
> @@ -191,6 +191,9 @@ extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume;
>  typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
>  extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
>  
> +/* Get a event time stamp */
> +typedef void (cl_gpgpu_event_get_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
> +extern cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp;
>  
>  /* Will spawn all threads */
>  typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
> diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
> index ac4ff7a..fe38ba2 100644
> --- a/src/cl_driver_defs.c
> +++ b/src/cl_driver_defs.c
> @@ -78,4 +78,5 @@ LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
>  LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
>  LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
>  LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
> +LOCAL cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp = NULL;
>  
> diff --git a/src/cl_event.c b/src/cl_event.c
> index 918e245..212f1ee 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -490,3 +490,25 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
>    cl_event_set_status(*event, CL_COMPLETE);
>    return CL_SUCCESS;
>  }
> +
> +cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val)
> +{
> +  if (!event->gpgpu_event) {
> +    /* Some event like read buffer do not need GPU involved, so
> +       we just return all the profiling to 0 now. */
> +    *ret_val = 0;
> +    return CL_SUCCESS;
> +  }
> +
> +  if(param_name == CL_PROFILING_COMMAND_START ||
> +     param_name == CL_PROFILING_COMMAND_QUEUED ||
> +     param_name == CL_PROFILING_COMMAND_SUBMIT) {
> +    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 0, ret_val);
> +    return CL_SUCCESS;
> +  } else if (param_name == CL_PROFILING_COMMAND_END) {
> +    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 1, ret_val);
> +    return CL_SUCCESS;
> +  } else {
> +    return CL_INVALID_VALUE;
> +  }
> +}
> diff --git a/src/cl_event.h b/src/cl_event.h
> index 7dde24b..722486a 100644
> --- a/src/cl_event.h
> +++ b/src/cl_event.h
> @@ -90,5 +90,7 @@ void cl_event_set_status(cl_event, cl_int);
>  void cl_event_update_status(cl_event);
>  /* Create the marker event */
>  cl_int cl_event_marker(cl_command_queue, cl_event*);
> +/* Do the event profiling */
> +cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val);
>  #endif /* __CL_EVENT_H__ */
>  
> diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
> index 19bdbed..e5015ec 100644
> --- a/src/intel/intel_defines.h
> +++ b/src/intel/intel_defines.h
> @@ -62,6 +62,7 @@
>  #define CMD_MEDIA_GATEWAY_STATE                 CMD(2, 0, 3)
>  #define CMD_MEDIA_STATE_FLUSH                   CMD(2, 0, 4)
>  #define CMD_GPGPU_WALKER                        CMD(2, 1, 5)
> +#define CMD_PIPE_CONTROL                        CMD(3, 2, 0)
>  
>  #define CMD_LOAD_REGISTER_IMM                   (0x22 << 23)
>  
> @@ -300,6 +301,9 @@
>  #define GEN7_PIPE_CONTROL_INSTRUCTION_GFX 0x3
>  #define GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL 0x2
>  #define GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL 0x0
> +#define GEN7_PIPE_CONTROL_WRITE_TIMESTAMP        (3 << 14)
> +#define GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE       (1 << 2)
> +
>  
>  #define GEN_MAPFILTER_NEAREST        0x0
>  #define GEN_MAPFILTER_LINEAR         0x1
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
> index 21cf09b..f3de186 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -60,6 +60,7 @@ typedef struct surface_heap {
>  typedef struct intel_event {
>    intel_batchbuffer_t *batch;
>    drm_intel_bo* buffer;
> +  drm_intel_bo* ts_buf;
>    int status;
>  } intel_event_t;
>  
> @@ -98,6 +99,7 @@ struct intel_gpgpu
>    struct { drm_intel_bo *bo; } perf_b;
>    struct { drm_intel_bo *bo; } scratch_b;
>    struct { drm_intel_bo *bo; } constant_b;
> +  struct { drm_intel_bo *bo; } time_stamp_b;  /* time stamp buffer */
>  
>    uint32_t per_thread_scratch;
>    struct {
> @@ -123,6 +125,8 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
>  {
>    if (gpgpu == NULL)
>      return;
> +  if(gpgpu->time_stamp_b.bo)
> +    drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
>    if (gpgpu->surface_heap_b.bo)
>      drm_intel_bo_unreference(gpgpu->surface_heap_b.bo);
>    if (gpgpu->idrt_b.bo)
> @@ -280,6 +284,21 @@ static const uint32_t gpgpu_l3_config_reg2[] = {
>    0x00204080, 0x00244890, 0x00284490, 0x002444A0
>  };
>  
> +/* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. */
> +static void
> +intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx)
> +{
> +  BEGIN_BATCH(gpgpu->batch, 5);
> +  OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5-2));
> +  OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP);
> +  OUT_RELOC(gpgpu->batch, gpgpu->time_stamp_b.bo,
> +          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> +          GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t));
> +  OUT_BATCH(gpgpu->batch, 0);
> +  OUT_BATCH(gpgpu->batch, 0);
> +  ADVANCE_BATCH();
> +}
> +
>  static void
>  intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
>  {
> @@ -345,11 +364,19 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
>      OUT_BATCH(gpgpu->batch, 0);
>      ADVANCE_BATCH(gpgpu->batch);
>    }
> +
> +  /* Insert PIPE_CONTROL for time stamp of start*/
> +  if (gpgpu->time_stamp_b.bo)
> +    intel_gpgpu_write_timestamp(gpgpu, 0);
>  }
>  
>  static void
>  intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
>  {
> +  /* Insert PIPE_CONTROL for time stamp of end*/
> +  if (gpgpu->time_stamp_b.bo)
> +    intel_gpgpu_write_timestamp(gpgpu, 1);
> +
>    /* Insert the performance counter command */
>    if (gpgpu->perf_b.bo) {
>      BEGIN_BATCH(gpgpu->batch, 3);
> @@ -394,7 +421,8 @@ intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
>  static void
>  intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
>                         uint32_t max_threads,
> -                       uint32_t size_cs_entry)
> +                       uint32_t size_cs_entry,
> +                       int profiling)
>  {
>    drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
>    drm_intel_bo *bo;
> @@ -410,6 +438,16 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
>    gpgpu->urb.size_cs_entry = size_cs_entry;
>    gpgpu->max_threads = max_threads;
>  
> +  /* Set the profile buffer*/
> +  if(gpgpu->time_stamp_b.bo)
> +    dri_bo_unreference(gpgpu->time_stamp_b.bo);
> +  gpgpu->time_stamp_b.bo = NULL;
> +  if (profiling) {
> +    bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096);
> +    assert(bo);
> +    gpgpu->time_stamp_b.bo = bo;
> +  }
> +
>    /* Constant URB  buffer */
>    if(gpgpu->curbe_b.bo)
>      dri_bo_unreference(gpgpu->curbe_b.bo);
> @@ -926,6 +964,11 @@ intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
>    if(event->buffer != NULL)
>      drm_intel_bo_reference(event->buffer);
>  
> +  if(gpgpu->time_stamp_b.bo) {
> +    event->ts_buf = gpgpu->time_stamp_b.bo;
> +    drm_intel_bo_reference(event->ts_buf);
> +  }
> +
>  exit:
>    return event;
>  error:
> @@ -988,9 +1031,23 @@ intel_gpgpu_event_delete(intel_event_t *event)
>    assert(event->batch == NULL);   //This command must have been flushed.
>    if(event->buffer)
>      drm_intel_bo_unreference(event->buffer);
> +  if(event->ts_buf)
> +    drm_intel_bo_unreference(event->ts_buf);
>    cl_free(event);
>  }
>  
> +static void
> +intel_gpgpu_event_get_timestamp(intel_event_t *event, int index, uint64_t* ret_ts)
> +{
> +  assert(event->ts_buf != NULL);
> +  assert(index == 0 || index == 1);
> +  drm_intel_gem_bo_map_gtt(event->ts_buf);
> +  uint64_t* ptr = event->ts_buf->virtual;
> +
> +  *ret_ts = ptr[index] * 80; //convert to nanoseconds
> +  drm_intel_gem_bo_unmap_gtt(event->ts_buf);
> +}
> +
>  LOCAL void
>  intel_set_gpgpu_callbacks(void)
>  {
> @@ -1018,5 +1075,6 @@ intel_set_gpgpu_callbacks(void)
>    cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
>    cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
>    cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
> +  cl_gpgpu_event_get_timestamp = (cl_gpgpu_event_get_timestamp_cb *)intel_gpgpu_event_get_timestamp;
>  }
>  
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet