[Beignet] [PATCH V2 1/2] Using the PIPE_CONTROL to implement get time stamp in gen backend
Zhigang Gong
zhigang.gong at linux.intel.com
Thu Oct 17 21:19:08 PDT 2013
Pushed, thanks.
On Fri, Oct 18, 2013 at 10:19:51AM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
>
> We use PIPE_CONTROL to get the time stamps from the GPU just after batch
> start and just before batch flush. The first one is used to calculate the
> CL_PROFILING_COMMAND_START time and the second one to calculate
> the CL_PROFILING_COMMAND_END time.
> There are 2 limitations here:
> 1. The end time stamp is taken just before the FLUSH, so the flush time
> is not included, which costs some accuracy. Because
> we do not know, when an event is created, whether it will be used for
> profiling, adding another flush for the end time stamp may
> add some overhead.
> 2. The CPU and GPU clocks cannot be synchronized correctly yet. So the
> times for CL_PROFILING_COMMAND_QUEUED and CL_PROFILING_COMMAND_SUBMIT,
> which happen on the CPU side, cannot be calculated correctly against the
> same time base as the GPU. For now we simply set them to
> CL_PROFILING_COMMAND_START. For events that do not involve GPU
> operations, such as ReadBuffer, all the times are 0 for now.
>
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
> src/cl_command_queue_gen7.c | 5 +++-
> src/cl_driver.h | 5 +++-
> src/cl_driver_defs.c | 1 +
> src/cl_event.c | 22 ++++++++++++++++
> src/cl_event.h | 2 ++
> src/intel/intel_defines.h | 4 +++
> src/intel/intel_gpgpu.c | 60 ++++++++++++++++++++++++++++++++++++++++++-
> 7 files changed, 96 insertions(+), 3 deletions(-)
>
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index be7bcef..65f8e17 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -287,7 +287,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
> }
>
> /* Setup the kernel */
> - cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
> + if (queue->props & CL_QUEUE_PROFILING_ENABLE)
> + cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
> + else
> + cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);
>
> /* Bind user buffers */
> cl_command_queue_bind_surface(queue, ker);
> diff --git a/src/cl_driver.h b/src/cl_driver.h
> index 100b38d..5ed4fb1 100644
> --- a/src/cl_driver.h
> +++ b/src/cl_driver.h
> @@ -129,7 +129,7 @@ typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
> extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
>
> /* Configure internal state */
> -typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
> +typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry, int profiling);
> extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
>
> /* Set the buffer object where to report performance counters */
> @@ -191,6 +191,9 @@ extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume;
> typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
> extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
>
> +/* Get a event time stamp */
> +typedef void (cl_gpgpu_event_get_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
> +extern cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp;
>
> /* Will spawn all threads */
> typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
> diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
> index ac4ff7a..fe38ba2 100644
> --- a/src/cl_driver_defs.c
> +++ b/src/cl_driver_defs.c
> @@ -78,4 +78,5 @@ LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
> LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
> LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
> LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
> +LOCAL cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp = NULL;
>
> diff --git a/src/cl_event.c b/src/cl_event.c
> index 918e245..212f1ee 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -490,3 +490,25 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
> cl_event_set_status(*event, CL_COMPLETE);
> return CL_SUCCESS;
> }
> +
> +cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val)
> +{
> + if (!event->gpgpu_event) {
> + /* Some event like read buffer do not need GPU involved, so
> + we just return all the profiling to 0 now. */
> + *ret_val = 0;
> + return CL_SUCCESS;
> + }
> +
> + if(param_name == CL_PROFILING_COMMAND_START ||
> + param_name == CL_PROFILING_COMMAND_QUEUED ||
> + param_name == CL_PROFILING_COMMAND_SUBMIT) {
> + cl_gpgpu_event_get_timestamp(event->gpgpu_event, 0, ret_val);
> + return CL_SUCCESS;
> + } else if (param_name == CL_PROFILING_COMMAND_END) {
> + cl_gpgpu_event_get_timestamp(event->gpgpu_event, 1, ret_val);
> + return CL_SUCCESS;
> + } else {
> + return CL_INVALID_VALUE;
> + }
> +}
> diff --git a/src/cl_event.h b/src/cl_event.h
> index 7dde24b..722486a 100644
> --- a/src/cl_event.h
> +++ b/src/cl_event.h
> @@ -90,5 +90,7 @@ void cl_event_set_status(cl_event, cl_int);
> void cl_event_update_status(cl_event);
> /* Create the marker event */
> cl_int cl_event_marker(cl_command_queue, cl_event*);
> +/* Do the event profiling */
> +cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val);
> #endif /* __CL_EVENT_H__ */
>
> diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
> index 19bdbed..e5015ec 100644
> --- a/src/intel/intel_defines.h
> +++ b/src/intel/intel_defines.h
> @@ -62,6 +62,7 @@
> #define CMD_MEDIA_GATEWAY_STATE CMD(2, 0, 3)
> #define CMD_MEDIA_STATE_FLUSH CMD(2, 0, 4)
> #define CMD_GPGPU_WALKER CMD(2, 1, 5)
> +#define CMD_PIPE_CONTROL CMD(3, 2, 0)
>
> #define CMD_LOAD_REGISTER_IMM (0x22 << 23)
>
> @@ -300,6 +301,9 @@
> #define GEN7_PIPE_CONTROL_INSTRUCTION_GFX 0x3
> #define GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL 0x2
> #define GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL 0x0
> +#define GEN7_PIPE_CONTROL_WRITE_TIMESTAMP (3 << 14)
> +#define GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE (1 << 2)
> +
>
> #define GEN_MAPFILTER_NEAREST 0x0
> #define GEN_MAPFILTER_LINEAR 0x1
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
> index 21cf09b..f3de186 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -60,6 +60,7 @@ typedef struct surface_heap {
> typedef struct intel_event {
> intel_batchbuffer_t *batch;
> drm_intel_bo* buffer;
> + drm_intel_bo* ts_buf;
> int status;
> } intel_event_t;
>
> @@ -98,6 +99,7 @@ struct intel_gpgpu
> struct { drm_intel_bo *bo; } perf_b;
> struct { drm_intel_bo *bo; } scratch_b;
> struct { drm_intel_bo *bo; } constant_b;
> + struct { drm_intel_bo *bo; } time_stamp_b; /* time stamp buffer */
>
> uint32_t per_thread_scratch;
> struct {
> @@ -123,6 +125,8 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
> {
> if (gpgpu == NULL)
> return;
> + if(gpgpu->time_stamp_b.bo)
> + drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
> if (gpgpu->surface_heap_b.bo)
> drm_intel_bo_unreference(gpgpu->surface_heap_b.bo);
> if (gpgpu->idrt_b.bo)
> @@ -280,6 +284,21 @@ static const uint32_t gpgpu_l3_config_reg2[] = {
> 0x00204080, 0x00244890, 0x00284490, 0x002444A0
> };
>
> +/* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. */
> +static void
> +intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx)
> +{
> + BEGIN_BATCH(gpgpu->batch, 5);
> + OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5-2));
> + OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP);
> + OUT_RELOC(gpgpu->batch, gpgpu->time_stamp_b.bo,
> + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> + GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t));
> + OUT_BATCH(gpgpu->batch, 0);
> + OUT_BATCH(gpgpu->batch, 0);
> + ADVANCE_BATCH();
> +}
> +
> static void
> intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
> {
> @@ -345,11 +364,19 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
> OUT_BATCH(gpgpu->batch, 0);
> ADVANCE_BATCH(gpgpu->batch);
> }
> +
> + /* Insert PIPE_CONTROL for time stamp of start*/
> + if (gpgpu->time_stamp_b.bo)
> + intel_gpgpu_write_timestamp(gpgpu, 0);
> }
>
> static void
> intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
> {
> + /* Insert PIPE_CONTROL for time stamp of end*/
> + if (gpgpu->time_stamp_b.bo)
> + intel_gpgpu_write_timestamp(gpgpu, 1);
> +
> /* Insert the performance counter command */
> if (gpgpu->perf_b.bo) {
> BEGIN_BATCH(gpgpu->batch, 3);
> @@ -394,7 +421,8 @@ intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
> static void
> intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
> uint32_t max_threads,
> - uint32_t size_cs_entry)
> + uint32_t size_cs_entry,
> + int profiling)
> {
> drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
> drm_intel_bo *bo;
> @@ -410,6 +438,16 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
> gpgpu->urb.size_cs_entry = size_cs_entry;
> gpgpu->max_threads = max_threads;
>
> + /* Set the profile buffer*/
> + if(gpgpu->time_stamp_b.bo)
> + dri_bo_unreference(gpgpu->time_stamp_b.bo);
> + gpgpu->time_stamp_b.bo = NULL;
> + if (profiling) {
> + bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096);
> + assert(bo);
> + gpgpu->time_stamp_b.bo = bo;
> + }
> +
> /* Constant URB buffer */
> if(gpgpu->curbe_b.bo)
> dri_bo_unreference(gpgpu->curbe_b.bo);
> @@ -926,6 +964,11 @@ intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
> if(event->buffer != NULL)
> drm_intel_bo_reference(event->buffer);
>
> + if(gpgpu->time_stamp_b.bo) {
> + event->ts_buf = gpgpu->time_stamp_b.bo;
> + drm_intel_bo_reference(event->ts_buf);
> + }
> +
> exit:
> return event;
> error:
> @@ -988,9 +1031,23 @@ intel_gpgpu_event_delete(intel_event_t *event)
> assert(event->batch == NULL); //This command must have been flushed.
> if(event->buffer)
> drm_intel_bo_unreference(event->buffer);
> + if(event->ts_buf)
> + drm_intel_bo_unreference(event->ts_buf);
> cl_free(event);
> }
>
> +static void
> +intel_gpgpu_event_get_timestamp(intel_event_t *event, int index, uint64_t* ret_ts)
> +{
> + assert(event->ts_buf != NULL);
> + assert(index == 0 || index == 1);
> + drm_intel_gem_bo_map_gtt(event->ts_buf);
> + uint64_t* ptr = event->ts_buf->virtual;
> +
> + *ret_ts = ptr[index] * 80; //convert to nanoseconds
> + drm_intel_gem_bo_unmap_gtt(event->ts_buf);
> +}
> +
> LOCAL void
> intel_set_gpgpu_callbacks(void)
> {
> @@ -1018,5 +1075,6 @@ intel_set_gpgpu_callbacks(void)
> cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
> cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
> cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
> + cl_gpgpu_event_get_timestamp = (cl_gpgpu_event_get_timestamp_cb *)intel_gpgpu_event_get_timestamp;
> }
>
> --
> 1.7.9.5
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list