[Beignet] [PATCH V3] Complete the feature of clGetEventProfilingInfo API
junyan.he at inbox.com
junyan.he at inbox.com
Thu Nov 28 18:55:54 PST 2013
From: Junyan He <junyan.he at linux.intel.com>
The profiling feature is now all supported. We use
drm_intel_reg_read to get the current time of GPU
when the event is queued and submitted, and use
PIPI_CONTROL cmd to get the executing time of the
GPU for kernel start and end.
One trivial problem is that:
The GPU timer counter is 36 bits with resolution of
80ns, so 2^36*80 = 5500s, about half an hour.
Some test may last about 2~5 min and if it starts at
about half an hour, this may cause a wrap back problem
and cause the case fail.
Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
src/cl_api.c | 78 +++++++++++++++++++++++++++++++++++++++----------
src/cl_driver.h | 8 +++--
src/cl_driver_defs.c | 3 +-
src/cl_enqueue.c | 19 ++++++++----
src/cl_enqueue.h | 2 +-
src/cl_event.c | 27 ++++++++++-------
src/cl_event.h | 3 +-
src/intel/intel_gpgpu.c | 36 +++++++++++++++++++++--
8 files changed, 137 insertions(+), 39 deletions(-)
diff --git a/src/cl_api.c b/src/cl_api.c
index 0978129..35d0984 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -70,6 +70,13 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
cl_event e;
if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
e = cl_event_new(queue->ctx, queue, type, event!=NULL);
+
+ /* if need profiling, add the submit timestamp here. */
+ if (e->type != CL_COMMAND_USER &&
+ e->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED);
+ }
+
if(event != NULL)
*event = e;
if(status == CL_ENQUEUE_EXECUTE_DEFER) {
@@ -1270,16 +1277,25 @@ clGetEventProfilingInfo(cl_event event,
goto error;
}
- if ((param_name != CL_PROFILING_COMMAND_QUEUED &&
- param_name != CL_PROFILING_COMMAND_SUBMIT &&
- param_name != CL_PROFILING_COMMAND_START &&
- param_name != CL_PROFILING_COMMAND_END) ||
- (param_value && param_value_size < sizeof(cl_ulong))) {
+ if (param_value && param_value_size < sizeof(cl_ulong)) {
err = CL_INVALID_VALUE;
goto error;
}
- err = cl_event_profiling(event, param_name, &ret_val);
+ if (param_name == CL_PROFILING_COMMAND_QUEUED) {
+ ret_val = event->timestamp[0];
+ } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) {
+ ret_val = event->timestamp[1];
+ } else if (param_name == CL_PROFILING_COMMAND_START) {
+ err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_START);
+ ret_val = event->timestamp[2];
+ } else if (param_name == CL_PROFILING_COMMAND_END) {
+ err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_END);
+ ret_val = event->timestamp[3];
+ } else {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
if (err == CL_SUCCESS) {
if (param_value)
@@ -1354,7 +1370,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1437,7 +1453,7 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1487,7 +1503,7 @@ clEnqueueWriteBuffer(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1570,7 +1586,7 @@ clEnqueueWriteBufferRect(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1649,6 +1665,11 @@ clEnqueueCopyBuffer(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
err = cl_command_queue_flush(command_queue);
}
return 0;
@@ -1740,6 +1761,11 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
err = cl_command_queue_flush(command_queue);
}
@@ -1818,7 +1844,7 @@ clEnqueueReadImage(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1897,7 +1923,7 @@ clEnqueueWriteImage(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1974,6 +2000,11 @@ clEnqueueCopyImage(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
err = cl_command_queue_flush(command_queue);
}
@@ -2030,6 +2061,11 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
err = cl_command_queue_flush(command_queue);
}
@@ -2086,6 +2122,11 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
err = cl_command_queue_flush(command_queue);
}
@@ -2217,7 +2258,7 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -2313,7 +2354,7 @@ clEnqueueMapImage(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -2350,7 +2391,7 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -2456,6 +2497,11 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+ if (event && (*event)->type != CL_COMMAND_USER
+ && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
err = cl_command_queue_flush(command_queue);
}
@@ -2535,7 +2581,7 @@ clEnqueueNativeKernel(cl_command_queue command_queue,
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
- err = cl_enqueue_handle(data);
+ err = cl_enqueue_handle(event ? *event : NULL, data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 8efe1e7..a34c22e 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -193,8 +193,12 @@ typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
/* Get a event time stamp */
-typedef void (cl_gpgpu_event_get_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
-extern cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp;
+typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
+extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
+
+/* Get current GPU time stamp */
+typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*);
+extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp;
/* Will spawn all threads */
typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 54fa62e..b46799a 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -80,5 +80,6 @@ LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
-LOCAL cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp = NULL;
+LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL;
+LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 070fd98..330d230 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -16,16 +16,18 @@
*
* Author: Rong Yang <rong.r.yang at intel.com>
*/
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
#include "cl_enqueue.h"
#include "cl_image.h"
#include "cl_driver.h"
+#include "cl_event.h"
+#include "cl_command_queue.h"
#include "cl_utils.h"
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <pthread.h>
cl_int cl_enqueue_read_buffer(enqueue_data* data)
{
@@ -376,8 +378,15 @@ cl_int cl_enqueue_native_kernel(enqueue_data *data)
error:
return err;
}
-cl_int cl_enqueue_handle(enqueue_data* data)
+
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
{
+ /* if need profiling, add the submit timestamp here. */
+ if (event && event->type != CL_COMMAND_USER
+ && event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+ cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT);
+ }
+
switch(data->type) {
case EnqueueReadBuffer:
return cl_enqueue_read_buffer(data);
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index b412d58..1d3ae5f 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -64,5 +64,5 @@ typedef struct _enqueue_data {
} enqueue_data;
/* Do real enqueue commands */
-cl_int cl_enqueue_handle(enqueue_data* data);
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data);
#endif /* __CL_ENQUEUE_H__ */
diff --git a/src/cl_event.c b/src/cl_event.c
index 028dfb6..f838a3a 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -380,7 +380,7 @@ void cl_event_set_status(cl_event event, cl_int status)
if(status <= CL_COMPLETE) {
if(event->enqueue_cb) {
- cl_enqueue_handle(&event->enqueue_cb->data);
+ cl_enqueue_handle(event, &event->enqueue_cb->data);
if(event->gpgpu_event)
cl_gpgpu_event_update_status(event->gpgpu_event, 1); //now set complet, need refine
event->status = status; //Change the event status after enqueue and befor unlock
@@ -496,22 +496,29 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
return CL_SUCCESS;
}
-cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val)
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
{
+ cl_ulong ret_val = 0;
+ GET_QUEUE_THREAD_GPGPU(event->queue);
+
if (!event->gpgpu_event) {
- /* Some event like read buffer do not need GPU involved, so
- we just return all the profiling to 0 now. */
- *ret_val = 0;
+ cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+ event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
return CL_SUCCESS;
}
- if(param_name == CL_PROFILING_COMMAND_START ||
- param_name == CL_PROFILING_COMMAND_QUEUED ||
- param_name == CL_PROFILING_COMMAND_SUBMIT) {
- cl_gpgpu_event_get_timestamp(event->gpgpu_event, 0, ret_val);
+ if(param_name == CL_PROFILING_COMMAND_SUBMIT ||
+ param_name == CL_PROFILING_COMMAND_QUEUED) {
+ cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+ event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+ return CL_SUCCESS;
+ } else if(param_name == CL_PROFILING_COMMAND_START) {
+ cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 0, &ret_val);
+ event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
return CL_SUCCESS;
} else if (param_name == CL_PROFILING_COMMAND_END) {
- cl_gpgpu_event_get_timestamp(event->gpgpu_event, 1, ret_val);
+ cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 1, &ret_val);
+ event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
return CL_SUCCESS;
} else {
return CL_INVALID_VALUE;
diff --git a/src/cl_event.h b/src/cl_event.h
index 722486a..3c61110 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -68,6 +68,7 @@ struct _cl_event {
enqueue_callback* enqueue_cb; /* This event's enqueue */
enqueue_callback* waits_head; /* The head of enqueues list wait on this event */
cl_bool emplict; /* Identify this event whether created by api emplict*/
+ cl_ulong timestamp[4];/* The time stamps for profiling. */
};
/* Create a new event object */
@@ -91,6 +92,6 @@ void cl_event_update_status(cl_event);
/* Create the marker event */
cl_int cl_event_marker(cl_command_queue, cl_event*);
/* Do the event profiling */
-cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val);
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
#endif /* __CL_EVENT_H__ */
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 7be9059..b1597ac 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -51,6 +51,8 @@
#define MO_RETAIN_BIT (1 << 28)
#define SAMPLER_STATE_SIZE (16)
+#define TIMESTAMP_ADDR 0x2358
+
/* Stores both binding tables and surface states */
typedef struct surface_heap {
uint32_t binding_table[256];
@@ -1041,15 +1043,42 @@ intel_gpgpu_event_delete(intel_event_t *event)
cl_free(event);
}
+/* We want to get the current time of GPU. */
+static void
+intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
+{
+ uint64_t result = 0;
+ drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+
+ drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+ result = result & 0xFFFFFFFFF0000000;
+ result = result >> 28;
+ result *= 80;
+
+ *ret_ts = result;
+ return;
+}
+
+/* Get the GPU execute time. */
static void
-intel_gpgpu_event_get_timestamp(intel_event_t *event, int index, uint64_t* ret_ts)
+intel_gpgpu_event_get_exec_timestamp(intel_event_t *event,
+ int index, uint64_t* ret_ts)
{
+ uint64_t result = 0;
+
assert(event->ts_buf != NULL);
assert(index == 0 || index == 1);
drm_intel_gem_bo_map_gtt(event->ts_buf);
uint64_t* ptr = event->ts_buf->virtual;
+ result = ptr[index];
+
+ /* According to BSpec, the timestamp counter should be 36 bits,
+ but comparing to the timestamp counter from IO control reading,
+ we find the first 4 bits seems to be fake. In order to keep the
+ timestamp counter conformable, we just skip the first 4 bits. */
+ result = ((result & 0x0FFFFFFFF) << 4) * 80; //convert to nanoseconds
+ *ret_ts = result;
- *ret_ts = ptr[index] * 80; //convert to nanoseconds
drm_intel_gem_bo_unmap_gtt(event->ts_buf);
}
@@ -1080,6 +1109,7 @@ intel_set_gpgpu_callbacks(void)
cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
- cl_gpgpu_event_get_timestamp = (cl_gpgpu_event_get_timestamp_cb *)intel_gpgpu_event_get_timestamp;
+ cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp;
+ cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
}
--
1.8.3.2
More information about the Beignet
mailing list