[Beignet] [PATCH 13/15] Runtime: Add profiling support in runtime.

junyan.he at inbox.com junyan.he at inbox.com
Wed Aug 12 01:49:44 PDT 2015


From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 src/cl_command_queue.c      |    8 +++++++
 src/cl_command_queue.h      |    2 ++
 src/cl_command_queue_gen7.c |   38 ++++++++++++++++++++++++++++++
 src/cl_driver.h             |   16 +++++++++++++
 src/cl_driver_defs.c        |    5 ++++
 src/cl_gbe_loader.cpp       |   15 ++++++++++++
 src/cl_gbe_loader.h         |    3 +++
 src/intel/intel_gpgpu.c     |   54 +++++++++++++++++++++++++++++++++++++++++++
 src/intel/intel_gpgpu.h     |    3 ++-
 9 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4b92311..a345eb9 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -223,6 +223,7 @@ cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
   size_t global_wk_sz[3];
   size_t outbuf_sz = 0;
   void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz, &outbuf_sz);
+  void* profiling_info;
 
   if (cl_gpgpu_flush(gpgpu) < 0)
     return CL_OUT_OF_RESOURCES;
@@ -246,6 +247,13 @@ cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
     global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
     cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
   }
+
+  /* If there is profiling info, output it. */
+  profiling_info = cl_gpgpu_get_profiling_info(gpgpu);
+  if (profiling_info) {
+    interp_output_profiling(profiling_info, cl_gpgpu_map_profiling_buffer(gpgpu));
+    cl_gpgpu_unmap_profiling_buffer(gpgpu);
+  }
   return CL_SUCCESS;
 }
 
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 2cd6739..91c941c 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -40,6 +40,8 @@ struct _cl_command_queue {
   cl_event* wait_events;               /* Point to array of non-complete user events that block this command queue */
   cl_int    wait_events_num;           /* Number of Non-complete user events */
   cl_int    wait_events_size;          /* The size of array that wait_events point to */
+  cl_event  last_event;                /* The last event in the queue, for enqueue mark used */
+  cl_event  current_event;             /* Current event. */
   cl_command_queue_properties  props;  /* Queue properties */
   cl_command_queue prev, next;         /* We chain the command queues together */
   void *thread_data;                   /* Used to store thread context data */
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 89f39b3..d8664d1 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -279,6 +279,36 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
 }
 
 static int
+cl_bind_profiling(cl_gpgpu gpgpu, uint32_t simd_sz, cl_kernel ker, size_t global_sz, size_t local_sz, uint32_t bti) {
+  /* Zero the profiling timestamp slots in the kernel's curbe, then ask the
+   * driver for a profiling buffer sized from the thread count computed below.
+   * Returns 0 on success and -1 when the buffer could not be set up. */
+  int32_t offset;
+  int i, thread_num;
+  if (simd_sz == 16) {
+    thread_num = (local_sz + 15)/16;  /* local_sz rounded up to whole SIMD16 threads */
+    for(i = 0; i < 2; i++) {
+      offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_PROFILING_TIMESTAMP0 + i, 0);
+      assert(offset >= 0);
+      memset(ker->curbe + offset, 0x0, sizeof(uint32_t)*8*2);
+    }
+  } else {
+    assert(simd_sz == 8);
+    thread_num = (local_sz + 7)/8;    /* local_sz rounded up to whole SIMD8 threads */
+    for(i = 0; i < 4; i++) {
+      offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_PROFILING_TIMESTAMP0 + i, 0);
+      assert(offset >= 0);
+      memset(ker->curbe + offset, 0x0, sizeof(uint32_t)*8);
+    }
+  }
+
+  offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_PROFILING_BUF_POINTER, 0);
+  assert(offset >= 0);  /* same invariant the timestamp slots are held to */
+  thread_num = thread_num*(global_sz/local_sz);  /* scale by global_sz/local_sz (presumably the work-group count) */
+  return cl_gpgpu_set_profiling_buffer(gpgpu, thread_num*128 + 4, offset, bti) ? -1 : 0;  /* 128 bytes per thread plus 4 */
+}
+
+static int
 cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num, size_t global_sz) {
   int32_t value = GBE_CURBE_PRINTF_INDEX_POINTER;
   int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
@@ -373,6 +403,14 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
       goto error;
   }
 
+  if (interp_get_profiling_bti(ker->opaque) != 0) {
+    if (cl_bind_profiling(gpgpu, simd_sz, ker, global_size, local_sz, interp_get_profiling_bti(ker->opaque)))
+      goto error;
+    cl_gpgpu_set_profiling_info(gpgpu, interp_dup_profiling(ker->opaque));
+  } else {
+    cl_gpgpu_set_profiling_info(gpgpu, NULL);
+  }
+
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
   /* Bind user images */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 1ab4dff..f130a8e 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -252,6 +252,22 @@ extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
 typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
 extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
 
+/* Set the profiling buffer (the Intel backend names the arguments after cl_gpgpu: size, offset, bti) */
+typedef int (cl_gpgpu_set_profiling_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, uint8_t);
+extern cl_gpgpu_set_profiling_buffer_cb *cl_gpgpu_set_profiling_buffer;
+
+typedef int (cl_gpgpu_set_profiling_info_cb)(cl_gpgpu, void *); /* attach an opaque profiling-info pointer to the gpgpu */
+extern cl_gpgpu_set_profiling_info_cb *cl_gpgpu_set_profiling_info;
+
+typedef void* (cl_gpgpu_get_profiling_info_cb)(cl_gpgpu); /* read back the attached profiling-info pointer */
+extern cl_gpgpu_get_profiling_info_cb *cl_gpgpu_get_profiling_info;
+
+typedef void* (cl_gpgpu_map_profiling_buffer_cb)(cl_gpgpu); /* map the profiling buffer and return its address */
+extern cl_gpgpu_map_profiling_buffer_cb *cl_gpgpu_map_profiling_buffer;
+
+typedef void (cl_gpgpu_unmap_profiling_buffer_cb)(cl_gpgpu); /* unmap the profiling buffer again */
+extern cl_gpgpu_unmap_profiling_buffer_cb *cl_gpgpu_unmap_profiling_buffer;
+
 /* Set the printf buffer */
 typedef int (cl_gpgpu_set_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, uint32_t, uint8_t);
 extern cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index b77acdc..0d6fa9a 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -90,6 +90,11 @@ LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp =
 LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
 LOCAL cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf = NULL;
 LOCAL cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf = NULL;
+LOCAL cl_gpgpu_set_profiling_buffer_cb *cl_gpgpu_set_profiling_buffer = NULL;
+LOCAL cl_gpgpu_set_profiling_info_cb *cl_gpgpu_set_profiling_info = NULL;
+LOCAL cl_gpgpu_get_profiling_info_cb *cl_gpgpu_get_profiling_info = NULL;
+LOCAL cl_gpgpu_map_profiling_buffer_cb *cl_gpgpu_map_profiling_buffer = NULL;
+LOCAL cl_gpgpu_unmap_profiling_buffer_cb *cl_gpgpu_unmap_profiling_buffer = NULL;
 LOCAL cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer = NULL;
 LOCAL cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer = NULL;
 LOCAL cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer = NULL;
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index c3454e8..4f15ee6 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -63,6 +63,9 @@ gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data = NULL;
 gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size = NULL;
 gbe_kernel_get_image_size_cb *interp_kernel_get_image_size = NULL;
 gbe_kernel_get_image_data_cb *interp_kernel_get_image_data = NULL;
+gbe_output_profiling_cb* interp_output_profiling = NULL;
+gbe_get_profiling_bti_cb* interp_get_profiling_bti = NULL;
+gbe_dup_profiling_cb* interp_dup_profiling = NULL;
 gbe_get_printf_num_cb* interp_get_printf_num = NULL;
 gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti = NULL;
 gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti = NULL;
@@ -212,6 +215,18 @@ struct GbeLoaderInitializer
     if (interp_kernel_get_image_data == NULL)
       return false;
 
+    interp_output_profiling = *(gbe_output_profiling_cb**)dlsym(dlhInterp, "gbe_output_profiling");
+    if (interp_output_profiling == NULL)
+      return false;
+
+    interp_get_profiling_bti = *(gbe_get_profiling_bti_cb**)dlsym(dlhInterp, "gbe_get_profiling_bti");
+    if (interp_get_profiling_bti == NULL)
+      return false;
+
+    interp_dup_profiling = *(gbe_dup_profiling_cb**)dlsym(dlhInterp, "gbe_dup_profiling");
+    if (interp_dup_profiling == NULL)
+      return false;
+
     interp_get_printf_num = *(gbe_get_printf_num_cb**)dlsym(dlhInterp, "gbe_get_printf_num");
     if (interp_get_printf_num == NULL)
       return false;
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index 6fa4c98..916f21e 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -63,6 +63,9 @@ extern gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data;
 extern gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size;
 extern gbe_kernel_get_image_size_cb *interp_kernel_get_image_size;
 extern gbe_kernel_get_image_data_cb *interp_kernel_get_image_data;
+extern gbe_output_profiling_cb* interp_output_profiling;
+extern gbe_get_profiling_bti_cb* interp_get_profiling_bti;
+extern gbe_dup_profiling_cb* interp_dup_profiling;
 extern gbe_get_printf_num_cb* interp_get_printf_num;
 extern gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti;
 extern gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 901bd98..6f70917 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -2030,6 +2030,37 @@ intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
 }
 
 static int
+intel_gpgpu_set_profiling_buf(intel_gpgpu_t *gpgpu, uint32_t size, uint32_t offset, uint8_t bti) /* (Re)create a zeroed profiling bo of `size` bytes and bind it; 0 on success, -1 on failure. */
+{
+  drm_intel_bo *bo = NULL;
+
+  if (gpgpu->profiling_b.bo)
+    dri_bo_unreference(gpgpu->profiling_b.bo); /* drop our reference to the previous profiling bo */
+  gpgpu->profiling_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Profiling buffer", size, 4096);
+  bo = gpgpu->profiling_b.bo;
+  if (!bo || (drm_intel_bo_map(bo, 1) != 0)) { /* allocation or mapping failed */
+    fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno)); /* NOTE(review): assumes the failing call left errno set -- confirm */
+    return -1;
+  }
+  memset(bo->virtual, 0, size); /* start from an all-zero buffer */
+  drm_intel_bo_unmap(bo);
+  cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)bo, offset, 0, size, bti); /* pass the bo to the generic bind call with the caller's offset and bti */
+  return 0;
+}
+
+static void /* Store the opaque profiling-info pointer on this gpgpu. */
+intel_gpgpu_set_profiling_info(intel_gpgpu_t *gpgpu, void* profiling_info)
+{ /* NOTE(review): installed through a cast to cl_gpgpu_set_profiling_info_cb, which is declared to return int, while this returns void -- confirm this is intended. */
+  gpgpu->profiling_info = profiling_info; /* the pointer is stored as-is; nothing is copied here */
+}
+
+static void* /* Return the profiling-info pointer currently held by this gpgpu. */
+intel_gpgpu_get_profiling_info(intel_gpgpu_t *gpgpu)
+{
+  return gpgpu->profiling_info; /* the flush path only outputs profiling data when this is non-NULL */
+}
+
+static int
 intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint32_t offset, uint8_t bti)
 {
   drm_intel_bo *bo = NULL;
@@ -2060,6 +2091,24 @@ intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint
 }
 
 static void*
+intel_gpgpu_map_profiling_buf(intel_gpgpu_t *gpgpu) /* Map the profiling bo and return its mapped address (bo->virtual). */
+{
+  drm_intel_bo *bo = NULL;
+  bo = gpgpu->profiling_b.bo;
+  drm_intel_bo_map(bo, 1); /* NOTE(review): result is unchecked and bo is not tested for NULL, unlike intel_gpgpu_set_profiling_buf -- confirm a buffer always exists here */
+  return bo->virtual;
+}
+
+static void /* Unmap the profiling bo; the counterpart of intel_gpgpu_map_profiling_buf. */
+intel_gpgpu_unmap_profiling_buf_addr(intel_gpgpu_t *gpgpu)
+{
+  drm_intel_bo *bo = NULL;
+  bo = gpgpu->profiling_b.bo;
+  drm_intel_bo_unmap(bo); /* NOTE(review): bo is not tested for NULL -- presumably only called after a successful map */
+}
+
+
+static void*
 intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
 {
   drm_intel_bo *bo = NULL;
@@ -2150,6 +2199,11 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
   cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
   cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
+  cl_gpgpu_set_profiling_buffer = (cl_gpgpu_set_profiling_buffer_cb *)intel_gpgpu_set_profiling_buf;
+  cl_gpgpu_set_profiling_info = (cl_gpgpu_set_profiling_info_cb *)intel_gpgpu_set_profiling_info;
+  cl_gpgpu_get_profiling_info = (cl_gpgpu_get_profiling_info_cb *)intel_gpgpu_get_profiling_info;
+  cl_gpgpu_map_profiling_buffer = (cl_gpgpu_map_profiling_buffer_cb *)intel_gpgpu_map_profiling_buf;
+  cl_gpgpu_unmap_profiling_buffer = (cl_gpgpu_unmap_profiling_buffer_cb *)intel_gpgpu_unmap_profiling_buf_addr;
   cl_gpgpu_set_printf_buffer = (cl_gpgpu_set_printf_buffer_cb *)intel_gpgpu_set_printf_buf;
   cl_gpgpu_map_printf_buffer = (cl_gpgpu_map_printf_buffer_cb *)intel_gpgpu_map_printf_buf;
   cl_gpgpu_unmap_printf_buffer = (cl_gpgpu_unmap_printf_buffer_cb *)intel_gpgpu_unmap_printf_buf_addr;
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h
index ad7290e..ccbf2fa 100644
--- a/src/intel/intel_gpgpu.h
+++ b/src/intel/intel_gpgpu.h
@@ -46,6 +46,7 @@ struct intel_gpgpu
   void* ker_opaque;
   size_t global_wk_sz[3];
   void* printf_info;
+  void* profiling_info;
   struct intel_driver *drv;
   struct intel_batchbuffer *batch;
   cl_gpgpu_kernel *ker;
@@ -66,7 +67,7 @@ struct intel_gpgpu
   struct { drm_intel_bo *bo; } time_stamp_b;  /* time stamp buffer */
   struct { drm_intel_bo *bo;
            drm_intel_bo *ibo;} printf_b;      /* the printf buf and index buf*/
-
+  struct { drm_intel_bo *bo; } profiling_b;   /* the buf for profiling*/
   struct { drm_intel_bo *bo; } aux_buf;
   struct {
     uint32_t surface_heap_offset;
-- 
1.7.9.5



More information about the Beignet mailing list