[Beignet] [PATCH] runtime: fix a gpgpu event and thread local gpgpu handling bug.

Wed Jul 2 21:35:51 PDT 2014

When pending a command queue, we need to record the whole gpgpu
structure not just the batch buffer. For the following reason:

1. We need to keep those private buffer, for example those printf buffers.
2. We need to make sure this gpgpu will not be reused by other enqueuement.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 src/cl_command_queue.c        | 13 +++++++--
 src/cl_command_queue.h        |  3 ++
 src/cl_driver.h               |  8 ++----
 src/cl_driver_defs.c          |  4 +--
 src/cl_enqueue.c              |  2 +-
 src/cl_event.c                | 48 ++++++++++++++++++++++++++++---
 src/cl_event.h                |  9 ++++--
 src/cl_thread.c               | 20 +++++++++++++
 src/cl_thread.h               |  3 ++
 src/intel/intel_batchbuffer.c | 13 ---------
 src/intel/intel_batchbuffer.h |  1 -
 src/intel/intel_gpgpu.c       | 66 +++++++++++++++++--------------------------
 12 files changed, 117 insertions(+), 73 deletions(-)

diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 8426c4e..6343d75 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -28,6 +28,7 @@
 #include "cl_alloc.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
+#include "cl_event.h"
 #include "performance.h"
 
 #include <assert.h>
@@ -421,10 +422,9 @@ error:
   return err;
 }
 
-LOCAL cl_int
-cl_command_queue_flush(cl_command_queue queue)
+LOCAL void
+cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
 {
-  GET_QUEUE_THREAD_GPGPU(queue);
   size_t global_wk_sz[3];
   void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz);
 
@@ -447,7 +447,14 @@ cl_command_queue_flush(cl_command_queue queue)
     global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
     cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
   }
+}
 
+LOCAL cl_int
+cl_command_queue_flush(cl_command_queue queue)
+{
+  GET_QUEUE_THREAD_GPGPU(queue);
+  cl_command_queue_flush_gpgpu(queue, gpgpu);
+  cl_event_flush_command_queue(queue);
   cl_invalid_thread_gpgpu(queue);
   return CL_SUCCESS;
 }
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index b79d63a..9a6db12 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -82,6 +82,9 @@ cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem);
 /* Flush for the command queue */
 extern cl_int cl_command_queue_flush(cl_command_queue);
 
+/* Flush for the specified gpgpu */
+extern void cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
+
 /* Wait for the completion of the command queue */
 extern cl_int cl_command_queue_finish(cl_command_queue);
 
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 2999eb7..3d1d8d8 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -197,13 +197,9 @@ extern cl_gpgpu_event_new_cb *cl_gpgpu_event_new;
 typedef int (cl_gpgpu_event_update_status_cb)(cl_gpgpu_event, int);
 extern cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status;
 
-/* pending flush the batch buffer of this event */
-typedef void (cl_gpgpu_event_pending_cb)(cl_gpgpu, cl_gpgpu_event);
-extern cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending;
-
 /* flush the batch buffer of this event */
-typedef void (cl_gpgpu_event_resume_cb)(cl_gpgpu_event);
-extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume;
+typedef void (cl_gpgpu_event_flush_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush;
 
 /* cancel exec batch buffer of this event */
 typedef void (cl_gpgpu_event_cancel_cb)(cl_gpgpu_event);
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index c9385c4..72f25d9 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -79,9 +79,7 @@ LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
 LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
 LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
 LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
-LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
-LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
-LOCAL cl_gpgpu_event_cancel_cb *cl_gpgpu_event_cancel = NULL;
+LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL;
 LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
 LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL;
 LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 7fa44ff..af118ad 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -461,7 +461,7 @@ cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
     case EnqueueNDRangeKernel:
     case EnqueueFillBuffer:
     case EnqueueFillImage:
-      cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);
+      cl_event_flush(event);
       return CL_SUCCESS;
     case EnqueueNativeKernel:
       return cl_enqueue_native_kernel(data);
diff --git a/src/cl_event.c b/src/cl_event.c
index a3af59c..8ad5e3b 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -46,6 +46,39 @@ cl_event_is_gpu_command_type(cl_command_type type)
   }
 }
 
+void cl_event_flush_command_queue(cl_command_queue queue)
+{
+  int i;
+  cl_event event;
+
+  for(i = 0; i < queue->barrier_events_num; i++)
+  {
+    event = queue->barrier_events[i];
+    if (event->gpgpu_event)
+      cl_gpgpu_event_flush(event->gpgpu_event);
+  }
+
+  for(i = 0; i < queue->wait_events_num; i++)
+  {
+    event = queue->wait_events[i];
+    if (event->gpgpu_event)
+      cl_gpgpu_event_flush(event->gpgpu_event);
+  }
+
+  event = queue->last_event;
+  if (event && event->gpgpu_event)
+    cl_gpgpu_event_flush(event->gpgpu_event);
+}
+
+void cl_event_flush(cl_event event)
+{
+  assert(event->gpgpu != NULL && event->gpgpu_event != NULL);
+  cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
+  cl_gpgpu_event_flush(event->gpgpu_event);
+  cl_gpgpu_delete(event->gpgpu);
+  event->gpgpu = NULL;
+}
+
 cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
 {
   cl_event event = NULL;
@@ -138,6 +171,11 @@ void cl_event_delete(cl_event event)
   pthread_mutex_unlock(&event->ctx->event_lock);
   cl_context_delete(event->ctx);
 
+  if (event->gpgpu) {
+    fprintf(stderr, "Warning: a event is deleted with a pending enqueued task.\n");
+    cl_gpgpu_delete(event->gpgpu);
+    event->gpgpu = NULL;
+  }
   cl_free(event);
 }
 
@@ -257,7 +295,6 @@ void cl_event_new_enqueue_callback(cl_event event,
   cl_command_queue queue = event->queue;
   cl_int i;
   cl_int err = CL_SUCCESS;
-  GET_QUEUE_THREAD_GPGPU(data->queue);
 
   /* Allocate and initialize the structure itself */
   TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
@@ -344,7 +381,7 @@ void cl_event_new_enqueue_callback(cl_event event,
     }
   }
   if(data->queue != NULL && event->gpgpu_event != NULL) {
-    cl_gpgpu_event_pending(gpgpu, event->gpgpu_event);
+    event->gpgpu = cl_thread_gpgpu_take(event->queue);
     data->ptr = (void *)event->gpgpu_event;
   }
   cb->data = *data;
@@ -394,8 +431,11 @@ void cl_event_set_status(cl_event event, cl_int status)
         if(event->gpgpu_event)
           cl_gpgpu_event_update_status(event->gpgpu_event, 1);  //now set complet, need refine
       } else {
-        if(event->gpgpu_event)
-          cl_gpgpu_event_cancel(event->gpgpu_event);  //Error cancel the enqueue
+        if(event->gpgpu_event) {
+          // Error then cancel the enqueued event.
+          cl_gpgpu_delete(event->gpgpu);
+          event->gpgpu = NULL;
+        }
       }
 
       event->status = status;  //Change the event status after enqueue and befor unlock
diff --git a/src/cl_event.h b/src/cl_event.h
index bd8a5c7..9343c97 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -63,6 +63,7 @@ struct _cl_event {
   cl_command_queue   queue;       /* The command queue associated with event */
   cl_command_type    type;        /* The command type associated with event */
   cl_int             status;      /* The execution status */
+  cl_gpgpu           gpgpu;       /* Current gpgpu, owned by this structure. */
   cl_gpgpu_event     gpgpu_event; /* The event object communicate with hardware */
   user_callback*     user_cb;     /* The event callback functions */
   enqueue_callback*  enqueue_cb;  /* This event's enqueue */
@@ -95,9 +96,13 @@ cl_int cl_event_marker_with_wait_list(cl_command_queue, cl_uint, const cl_event
 cl_int cl_event_barrier_with_wait_list(cl_command_queue, cl_uint, const cl_event *,  cl_event*);
 /* Do the event profiling */
 cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
-/*insert the user event*/
+/* insert the user event */
 cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event);
-/*remove the user event*/
+/* remove the user event */
 cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event);
+/* flush the event's pending gpgpu batch buffer */
+void cl_event_flush(cl_event event);
+/* flush all the waiting event on this command queue */
+void cl_event_flush_command_queue(cl_command_queue queue);
 #endif /* __CL_EVENT_H__ */
 
diff --git a/src/cl_thread.c b/src/cl_thread.c
index 4692ed7..5713d70 100644
--- a/src/cl_thread.c
+++ b/src/cl_thread.c
@@ -209,6 +209,26 @@ void cl_invalid_thread_gpgpu(cl_command_queue queue)
   spec->valid = 0;
 }
 
+cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue)
+{
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  thread_spec_data* spec = NULL;
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  spec = thread_private->threads_data[thread_id];
+  assert(spec);
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  if (!spec->valid)
+    return NULL;
+
+  assert(spec->gpgpu);
+  cl_gpgpu gpgpu = spec->gpgpu;
+  spec->gpgpu = NULL;
+  spec->valid = 0;
+  return gpgpu;
+}
+
 /* The destructor for clean the thread specific data. */
 void cl_thread_data_destroy(cl_command_queue queue)
 {
diff --git a/src/cl_thread.h b/src/cl_thread.h
index bf855a2..ecc99ad 100644
--- a/src/cl_thread.h
+++ b/src/cl_thread.h
@@ -41,4 +41,7 @@ void cl_set_thread_batch_buf(cl_command_queue queue, void* buf);
 /* Used to get the batch buffer of each thread. */
 void* cl_get_thread_batch_buf(cl_command_queue queue);
 
+/* take current gpgpu from the thread gpgpu pool. */
+cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue);
+
 #endif /* __CL_THREAD_H__ */
diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c
index d0da77a..7767db3 100644
--- a/src/intel/intel_batchbuffer.c
+++ b/src/intel/intel_batchbuffer.c
@@ -185,16 +185,3 @@ intel_batchbuffer_delete(intel_batchbuffer_t *batch)
 
   cl_free(batch);
 }
-
-LOCAL void
-intel_batchbuffer_take(intel_batchbuffer_t *from, intel_batchbuffer_t *to)
-{
-  *to = *from;
-  //Need not unreference the buffer, to will unreference it.
-  from->buffer = NULL;
-  from->map = NULL;
-  from->ptr = NULL;
-  from->size = 0;
-  from->atomic = 0;
-  from->enable_slm = 0;
-}
diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h
index c62043e..0c3bc13 100644
--- a/src/intel/intel_batchbuffer.h
+++ b/src/intel/intel_batchbuffer.h
@@ -101,7 +101,6 @@ extern void intel_batchbuffer_init(intel_batchbuffer_t*, struct intel_driver*);
 extern void intel_batchbuffer_terminate(intel_batchbuffer_t*);
 extern void intel_batchbuffer_flush(intel_batchbuffer_t*);
 extern void intel_batchbuffer_reset(intel_batchbuffer_t*, size_t sz);
-extern void intel_batchbuffer_take(intel_batchbuffer_t*, intel_batchbuffer_t *);
 
 static INLINE uint32_t
 intel_batchbuffer_space(const intel_batchbuffer_t *batch)
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index a7c449d..d00bc83 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -60,9 +60,8 @@ typedef struct surface_heap {
 } surface_heap_t;
 
 typedef struct intel_event {
-  intel_batchbuffer_t *batch;
-  drm_intel_bo* buffer;
-  drm_intel_bo* ts_buf;
+  drm_intel_bo *buffer;
+  drm_intel_bo *ts_buf;
   int status;
 } intel_event_t;
 
@@ -569,12 +568,19 @@ intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
 }
 
 static void
+intel_gpgpu_flush_batch_buffer(intel_batchbuffer_t *batch)
+{
+  assert(batch);
+  intel_batchbuffer_emit_mi_flush(batch);
+  intel_batchbuffer_flush(batch);
+}
+
+static void
 intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
 {
   if (!gpgpu->batch || !gpgpu->batch->buffer)
     return;
-  intel_batchbuffer_emit_mi_flush(gpgpu->batch);
-  intel_batchbuffer_flush(gpgpu->batch);
+  intel_gpgpu_flush_batch_buffer(gpgpu->batch);
   intel_gpgpu_check_binded_buf_address(gpgpu);
 }
 
@@ -1156,11 +1162,10 @@ intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
   intel_event_t *event = NULL;
   TRY_ALLOC_NO_ERR (event, CALLOC(intel_event_t));
 
-  event->status = command_queued;
-  event->batch = NULL;
   event->buffer = gpgpu->batch->buffer;
-  if(event->buffer != NULL)
+  if (event->buffer)
     drm_intel_bo_reference(event->buffer);
+  event->status = command_queued;
 
   if(gpgpu->time_stamp_b.bo) {
     event->ts_buf = gpgpu->time_stamp_b.bo;
@@ -1175,6 +1180,17 @@ error:
   goto exit;
 }
 
+/*
+   The upper layer already flushed the batch buffer, just update
+   internal status to command_submitted.
+*/
+static void
+intel_gpgpu_event_flush(intel_event_t *event)
+{
+  assert(event->status == command_queued);
+  event->status = command_running;
+}
+
 static int
 intel_gpgpu_event_update_status(intel_event_t *event, int wait)
 {
@@ -1182,7 +1198,7 @@ intel_gpgpu_event_update_status(intel_event_t *event, int wait)
     return event->status;
 
   if (event->buffer &&
-      event->batch == NULL &&        //have flushed
+      event->status == command_running &&
       !drm_intel_bo_busy(event->buffer)) {
     event->status = command_complete;
     drm_intel_bo_unreference(event->buffer);
@@ -1203,36 +1219,8 @@ intel_gpgpu_event_update_status(intel_event_t *event, int wait)
 }
 
 static void
-intel_gpgpu_event_pending(intel_gpgpu_t *gpgpu, intel_event_t *event)
-{
-  assert(event->buffer);           //This is gpu enqueue command
-  assert(event->batch == NULL);    //This command haven't pengding.
-  event->batch = intel_batchbuffer_new(gpgpu->drv);
-  assert(event->batch);
-  intel_batchbuffer_take(gpgpu->batch, event->batch);
-}
-
-static void
-intel_gpgpu_event_resume(intel_event_t *event)
-{
-  assert(event->batch);           //This command have pending.
-  intel_batchbuffer_flush(event->batch);
-  intel_batchbuffer_delete(event->batch);
-  event->batch = NULL;
-}
-
-static void
-intel_gpgpu_event_cancel(intel_event_t *event)
-{
-  assert(event->batch);           //This command have pending.
-  intel_batchbuffer_delete(event->batch);
-  event->batch = NULL;
-}
-
-static void
 intel_gpgpu_event_delete(intel_event_t *event)
 {
-  assert(event->batch == NULL);   //This command must have been flushed.
   if(event->buffer)
     drm_intel_bo_unreference(event->buffer);
   if(event->ts_buf)
@@ -1412,10 +1400,8 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
   cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
   cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
+  cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
   cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *)intel_gpgpu_event_update_status;
-  cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
-  cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
-  cl_gpgpu_event_cancel = (cl_gpgpu_event_cancel_cb *)intel_gpgpu_event_cancel;
   cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
   cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp;
   cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
-- 
1.8.3.2