[Beignet] [PATCH V2] Modify the multi-thread support for queue.

Mon Dec 30 23:25:57 PST 2013

From: Junyan He <junyan.he at linux.intel.com>

The old multi-thread support for queues does not work
when threads never exit. If a thread does not exit
but queues are re-created all the time, the
gpgpu struct resources leak, and creating the GPU bo
for the gpgpu struct eventually fails.
We now release the GPGPU resources each time an
enqueueNDRange finishes and re-allocate the gpgpu struct
context the next time it is needed.
---
 src/cl_command_queue.c      |    6 ++--
 src/cl_command_queue_gen7.c |    1 +
 src/cl_driver.h             |   10 +++++-
 src/cl_driver_defs.c        |    2 ++
 src/cl_thread.c             |   79 ++++++++++++++++++++++++++++++++++++-------
 src/cl_thread.h             |   10 ++++++
 src/intel/intel_gpgpu.c     |   20 +++++++++--
 7 files changed, 110 insertions(+), 18 deletions(-)

diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 3530976..4ac2e11 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -419,15 +419,15 @@ cl_command_queue_flush(cl_command_queue queue)
   GET_QUEUE_THREAD_GPGPU(queue);
 
   cl_gpgpu_flush(gpgpu);
+
+  cl_invalid_thread_gpgpu(queue);
   return CL_SUCCESS;
 }
 
 LOCAL cl_int
 cl_command_queue_finish(cl_command_queue queue)
 {
-  GET_QUEUE_THREAD_GPGPU(queue);
-
-  cl_gpgpu_sync(gpgpu);
+  cl_gpgpu_sync(cl_get_thread_batch_buf());
   return CL_SUCCESS;
 }
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 923a881..ba69589 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -336,6 +336,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Start a new batch buffer */
   batch_sz = cl_kernel_compute_batch_sz(ker);
   cl_gpgpu_batch_reset(gpgpu, batch_sz);
+  cl_set_thread_batch_buf(cl_gpgpu_ref_batch_buf(gpgpu));
   cl_gpgpu_batch_start(gpgpu);
 
   /* Issue the GPGPU_WALKER command */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index a34c22e..96fc377 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -95,7 +95,7 @@ typedef void (cl_gpgpu_delete_cb)(cl_gpgpu);
 extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
 
 /* Synchonize GPU with CPU */
-typedef cl_gpgpu (cl_gpgpu_sync_cb)(cl_gpgpu);
+typedef void (cl_gpgpu_sync_cb)(void*);
 extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
 
 /* Bind a regular unformatted buffer */
@@ -200,6 +200,14 @@ extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
 typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*);
 extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp;
 
+/* Get current batch buffer handle */
+typedef void* (cl_gpgpu_ref_batch_buf_cb)(cl_gpgpu);
+extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
+
+/* Release a batch buffer handle */
+typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
+extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
+
 /* Will spawn all threads */
 typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
                                   uint32_t simd_sz,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index b46799a..0a9012c 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -82,4 +82,6 @@ LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
 LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
 LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL;
 LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
+LOCAL cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf = NULL;
+LOCAL cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf = NULL;
 
diff --git a/src/cl_thread.c b/src/cl_thread.c
index fbad5c5..cadc3cd 100644
--- a/src/cl_thread.c
+++ b/src/cl_thread.c
@@ -20,30 +20,75 @@
 #include "cl_alloc.h"
 #include "cl_utils.h"
 
+static __thread void* thread_batch_buf = NULL;
+
+typedef struct _cl_thread_spec_data {
+  cl_gpgpu gpgpu ;
+  int valid;
+}cl_thread_spec_data;
+
+void cl_set_thread_batch_buf(void* buf) {
+  if (thread_batch_buf) {
+    cl_gpgpu_unref_batch_buf(thread_batch_buf);
+  }
+  thread_batch_buf = buf;
+}
+
+void* cl_get_thread_batch_buf(void) {
+  return thread_batch_buf;
+}
+
 cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
 {
   pthread_key_t* key = queue->thread_data;
-  cl_gpgpu gpgpu = pthread_getspecific(*key);
+  cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key);
 
-  if (!gpgpu) {
-    TRY_ALLOC_NO_ERR (gpgpu, cl_gpgpu_new(queue->ctx->drv));
+  if (!thread_spec_data) {
+    TRY_ALLOC_NO_ERR(thread_spec_data, CALLOC(struct _cl_thread_spec_data));
+    if (pthread_setspecific(*key, thread_spec_data)) {
+      cl_free(thread_spec_data);
+      return NULL;
+    }
   }
 
-  if (pthread_setspecific(*key, gpgpu)) {
-    cl_gpgpu_delete(gpgpu);
-    goto error;
+  if (!thread_spec_data->valid) {
+    TRY_ALLOC_NO_ERR(thread_spec_data->gpgpu, cl_gpgpu_new(queue->ctx->drv));
+    thread_spec_data->valid = 1;
   }
 
-exit:
-  return gpgpu;
 error:
-  pthread_setspecific(*key, NULL);
-  goto exit;
+  return thread_spec_data ? thread_spec_data->gpgpu : NULL; /* spec data is NULL if its allocation failed */
+}
+
+void cl_invalid_thread_gpgpu(cl_command_queue queue)
+{
+  pthread_key_t* key = queue->thread_data;
+  cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key);
+
+  if (!thread_spec_data) {
+    return;
+  }
+
+  if (!thread_spec_data->valid) {
+    return;
+  }
+
+  assert(thread_spec_data->gpgpu);
+  cl_gpgpu_delete(thread_spec_data->gpgpu);
+  thread_spec_data->valid = 0;
 }
 
 static void thread_data_destructor(void *data) {
-  cl_gpgpu gpgpu = (cl_gpgpu)data;
-  cl_gpgpu_delete(gpgpu);
+  cl_thread_spec_data* thread_spec_data = (cl_thread_spec_data *)data;
+
+  if (thread_batch_buf) {
+    cl_gpgpu_unref_batch_buf(thread_batch_buf);
+    thread_batch_buf = NULL;
+  }
+
+  if (thread_spec_data->valid)
+    cl_gpgpu_delete(thread_spec_data->gpgpu);
+  cl_free(thread_spec_data);
 }
 
 /* Create the thread specific data. */
@@ -67,6 +112,16 @@ void* cl_thread_data_create(void)
 void cl_thread_data_destroy(void * data)
 {
   pthread_key_t *thread_specific_key = (pthread_key_t *)data;
+
+  /* Release the calling thread's spec data, even if its gpgpu was already invalidated. */
+  cl_thread_spec_data* thread_spec_data =
+         pthread_getspecific(*thread_specific_key);
+  if (thread_spec_data) {
+    if (thread_spec_data->valid)
+      cl_gpgpu_delete(thread_spec_data->gpgpu);
+    cl_free(thread_spec_data);
+  }
+
   pthread_key_delete(*thread_specific_key);
   cl_free(thread_specific_key);
 }
diff --git a/src/cl_thread.h b/src/cl_thread.h
index 65f1bcf..c8ab63c 100644
--- a/src/cl_thread.h
+++ b/src/cl_thread.h
@@ -31,4 +31,14 @@ void cl_thread_data_destroy(void * data);
 
 /* Used to get the gpgpu struct of each thread. */
 cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
+
+/* Used to release the gpgpu struct of each thread. */
+void cl_invalid_thread_gpgpu(cl_command_queue queue);
+
+/* Used to set the batch buffer of each thread. */
+void cl_set_thread_batch_buf(void* buf);
+
+/* Used to get the batch buffer of each thread. */
+void* cl_get_thread_batch_buf(void);
+
 #endif /* __CL_THREAD_H__ */
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index b1597ac..b2d8bb0 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -117,10 +117,24 @@ typedef struct intel_gpgpu intel_gpgpu_t;
 
 
 static void
-intel_gpgpu_sync(intel_gpgpu_t *gpgpu)
+intel_gpgpu_sync(void *buf)
+{
+  if (buf)
+    drm_intel_bo_wait_rendering((drm_intel_bo *)buf);
+}
+
+static void *intel_gpgpu_ref_batch_buf(intel_gpgpu_t *gpgpu)
 {
   if (gpgpu->batch->last_bo)
-    drm_intel_bo_wait_rendering(gpgpu->batch->last_bo);
+    drm_intel_bo_reference(gpgpu->batch->last_bo);
+
+  return gpgpu->batch->last_bo;
+}
+
+static void intel_gpgpu_unref_batch_buf(void *buf)
+{
+  if (buf)
+    drm_intel_bo_unreference((drm_intel_bo *)buf);
 }
 
 static void
@@ -1111,5 +1125,7 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
   cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp;
   cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
+  cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
+  cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
 }
 
-- 
1.7.9.5