[Beignet] [PATCH] Refine the cl thread implement for queue.

Mon May 19 23:26:47 PDT 2014

From: Junyan He <junyan.he at linux.intel.com>

Because the cl_command_queue can be used in several threads simultaneously but
without add ref to it, we now handle it like this:
Keep one threads_slot_array, every time the thread get gpgpu or batch buffer, if it
does not have a slot, assign it.
The resources are keeped in queue private, and resize it if needed.
When the thread exit, the slot will be set invalid.
When queue released, all the resources will be released. If user still enqueue, flush
or finish the queue after it has been released, the behavior is undefined.
TODO: Need to shrink the slot map.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 src/cl_command_queue.c      |   6 +-
 src/cl_command_queue_gen7.c |   2 +-
 src/cl_context.c            |   2 +-
 src/cl_device_id.c          |   2 +-
 src/cl_thread.c             | 261 +++++++++++++++++++++++++++++++++-----------
 src/cl_thread.h             |   6 +-
 6 files changed, 205 insertions(+), 74 deletions(-)

diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 6a699c0..802d313 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -60,6 +60,7 @@ cl_command_queue_new(cl_context ctx)
   /* The queue also belongs to its context */
   cl_context_add_ref(ctx);
 
+
 exit:
   return queue;
 error:
@@ -74,6 +75,7 @@ cl_command_queue_delete(cl_command_queue queue)
   assert(queue);
   if (atomic_dec(&queue->ref_n) != 1) return;
 
+  if (queue->ref_n < 0)
   /* Remove it from the list */
   assert(queue->ctx);
   pthread_mutex_lock(&queue->ctx->queue_lock);
@@ -89,7 +91,7 @@ cl_command_queue_delete(cl_command_queue queue)
     queue->fulsim_out = NULL;
   }
 
-  cl_thread_data_destroy(queue->thread_data);
+  cl_thread_data_destroy(queue);
   queue->thread_data = NULL;
   cl_mem_delete(queue->perf);
   cl_context_delete(queue->ctx);
@@ -430,7 +432,7 @@ cl_command_queue_flush(cl_command_queue queue)
 LOCAL cl_int
 cl_command_queue_finish(cl_command_queue queue)
 {
-  cl_gpgpu_sync(cl_get_thread_batch_buf());
+  cl_gpgpu_sync(cl_get_thread_batch_buf(queue));
   return CL_SUCCESS;
 }
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 891d6f1..d875021 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -327,7 +327,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Start a new batch buffer */
   batch_sz = cl_kernel_compute_batch_sz(ker);
   cl_gpgpu_batch_reset(gpgpu, batch_sz);
-  cl_set_thread_batch_buf(cl_gpgpu_ref_batch_buf(gpgpu));
+  cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
   cl_gpgpu_batch_start(gpgpu);
 
   /* Issue the GPGPU_WALKER command */
diff --git a/src/cl_context.c b/src/cl_context.c
index 8190e6a..452afb8 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -203,7 +203,7 @@ cl_context_delete(cl_context ctx)
   assert(ctx->buffers == NULL);
   assert(ctx->drv);
   cl_free(ctx->prop_user);
-  cl_set_thread_batch_buf(NULL);
+//  cl_set_thread_batch_buf(NULL);
   cl_driver_delete(ctx->drv);
   ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(ctx);
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 268789a..15a776d 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -106,7 +106,7 @@ LOCAL cl_device_id
 cl_get_gt_device(void)
 {
   cl_device_id ret = NULL;
-  cl_set_thread_batch_buf(NULL);
+ // cl_set_thread_batch_buf(NULL);
   const int device_id = cl_driver_get_device_id();
   cl_device_id device = NULL;
 
diff --git a/src/cl_thread.c b/src/cl_thread.c
index cadc3cd..1b517ac 100644
--- a/src/cl_thread.c
+++ b/src/cl_thread.c
@@ -15,113 +15,242 @@
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
  */
+#include <string.h>
+#include <stdio.h>
 
 #include "cl_thread.h"
 #include "cl_alloc.h"
 #include "cl_utils.h"
 
-static __thread void* thread_batch_buf = NULL;
+/* Because the cl_command_queue can be used in several threads simultaneously but
+   without add ref to it, we now handle it like this:
+   Keep one threads_slot_array, every time the thread get gpgpu or batch buffer, if it
+   does not have a slot, assign it.
+   The resources are keeped in queue private, and resize it if needed.
+   When the thread exit, the slot will be set invalid.
+   When queue released, all the resources will be released. If user still enqueue, flush
+   or finish the queue after it has been released, the behavior is undefined.
+   TODO: Need to shrink the slot map.
+   */
 
-typedef struct _cl_thread_spec_data {
+static int thread_array_num = 1;
+static int *thread_slot_map = NULL;
+static int thread_magic_num = 1;
+static pthread_mutex_t thread_queue_map_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_key_t destroy_key;
+
+static __thread int thread_id = -1;
+static __thread int thread_magic = -1;
+
+typedef struct _thread_spec_data {
   cl_gpgpu gpgpu ;
   int valid;
-}cl_thread_spec_data;
+  void* thread_batch_buf;
+  int thread_magic;
+} thread_spec_data;
 
-void cl_set_thread_batch_buf(void* buf) {
-  if (thread_batch_buf) {
-    cl_gpgpu_unref_batch_buf(thread_batch_buf);
-  }
-  thread_batch_buf = buf;
-}
+typedef struct _queue_thread_private {
+  thread_spec_data**  threads_data;
+  int threads_data_num;
+  pthread_mutex_t thread_data_lock;
+} queue_thread_private;
 
-void* cl_get_thread_batch_buf(void) {
-  return thread_batch_buf;
+static void thread_data_destructor(void *dummy) {
+  pthread_mutex_lock(&thread_queue_map_lock);
+  thread_slot_map[thread_id] = 0;
+  pthread_mutex_unlock(&thread_queue_map_lock);
+  free(dummy);
 }
 
-cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
+static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int create)
 {
-  pthread_key_t* key = queue->thread_data;
-  cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key);
-
-  if (!thread_spec_data) {
-    TRY_ALLOC_NO_ERR(thread_spec_data, CALLOC(struct _cl_thread_spec_data));
-    if (pthread_setspecific(*key, thread_spec_data)) {
-      cl_free(thread_spec_data);
-      return NULL;
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  thread_spec_data* spec = NULL;
+  int i = 0;
+  int *old;
+
+  if (thread_id == -1) {
+    void * dummy = malloc(sizeof(int));
+
+    pthread_mutex_lock(&thread_queue_map_lock);
+    for (i = 0; i < thread_array_num; i++) {
+      if (thread_slot_map[i] == 0) {
+        thread_id = i;
+        break;
+      }
+    }
+
+    if (i == thread_array_num) {
+      thread_array_num *= 2;
+      old = thread_slot_map;
+      thread_slot_map = malloc(sizeof(int) * thread_array_num);
+      memset(thread_slot_map, 0, (sizeof(int) * thread_array_num));
+      for (i = 0; i < thread_array_num/2; i++) {
+        thread_slot_map[i] = old[i];
+      }
+
+      thread_id = thread_array_num/2;
+      cl_free(old);
+    }
+
+    thread_slot_map[thread_id] = 1;
+
+    thread_magic = thread_magic_num++;
+    pthread_mutex_unlock(&thread_queue_map_lock);
+
+    pthread_setspecific(destroy_key, dummy);
+  }
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  if (thread_array_num > thread_private->threads_data_num) {// just enlarge
+    thread_spec_data ** old = thread_private->threads_data;
+    int old_num = thread_private->threads_data_num;
+    thread_private->threads_data_num = thread_array_num;
+    thread_private->threads_data = malloc(thread_private->threads_data_num * sizeof(void *));
+    memset(thread_private->threads_data, 0, sizeof(void*) * thread_private->threads_data_num);
+    for (i = 0; i < old_num; i++) {
+      thread_private->threads_data[i] = old[i];
     }
+    free(old);
   }
 
-  if (!thread_spec_data->valid) {
-    TRY_ALLOC_NO_ERR(thread_spec_data->gpgpu, cl_gpgpu_new(queue->ctx->drv));
-    thread_spec_data->valid = 1;
+  assert(thread_id != -1 && thread_id < thread_array_num);
+  spec = thread_private->threads_data[thread_id];
+  if (!spec && create) {
+	spec = CALLOC(thread_spec_data);
+	spec->thread_magic = thread_magic;
+	thread_private->threads_data[thread_id] = spec;
   }
 
-error:
-  return thread_spec_data->gpgpu;
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  return spec;
 }
 
-void cl_invalid_thread_gpgpu(cl_command_queue queue)
+void* cl_thread_data_create(void)
 {
-  pthread_key_t* key = queue->thread_data;
-  cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key);
+  queue_thread_private* thread_private = CALLOC(queue_thread_private);
 
-  if (!thread_spec_data) {
-    return;
-  }
+  if (thread_private == NULL)
+    return NULL;
 
-  if (!thread_spec_data->valid) {
-    return;
+  if (thread_slot_map == NULL) {
+    pthread_mutex_lock(&thread_queue_map_lock);
+    thread_slot_map = malloc(sizeof(int) * thread_array_num);
+    memset(thread_slot_map, 0, (sizeof(int) * thread_array_num));
+    pthread_mutex_unlock(&thread_queue_map_lock);
+
+    pthread_key_create(&destroy_key, thread_data_destructor);
   }
 
-  assert(thread_spec_data->gpgpu);
-  cl_gpgpu_delete(thread_spec_data->gpgpu);
-  thread_spec_data->valid = 0;
+  pthread_mutex_init(&thread_private->thread_data_lock, NULL);
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  thread_private->threads_data = malloc(thread_array_num * sizeof(void *));
+  memset(thread_private->threads_data, 0, sizeof(void*) * thread_array_num);
+  thread_private->threads_data_num = thread_array_num;
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  return thread_private;
 }
 
-static void thread_data_destructor(void *data) {
-  cl_thread_spec_data* thread_spec_data = (cl_thread_spec_data *)data;
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
+{
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
 
-  if (thread_batch_buf) {
-    cl_gpgpu_unref_batch_buf(thread_batch_buf);
-    thread_batch_buf = NULL;
+  if (!spec->thread_magic && spec->thread_magic != thread_magic) {
+    //We may get the slot from last thread. So free the resource.
+    spec->valid = 0;
   }
 
-  if (thread_spec_data->valid)
-    cl_gpgpu_delete(thread_spec_data->gpgpu);
-  cl_free(thread_spec_data);
+  if (!spec->valid) {
+    if (spec->thread_batch_buf) {
+      cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
+      spec->thread_batch_buf = NULL;
+    }
+    if (spec->gpgpu) {
+      cl_gpgpu_delete(spec->gpgpu);
+      spec->gpgpu = NULL;
+    }
+    TRY_ALLOC_NO_ERR(spec->gpgpu, cl_gpgpu_new(queue->ctx->drv));
+    spec->valid = 1;
+  }
+
+ error:
+  return spec->gpgpu;
 }
 
-/* Create the thread specific data. */
-void* cl_thread_data_create(void)
+void cl_set_thread_batch_buf(cl_command_queue queue, void* buf)
 {
-  int rc = 0;
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
 
-  pthread_key_t *thread_specific_key = CALLOC(pthread_key_t);
-  if (thread_specific_key == NULL)
-    return NULL;
+  assert(spec && spec->thread_magic == thread_magic);
+
+  if (spec->thread_batch_buf) {
+    cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
+  }
+  spec->thread_batch_buf = buf;
+}
 
-  rc = pthread_key_create(thread_specific_key, thread_data_destructor);
+void* cl_get_thread_batch_buf(cl_command_queue queue) {
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
 
-  if (rc != 0)
-    return NULL;
+  assert(spec && spec->thread_magic == thread_magic);
+
+  return spec->thread_batch_buf;
+}
+
+void cl_invalid_thread_gpgpu(cl_command_queue queue)
+{
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  thread_spec_data* spec = NULL;
 
-  return thread_specific_key;
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  spec = thread_private->threads_data[thread_id];
+  assert(spec);
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  if (!spec->valid) {
+    return;
+  }
+
+  assert(spec->gpgpu);
+  cl_gpgpu_delete(spec->gpgpu);
+  spec->gpgpu = NULL;
+  spec->valid = 0;
 }
 
 /* The destructor for clean the thread specific data. */
-void cl_thread_data_destroy(void * data)
+void cl_thread_data_destroy(cl_command_queue queue)
 {
-  pthread_key_t *thread_specific_key = (pthread_key_t *)data;
-
-  /* First release self spec data. */
-  cl_thread_spec_data* thread_spec_data =
-         pthread_getspecific(*thread_specific_key);
-  if (thread_spec_data && thread_spec_data->valid) {
-    cl_gpgpu_delete(thread_spec_data->gpgpu);
-    if (thread_spec_data)
-      cl_free(thread_spec_data);
+  int i = 0;
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  int threads_data_num;
+  thread_spec_data** threads_data;
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  assert(thread_private->threads_data_num == thread_array_num);
+  threads_data_num = thread_private->threads_data_num;
+  threads_data = thread_private->threads_data;
+  thread_private->threads_data_num = 0;
+  thread_private->threads_data = NULL;
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+  cl_free(thread_private);
+  queue->thread_data = NULL;
+
+  for (i = 0; i < threads_data_num; i++) {
+    if (threads_data[i] != NULL && threads_data[i]->thread_batch_buf) {
+      cl_gpgpu_unref_batch_buf(threads_data[i]->thread_batch_buf);
+      threads_data[i]->thread_batch_buf = NULL;
+    }
+
+    if (threads_data[i] != NULL && threads_data[i]->valid) {
+      cl_gpgpu_delete(threads_data[i]->gpgpu);
+      threads_data[i]->gpgpu = NULL;
+      threads_data[i]->valid = 0;
+    }
+    cl_free(threads_data[i]);
   }
 
-  pthread_key_delete(*thread_specific_key);
-  cl_free(thread_specific_key);
+  cl_free(threads_data);
 }
diff --git a/src/cl_thread.h b/src/cl_thread.h
index c8ab63c..bf855a2 100644
--- a/src/cl_thread.h
+++ b/src/cl_thread.h
@@ -27,7 +27,7 @@
 void* cl_thread_data_create(void);
 
 /* The destructor for clean the thread specific data. */
-void cl_thread_data_destroy(void * data);
+void cl_thread_data_destroy(cl_command_queue queue);
 
 /* Used to get the gpgpu struct of each thread. */
 cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
@@ -36,9 +36,9 @@ cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
 void cl_invalid_thread_gpgpu(cl_command_queue queue);
 
 /* Used to set the batch buffer of each thread. */
-void cl_set_thread_batch_buf(void* buf);
+void cl_set_thread_batch_buf(cl_command_queue queue, void* buf);
 
 /* Used to get the batch buffer of each thread. */
-void* cl_get_thread_batch_buf(void);
+void* cl_get_thread_batch_buf(cl_command_queue queue);
 
 #endif /* __CL_THREAD_H__ */
-- 
1.8.3.2