[Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to thread specific context

Thu Nov 7 08:58:00 PST 2013

From: Junyan He <junyan.he at linux.intel.com>

We find some cases will use multi-threads to run on the same queue,
executing the same kernel. This will cause the gpgpu struct which
is very important for GPU context setting be destroyed because we
do not implement any sync protect on it now.
Move the gpgpu struct into thread specific space will fix this problem
because the lib_drm will do the GPU command serialization for us.
---
 src/CMakeLists.txt          |    1 +
 src/cl_command_queue.c      |   27 +++++++++++-----
 src/cl_command_queue.h      |    9 +++++-
 src/cl_command_queue_gen7.c |    7 +++--
 src/cl_event.c              |    6 ++--
 src/cl_thread.c             |   72 +++++++++++++++++++++++++++++++++++++++++++
 src/cl_thread.h             |   34 ++++++++++++++++++++
 utests/CMakeLists.txt       |    2 +-
 8 files changed, 144 insertions(+), 14 deletions(-)
 create mode 100644 src/cl_thread.c
 create mode 100644 src/cl_thread.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1e28c6c..59d330e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -39,6 +39,7 @@ set(OPENCL_SRC
     cl_command_queue.c
     cl_command_queue.h
     cl_command_queue_gen7.c
+    cl_thread.c
     cl_driver.h
     cl_driver.cpp
     cl_driver_defs.c
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 3f9d95c..3530976 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -24,6 +24,7 @@
 #include "cl_device_id.h"
 #include "cl_mem.h"
 #include "cl_utils.h"
+#include "cl_thread.h"
 #include "cl_alloc.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
@@ -43,7 +44,9 @@ cl_command_queue_new(cl_context ctx)
   queue->magic = CL_MAGIC_QUEUE_HEADER;
   queue->ref_n = 1;
   queue->ctx = ctx;
-  TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv));
+  if ((queue->thread_data = cl_thread_data_create()) == NULL) {
+    goto error;
+  }
 
   /* Append the command queue in the list */
   pthread_mutex_lock(&ctx->queue_lock);
@@ -84,9 +87,11 @@ cl_command_queue_delete(cl_command_queue queue)
     cl_mem_delete(queue->fulsim_out);
     queue->fulsim_out = NULL;
   }
+
+  cl_thread_data_destroy(queue->thread_data);
+  queue->thread_data = NULL;
   cl_mem_delete(queue->perf);
   cl_context_delete(queue->ctx);
-  cl_gpgpu_delete(queue->gpgpu);
   cl_free(queue->wait_events);
   queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(queue);
@@ -119,13 +124,15 @@ LOCAL cl_int
 cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
 {
   uint32_t i;
+  GET_QUEUE_THREAD_GPGPU(queue);
+
   for (i = 0; i < k->image_sz; i++) {
     int id = k->images[i].arg_idx;
     struct _cl_mem_image *image;
     assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
-    cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, image->base.bo, image->offset,
+    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
                         image->intel_fmt, image->image_type,
                         image->w, image->h, image->depth,
                         image->row_pitch, image->tiling);
@@ -136,6 +143,8 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
 LOCAL cl_int
 cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
 {
+  GET_QUEUE_THREAD_GPGPU(queue);
+
   /* Bind all user buffers (given by clSetKernelArg) */
   uint32_t i;
   enum gbe_arg_type arg_type; /* kind of argument */
@@ -147,9 +156,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
     if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
       struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
-      cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
     } else {
-      cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
     }
   }
 
@@ -407,14 +416,18 @@ error:
 LOCAL cl_int
 cl_command_queue_flush(cl_command_queue queue)
 {
-  cl_gpgpu_flush(queue->gpgpu);
+  GET_QUEUE_THREAD_GPGPU(queue);
+
+  cl_gpgpu_flush(gpgpu);
   return CL_SUCCESS;
 }
 
 LOCAL cl_int
 cl_command_queue_finish(cl_command_queue queue)
 {
-  cl_gpgpu_sync(queue->gpgpu);
+  GET_QUEUE_THREAD_GPGPU(queue);
+
+  cl_gpgpu_sync(gpgpu);
   return CL_SUCCESS;
 }
 
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 9396fd7..40c272c 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -22,6 +22,7 @@
 
 #include "cl_internals.h"
 #include "cl_driver.h"
+#include "cl_thread.h"
 #include "CL/cl.h"
 #include <stdint.h>
 
@@ -40,11 +41,17 @@ struct _cl_command_queue {
   cl_event  last_event;                /* The last event in the queue, for enqueue mark used */
   cl_command_queue_properties  props;  /* Queue properties */
   cl_command_queue prev, next;         /* We chain the command queues together */
-  cl_gpgpu gpgpu;                      /* Setup all GEN commands */
+  void *thread_data;                   /* Used to store thread context data */
   cl_mem perf;                         /* Where to put the perf counters */
   cl_mem fulsim_out;                   /* Fulsim will output this buffer */
 };
 
+/* The macro to get the thread specified gpgpu struct. */
+#define GET_QUEUE_THREAD_GPGPU(queue) \
+	cl_gpgpu gpgpu = queue ? cl_get_thread_gpgpu(queue) : NULL;  \
+	if (queue) \
+	  assert(gpgpu);
+
 /* Allocate and initialize a new command queue. Also insert it in the list of
  * command queue in the associated context
  */
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 65f8e17..fb7a8a3 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -99,6 +99,7 @@ static void
 cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
 {
   /* calculate constant buffer size */
+  GET_QUEUE_THREAD_GPGPU(queue);
   int32_t arg;
   size_t offset;
   gbe_program prog = ker->program->opaque;
@@ -115,7 +116,7 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
   if(global_const_size == 0 && constant_buf_size == 0)
      return;
 
-  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, constant_buf_size + global_const_size + 4);
+  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, constant_buf_size + global_const_size + 4);
   cl_buffer_map(bo, 1);
   char * cst_addr = cl_buffer_get_virtual(bo);
   offset = 0;
@@ -256,8 +257,8 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
                                const size_t *global_wk_sz,
                                const size_t *local_wk_sz)
 {
+  GET_QUEUE_THREAD_GPGPU(queue);
   cl_context ctx = queue->ctx;
-  cl_gpgpu gpgpu = queue->gpgpu;
   char *final_curbe = NULL;  /* Includes them and one sub-buffer per group */
   cl_gpgpu_kernel kernel;
   const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
@@ -297,7 +298,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind user images */
   cl_command_queue_bind_image(queue, ker);
   /* Bind all samplers */
-  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
+  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
 
   cl_setup_scratch(gpgpu, ker);
   /* Bind a stack if needed */
diff --git a/src/cl_event.c b/src/cl_event.c
index 1dc02ae..028dfb6 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -48,6 +48,7 @@ cl_event_is_gpu_command_type(cl_command_type type)
 cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
 {
   cl_event event = NULL;
+  GET_QUEUE_THREAD_GPGPU(queue);
 
   /* Allocate and inialize the structure itself */
   TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
@@ -75,7 +76,7 @@ cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type ty
   else {
     event->status = CL_QUEUED;
     if(cl_event_is_gpu_command_type(event->type))
-      event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
+      event->gpgpu_event = cl_gpgpu_event_new(gpgpu);
   }
   cl_event_add_ref(event);       //dec when complete
   event->user_cb = NULL;
@@ -257,6 +258,7 @@ void cl_event_new_enqueue_callback(cl_event event,
   user_event *user_events, *u_ev;
   cl_command_queue queue = event->queue;
   cl_int i;
+  GET_QUEUE_THREAD_GPGPU(data->queue);
 
   /* Allocate and inialize the structure itself */
   TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
@@ -336,7 +338,7 @@ void cl_event_new_enqueue_callback(cl_event event,
     }
   }
   if(data->queue != NULL && event->gpgpu_event != NULL) {
-    cl_gpgpu_event_pending(data->queue->gpgpu, event->gpgpu_event);
+    cl_gpgpu_event_pending(gpgpu, event->gpgpu_event);
     data->ptr = (void *)event->gpgpu_event;
   }
   cb->data = *data;
diff --git a/src/cl_thread.c b/src/cl_thread.c
new file mode 100644
index 0000000..fbad5c5
--- /dev/null
+++ b/src/cl_thread.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_thread.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
+{
+  pthread_key_t* key = queue->thread_data;
+  cl_gpgpu gpgpu = pthread_getspecific(*key);
+
+  if (!gpgpu) {
+    TRY_ALLOC_NO_ERR (gpgpu, cl_gpgpu_new(queue->ctx->drv));
+  }
+
+  if (pthread_setspecific(*key, gpgpu)) {
+    cl_gpgpu_delete(gpgpu);
+    goto error;
+  }
+
+exit:
+  return gpgpu;
+error:
+  pthread_setspecific(*key, NULL);
+  goto exit;
+}
+
+static void thread_data_destructor(void *data) {
+  cl_gpgpu gpgpu = (cl_gpgpu)data;
+  cl_gpgpu_delete(gpgpu);
+}
+
+/* Create the thread specific data. */
+void* cl_thread_data_create(void)
+{
+  int rc = 0;
+
+  pthread_key_t *thread_specific_key = CALLOC(pthread_key_t);
+  if (thread_specific_key == NULL)
+    return NULL;
+
+  rc = pthread_key_create(thread_specific_key, thread_data_destructor);
+
+  if (rc != 0)
+    return NULL;
+
+  return thread_specific_key;
+}
+
+/* The destructor for clean the thread specific data. */
+void cl_thread_data_destroy(void * data)
+{
+  pthread_key_t *thread_specific_key = (pthread_key_t *)data;
+  pthread_key_delete(*thread_specific_key);
+  cl_free(thread_specific_key);
+}
diff --git a/src/cl_thread.h b/src/cl_thread.h
new file mode 100644
index 0000000..65f1bcf
--- /dev/null
+++ b/src/cl_thread.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __CL_THREAD_H__
+#define __CL_THREAD_H__
+
+#include <pthread.h>
+#include "cl_internals.h"
+#include "cl_command_queue.h"
+
+/* Create the thread specific data. */
+void* cl_thread_data_create(void);
+
+/* The destructor for clean the thread specific data. */
+void cl_thread_data_destroy(void * data);
+
+/* Used to get the gpgpu struct of each thread. */
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
+#endif /* __CL_THREAD_H__ */
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index c87cba8..5e0bc19 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -53,7 +53,7 @@ set (utests_sources
   compiler_if_else.cpp
   compiler_integer_division.cpp
   compiler_integer_remainder.cpp
-	compiler_insert_vector.cpp
+  compiler_insert_vector.cpp
   compiler_lower_return0.cpp
   compiler_lower_return1.cpp
   compiler_lower_return2.cpp
-- 
1.7.9.5