[Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to thread specific context

Thu Nov 7 16:29:05 PST 2013

I agree with you that use thread data is better than locking.
One comment, how about to use thread local storage to simplify
this patch as below:

struct _cl_commonand_queue {
...
__thread cl_gpgpu gpgpu;
...
};

Then in the initialization stage, set it to NULL;

queue->gpgpu = NULL;

In the head of each functions which use queue->gpgpu, add the following
code:

if (queue->gpgpu == NULL)
  TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv));

Then we don't need to change any other code?

What's your opinion?

On Fri, Nov 08, 2013 at 12:58:00AM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> We find some cases will use multi-threads to run on the same queue,
> executing the same kernel. This will cause the gpgpu struct which
> is very important for GPU context setting be destroyed because we
> do not implement any sync protect on it now.
> Move the gpgpu struct into thread specific space will fix this problem
> because the lib_drm will do the GPU command serialization for us.
> ---
>  src/CMakeLists.txt          |    1 +
>  src/cl_command_queue.c      |   27 +++++++++++-----
>  src/cl_command_queue.h      |    9 +++++-
>  src/cl_command_queue_gen7.c |    7 +++--
>  src/cl_event.c              |    6 ++--
>  src/cl_thread.c             |   72 +++++++++++++++++++++++++++++++++++++++++++
>  src/cl_thread.h             |   34 ++++++++++++++++++++
>  utests/CMakeLists.txt       |    2 +-
>  8 files changed, 144 insertions(+), 14 deletions(-)
>  create mode 100644 src/cl_thread.c
>  create mode 100644 src/cl_thread.h
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> index 1e28c6c..59d330e 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -39,6 +39,7 @@ set(OPENCL_SRC
>      cl_command_queue.c
>      cl_command_queue.h
>      cl_command_queue_gen7.c
> +    cl_thread.c
>      cl_driver.h
>      cl_driver.cpp
>      cl_driver_defs.c
> diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> index 3f9d95c..3530976 100644
> --- a/src/cl_command_queue.c
> +++ b/src/cl_command_queue.c
> @@ -24,6 +24,7 @@
>  #include "cl_device_id.h"
>  #include "cl_mem.h"
>  #include "cl_utils.h"
> +#include "cl_thread.h"
>  #include "cl_alloc.h"
>  #include "cl_driver.h"
>  #include "cl_khr_icd.h"
> @@ -43,7 +44,9 @@ cl_command_queue_new(cl_context ctx)
>    queue->magic = CL_MAGIC_QUEUE_HEADER;
>    queue->ref_n = 1;
>    queue->ctx = ctx;
> -  TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv));
> +  if ((queue->thread_data = cl_thread_data_create()) == NULL) {
> +    goto error;
> +  }
>  
>    /* Append the command queue in the list */
>    pthread_mutex_lock(&ctx->queue_lock);
> @@ -84,9 +87,11 @@ cl_command_queue_delete(cl_command_queue queue)
>      cl_mem_delete(queue->fulsim_out);
>      queue->fulsim_out = NULL;
>    }
> +
> +  cl_thread_data_destroy(queue->thread_data);
> +  queue->thread_data = NULL;
>    cl_mem_delete(queue->perf);
>    cl_context_delete(queue->ctx);
> -  cl_gpgpu_delete(queue->gpgpu);
>    cl_free(queue->wait_events);
>    queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
>    cl_free(queue);
> @@ -119,13 +124,15 @@ LOCAL cl_int
>  cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
>  {
>    uint32_t i;
> +  GET_QUEUE_THREAD_GPGPU(queue);
> +
>    for (i = 0; i < k->image_sz; i++) {
>      int id = k->images[i].arg_idx;
>      struct _cl_mem_image *image;
>      assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
>      image = cl_mem_image(k->args[id].mem);
>      set_image_info(k->curbe, &k->images[i], image);
> -    cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, image->base.bo, image->offset,
> +    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
>                          image->intel_fmt, image->image_type,
>                          image->w, image->h, image->depth,
>                          image->row_pitch, image->tiling);
> @@ -136,6 +143,8 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
>  LOCAL cl_int
>  cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
>  {
> +  GET_QUEUE_THREAD_GPGPU(queue);
> +
>    /* Bind all user buffers (given by clSetKernelArg) */
>    uint32_t i;
>    enum gbe_arg_type arg_type; /* kind of argument */
> @@ -147,9 +156,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
>      offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
>      if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
>        struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
> -      cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
> +      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
>      } else {
> -      cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
> +      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
>      }
>    }
>  
> @@ -407,14 +416,18 @@ error:
>  LOCAL cl_int
>  cl_command_queue_flush(cl_command_queue queue)
>  {
> -  cl_gpgpu_flush(queue->gpgpu);
> +  GET_QUEUE_THREAD_GPGPU(queue);
> +
> +  cl_gpgpu_flush(gpgpu);
>    return CL_SUCCESS;
>  }
>  
>  LOCAL cl_int
>  cl_command_queue_finish(cl_command_queue queue)
>  {
> -  cl_gpgpu_sync(queue->gpgpu);
> +  GET_QUEUE_THREAD_GPGPU(queue);
> +
> +  cl_gpgpu_sync(gpgpu);
>    return CL_SUCCESS;
>  }
>  
> diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
> index 9396fd7..40c272c 100644
> --- a/src/cl_command_queue.h
> +++ b/src/cl_command_queue.h
> @@ -22,6 +22,7 @@
>  
>  #include "cl_internals.h"
>  #include "cl_driver.h"
> +#include "cl_thread.h"
>  #include "CL/cl.h"
>  #include <stdint.h>
>  
> @@ -40,11 +41,17 @@ struct _cl_command_queue {
>    cl_event  last_event;                /* The last event in the queue, for enqueue mark used */
>    cl_command_queue_properties  props;  /* Queue properties */
>    cl_command_queue prev, next;         /* We chain the command queues together */
> -  cl_gpgpu gpgpu;                      /* Setup all GEN commands */
> +  void *thread_data;                   /* Used to store thread context data */
>    cl_mem perf;                         /* Where to put the perf counters */
>    cl_mem fulsim_out;                   /* Fulsim will output this buffer */
>  };
>  
> +/* The macro to get the thread specified gpgpu struct. */
> +#define GET_QUEUE_THREAD_GPGPU(queue) \
> +	cl_gpgpu gpgpu = queue ? cl_get_thread_gpgpu(queue) : NULL;  \
> +	if (queue) \
> +	  assert(gpgpu);
> +
>  /* Allocate and initialize a new command queue. Also insert it in the list of
>   * command queue in the associated context
>   */
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index 65f8e17..fb7a8a3 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -99,6 +99,7 @@ static void
>  cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
>  {
>    /* calculate constant buffer size */
> +  GET_QUEUE_THREAD_GPGPU(queue);
>    int32_t arg;
>    size_t offset;
>    gbe_program prog = ker->program->opaque;
> @@ -115,7 +116,7 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
>    if(global_const_size == 0 && constant_buf_size == 0)
>       return;
>  
> -  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, constant_buf_size + global_const_size + 4);
> +  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, constant_buf_size + global_const_size + 4);
>    cl_buffer_map(bo, 1);
>    char * cst_addr = cl_buffer_get_virtual(bo);
>    offset = 0;
> @@ -256,8 +257,8 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>                                 const size_t *global_wk_sz,
>                                 const size_t *local_wk_sz)
>  {
> +  GET_QUEUE_THREAD_GPGPU(queue);
>    cl_context ctx = queue->ctx;
> -  cl_gpgpu gpgpu = queue->gpgpu;
>    char *final_curbe = NULL;  /* Includes them and one sub-buffer per group */
>    cl_gpgpu_kernel kernel;
>    const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
> @@ -297,7 +298,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>    /* Bind user images */
>    cl_command_queue_bind_image(queue, ker);
>    /* Bind all samplers */
> -  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
> +  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
>  
>    cl_setup_scratch(gpgpu, ker);
>    /* Bind a stack if needed */
> diff --git a/src/cl_event.c b/src/cl_event.c
> index 1dc02ae..028dfb6 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -48,6 +48,7 @@ cl_event_is_gpu_command_type(cl_command_type type)
>  cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
>  {
>    cl_event event = NULL;
> +  GET_QUEUE_THREAD_GPGPU(queue);
>  
>    /* Allocate and inialize the structure itself */
>    TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
> @@ -75,7 +76,7 @@ cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type ty
>    else {
>      event->status = CL_QUEUED;
>      if(cl_event_is_gpu_command_type(event->type))
> -      event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
> +      event->gpgpu_event = cl_gpgpu_event_new(gpgpu);
>    }
>    cl_event_add_ref(event);       //dec when complete
>    event->user_cb = NULL;
> @@ -257,6 +258,7 @@ void cl_event_new_enqueue_callback(cl_event event,
>    user_event *user_events, *u_ev;
>    cl_command_queue queue = event->queue;
>    cl_int i;
> +  GET_QUEUE_THREAD_GPGPU(data->queue);
>  
>    /* Allocate and inialize the structure itself */
>    TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
> @@ -336,7 +338,7 @@ void cl_event_new_enqueue_callback(cl_event event,
>      }
>    }
>    if(data->queue != NULL && event->gpgpu_event != NULL) {
> -    cl_gpgpu_event_pending(data->queue->gpgpu, event->gpgpu_event);
> +    cl_gpgpu_event_pending(gpgpu, event->gpgpu_event);
>      data->ptr = (void *)event->gpgpu_event;
>    }
>    cb->data = *data;
> diff --git a/src/cl_thread.c b/src/cl_thread.c
> new file mode 100644
> index 0000000..fbad5c5
> --- /dev/null
> +++ b/src/cl_thread.c
> @@ -0,0 +1,72 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library. If not, see <http://www.gnu.org/licenses/>.
> + *
> + */
> +
> +#include "cl_thread.h"
> +#include "cl_alloc.h"
> +#include "cl_utils.h"
> +
> +cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
> +{
> +  pthread_key_t* key = queue->thread_data;
> +  cl_gpgpu gpgpu = pthread_getspecific(*key);
> +
> +  if (!gpgpu) {
> +    TRY_ALLOC_NO_ERR (gpgpu, cl_gpgpu_new(queue->ctx->drv));
> +  }
> +
> +  if (pthread_setspecific(*key, gpgpu)) {
> +    cl_gpgpu_delete(gpgpu);
> +    goto error;
> +  }
> +
> +exit:
> +  return gpgpu;
> +error:
> +  pthread_setspecific(*key, NULL);
> +  goto exit;
> +}
> +
> +static void thread_data_destructor(void *data) {
> +  cl_gpgpu gpgpu = (cl_gpgpu)data;
> +  cl_gpgpu_delete(gpgpu);
> +}
> +
> +/* Create the thread specific data. */
> +void* cl_thread_data_create(void)
> +{
> +  int rc = 0;
> +
> +  pthread_key_t *thread_specific_key = CALLOC(pthread_key_t);
> +  if (thread_specific_key == NULL)
> +    return NULL;
> +
> +  rc = pthread_key_create(thread_specific_key, thread_data_destructor);
> +
> +  if (rc != 0)
> +    return NULL;
> +
> +  return thread_specific_key;
> +}
> +
> +/* The destructor for clean the thread specific data. */
> +void cl_thread_data_destroy(void * data)
> +{
> +  pthread_key_t *thread_specific_key = (pthread_key_t *)data;
> +  pthread_key_delete(*thread_specific_key);
> +  cl_free(thread_specific_key);
> +}
> diff --git a/src/cl_thread.h b/src/cl_thread.h
> new file mode 100644
> index 0000000..65f1bcf
> --- /dev/null
> +++ b/src/cl_thread.h
> @@ -0,0 +1,34 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library. If not, see <http://www.gnu.org/licenses/>.
> + *
> + */
> +
> +#ifndef __CL_THREAD_H__
> +#define __CL_THREAD_H__
> +
> +#include <pthread.h>
> +#include "cl_internals.h"
> +#include "cl_command_queue.h"
> +
> +/* Create the thread specific data. */
> +void* cl_thread_data_create(void);
> +
> +/* The destructor for clean the thread specific data. */
> +void cl_thread_data_destroy(void * data);
> +
> +/* Used to get the gpgpu struct of each thread. */
> +cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
> +#endif /* __CL_THREAD_H__ */
> diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
> index c87cba8..5e0bc19 100644
> --- a/utests/CMakeLists.txt
> +++ b/utests/CMakeLists.txt
> @@ -53,7 +53,7 @@ set (utests_sources
>    compiler_if_else.cpp
>    compiler_integer_division.cpp
>    compiler_integer_remainder.cpp
> -	compiler_insert_vector.cpp
> +  compiler_insert_vector.cpp
>    compiler_lower_return0.cpp
>    compiler_lower_return1.cpp
>    compiler_lower_return2.cpp
> -- 
> 1.7.9.5
> 
> 
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet