[Beignet] [PATCH] Move the gpgpu struct from cl_command_queue to thread specific context

Thu Nov 7 18:36:59 PST 2013

I have tried __thread extension of GCC as you said.
It seems OK for global var but now workable for struct field.
I think struct memory may be allocated from heap, may be
on the stack, and can also be global var. So it may be impossible
for compiler to figure out how to store one of its field in thread
specific space.

As you said, it really may have a problem if one thread configure all
the queue context and then create another thread to exec NDRange.
But in current code, the gpu state will always be inited every time
when call exec NDRange.

On Fri, 2013-11-08 at 08:29 +0800, Zhigang Gong wrote:
> I agree with you that use thread data is better than locking.
> One comment, how about to use thread local storage to simplify
> this patch as below:
> 
> struct _cl_commonand_queue {
> ...
> __thread cl_gpgpu gpgpu;
> ...
> };
> 
> Then in the initialization stage, set it to NULL;
> 
> queue->gpgpu = NULL;
> 
> In the head of each functions which use queue->gpgpu, add the following
> code:
> 
> if (queue->gpgpu == NULL)
>   TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv));
> 
> Then we don't need to change any other code?
> 
> What's your opinion?
> 
> On Fri, Nov 08, 2013 at 12:58:00AM +0800, junyan.he at inbox.com wrote:
> > From: Junyan He <junyan.he at linux.intel.com>
> > 
> > We find some cases will use multi-threads to run on the same queue,
> > executing the same kernel. This will cause the gpgpu struct which
> > is very important for GPU context setting be destroyed because we
> > do not implement any sync protect on it now.
> > Move the gpgpu struct into thread specific space will fix this problem
> > because the lib_drm will do the GPU command serialization for us.
> > ---
> >  src/CMakeLists.txt          |    1 +
> >  src/cl_command_queue.c      |   27 +++++++++++-----
> >  src/cl_command_queue.h      |    9 +++++-
> >  src/cl_command_queue_gen7.c |    7 +++--
> >  src/cl_event.c              |    6 ++--
> >  src/cl_thread.c             |   72 +++++++++++++++++++++++++++++++++++++++++++
> >  src/cl_thread.h             |   34 ++++++++++++++++++++
> >  utests/CMakeLists.txt       |    2 +-
> >  8 files changed, 144 insertions(+), 14 deletions(-)
> >  create mode 100644 src/cl_thread.c
> >  create mode 100644 src/cl_thread.h
> > 
> > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> > index 1e28c6c..59d330e 100644
> > --- a/src/CMakeLists.txt
> > +++ b/src/CMakeLists.txt
> > @@ -39,6 +39,7 @@ set(OPENCL_SRC
> >      cl_command_queue.c
> >      cl_command_queue.h
> >      cl_command_queue_gen7.c
> > +    cl_thread.c
> >      cl_driver.h
> >      cl_driver.cpp
> >      cl_driver_defs.c
> > diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> > index 3f9d95c..3530976 100644
> > --- a/src/cl_command_queue.c
> > +++ b/src/cl_command_queue.c
> > @@ -24,6 +24,7 @@
> >  #include "cl_device_id.h"
> >  #include "cl_mem.h"
> >  #include "cl_utils.h"
> > +#include "cl_thread.h"
> >  #include "cl_alloc.h"
> >  #include "cl_driver.h"
> >  #include "cl_khr_icd.h"
> > @@ -43,7 +44,9 @@ cl_command_queue_new(cl_context ctx)
> >    queue->magic = CL_MAGIC_QUEUE_HEADER;
> >    queue->ref_n = 1;
> >    queue->ctx = ctx;
> > -  TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv));
> > +  if ((queue->thread_data = cl_thread_data_create()) == NULL) {
> > +    goto error;
> > +  }
> >  
> >    /* Append the command queue in the list */
> >    pthread_mutex_lock(&ctx->queue_lock);
> > @@ -84,9 +87,11 @@ cl_command_queue_delete(cl_command_queue queue)
> >      cl_mem_delete(queue->fulsim_out);
> >      queue->fulsim_out = NULL;
> >    }
> > +
> > +  cl_thread_data_destroy(queue->thread_data);
> > +  queue->thread_data = NULL;
> >    cl_mem_delete(queue->perf);
> >    cl_context_delete(queue->ctx);
> > -  cl_gpgpu_delete(queue->gpgpu);
> >    cl_free(queue->wait_events);
> >    queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
> >    cl_free(queue);
> > @@ -119,13 +124,15 @@ LOCAL cl_int
> >  cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
> >  {
> >    uint32_t i;
> > +  GET_QUEUE_THREAD_GPGPU(queue);
> > +
> >    for (i = 0; i < k->image_sz; i++) {
> >      int id = k->images[i].arg_idx;
> >      struct _cl_mem_image *image;
> >      assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
> >      image = cl_mem_image(k->args[id].mem);
> >      set_image_info(k->curbe, &k->images[i], image);
> > -    cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, image->base.bo, image->offset,
> > +    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
> >                          image->intel_fmt, image->image_type,
> >                          image->w, image->h, image->depth,
> >                          image->row_pitch, image->tiling);
> > @@ -136,6 +143,8 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
> >  LOCAL cl_int
> >  cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
> >  {
> > +  GET_QUEUE_THREAD_GPGPU(queue);
> > +
> >    /* Bind all user buffers (given by clSetKernelArg) */
> >    uint32_t i;
> >    enum gbe_arg_type arg_type; /* kind of argument */
> > @@ -147,9 +156,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
> >      offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
> >      if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
> >        struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
> > -      cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
> > +      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
> >      } else {
> > -      cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
> > +      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
> >      }
> >    }
> >  
> > @@ -407,14 +416,18 @@ error:
> >  LOCAL cl_int
> >  cl_command_queue_flush(cl_command_queue queue)
> >  {
> > -  cl_gpgpu_flush(queue->gpgpu);
> > +  GET_QUEUE_THREAD_GPGPU(queue);
> > +
> > +  cl_gpgpu_flush(gpgpu);
> >    return CL_SUCCESS;
> >  }
> >  
> >  LOCAL cl_int
> >  cl_command_queue_finish(cl_command_queue queue)
> >  {
> > -  cl_gpgpu_sync(queue->gpgpu);
> > +  GET_QUEUE_THREAD_GPGPU(queue);
> > +
> > +  cl_gpgpu_sync(gpgpu);
> >    return CL_SUCCESS;
> >  }
> >  
> > diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
> > index 9396fd7..40c272c 100644
> > --- a/src/cl_command_queue.h
> > +++ b/src/cl_command_queue.h
> > @@ -22,6 +22,7 @@
> >  
> >  #include "cl_internals.h"
> >  #include "cl_driver.h"
> > +#include "cl_thread.h"
> >  #include "CL/cl.h"
> >  #include <stdint.h>
> >  
> > @@ -40,11 +41,17 @@ struct _cl_command_queue {
> >    cl_event  last_event;                /* The last event in the queue, for enqueue mark used */
> >    cl_command_queue_properties  props;  /* Queue properties */
> >    cl_command_queue prev, next;         /* We chain the command queues together */
> > -  cl_gpgpu gpgpu;                      /* Setup all GEN commands */
> > +  void *thread_data;                   /* Used to store thread context data */
> >    cl_mem perf;                         /* Where to put the perf counters */
> >    cl_mem fulsim_out;                   /* Fulsim will output this buffer */
> >  };
> >  
> > +/* The macro to get the thread specified gpgpu struct. */
> > +#define GET_QUEUE_THREAD_GPGPU(queue) \
> > +	cl_gpgpu gpgpu = queue ? cl_get_thread_gpgpu(queue) : NULL;  \
> > +	if (queue) \
> > +	  assert(gpgpu);
> > +
> >  /* Allocate and initialize a new command queue. Also insert it in the list of
> >   * command queue in the associated context
> >   */
> > diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> > index 65f8e17..fb7a8a3 100644
> > --- a/src/cl_command_queue_gen7.c
> > +++ b/src/cl_command_queue_gen7.c
> > @@ -99,6 +99,7 @@ static void
> >  cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
> >  {
> >    /* calculate constant buffer size */
> > +  GET_QUEUE_THREAD_GPGPU(queue);
> >    int32_t arg;
> >    size_t offset;
> >    gbe_program prog = ker->program->opaque;
> > @@ -115,7 +116,7 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
> >    if(global_const_size == 0 && constant_buf_size == 0)
> >       return;
> >  
> > -  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, constant_buf_size + global_const_size + 4);
> > +  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, constant_buf_size + global_const_size + 4);
> >    cl_buffer_map(bo, 1);
> >    char * cst_addr = cl_buffer_get_virtual(bo);
> >    offset = 0;
> > @@ -256,8 +257,8 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
> >                                 const size_t *global_wk_sz,
> >                                 const size_t *local_wk_sz)
> >  {
> > +  GET_QUEUE_THREAD_GPGPU(queue);
> >    cl_context ctx = queue->ctx;
> > -  cl_gpgpu gpgpu = queue->gpgpu;
> >    char *final_curbe = NULL;  /* Includes them and one sub-buffer per group */
> >    cl_gpgpu_kernel kernel;
> >    const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
> > @@ -297,7 +298,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
> >    /* Bind user images */
> >    cl_command_queue_bind_image(queue, ker);
> >    /* Bind all samplers */
> > -  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
> > +  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
> >  
> >    cl_setup_scratch(gpgpu, ker);
> >    /* Bind a stack if needed */
> > diff --git a/src/cl_event.c b/src/cl_event.c
> > index 1dc02ae..028dfb6 100644
> > --- a/src/cl_event.c
> > +++ b/src/cl_event.c
> > @@ -48,6 +48,7 @@ cl_event_is_gpu_command_type(cl_command_type type)
> >  cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
> >  {
> >    cl_event event = NULL;
> > +  GET_QUEUE_THREAD_GPGPU(queue);
> >  
> >    /* Allocate and inialize the structure itself */
> >    TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
> > @@ -75,7 +76,7 @@ cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type ty
> >    else {
> >      event->status = CL_QUEUED;
> >      if(cl_event_is_gpu_command_type(event->type))
> > -      event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
> > +      event->gpgpu_event = cl_gpgpu_event_new(gpgpu);
> >    }
> >    cl_event_add_ref(event);       //dec when complete
> >    event->user_cb = NULL;
> > @@ -257,6 +258,7 @@ void cl_event_new_enqueue_callback(cl_event event,
> >    user_event *user_events, *u_ev;
> >    cl_command_queue queue = event->queue;
> >    cl_int i;
> > +  GET_QUEUE_THREAD_GPGPU(data->queue);
> >  
> >    /* Allocate and inialize the structure itself */
> >    TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
> > @@ -336,7 +338,7 @@ void cl_event_new_enqueue_callback(cl_event event,
> >      }
> >    }
> >    if(data->queue != NULL && event->gpgpu_event != NULL) {
> > -    cl_gpgpu_event_pending(data->queue->gpgpu, event->gpgpu_event);
> > +    cl_gpgpu_event_pending(gpgpu, event->gpgpu_event);
> >      data->ptr = (void *)event->gpgpu_event;
> >    }
> >    cb->data = *data;
> > diff --git a/src/cl_thread.c b/src/cl_thread.c
> > new file mode 100644
> > index 0000000..fbad5c5
> > --- /dev/null
> > +++ b/src/cl_thread.c
> > @@ -0,0 +1,72 @@
> > +/*
> > + * Copyright © 2012 Intel Corporation
> > + *
> > + * This library is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2 of the License, or (at your option) any later version.
> > + *
> > + * This library is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with this library. If not, see <http://www.gnu.org/licenses/>.
> > + *
> > + */
> > +
> > +#include "cl_thread.h"
> > +#include "cl_alloc.h"
> > +#include "cl_utils.h"
> > +
> > +cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
> > +{
> > +  pthread_key_t* key = queue->thread_data;
> > +  cl_gpgpu gpgpu = pthread_getspecific(*key);
> > +
> > +  if (!gpgpu) {
> > +    TRY_ALLOC_NO_ERR (gpgpu, cl_gpgpu_new(queue->ctx->drv));
> > +  }
> > +
> > +  if (pthread_setspecific(*key, gpgpu)) {
> > +    cl_gpgpu_delete(gpgpu);
> > +    goto error;
> > +  }
> > +
> > +exit:
> > +  return gpgpu;
> > +error:
> > +  pthread_setspecific(*key, NULL);
> > +  goto exit;
> > +}
> > +
> > +static void thread_data_destructor(void *data) {
> > +  cl_gpgpu gpgpu = (cl_gpgpu)data;
> > +  cl_gpgpu_delete(gpgpu);
> > +}
> > +
> > +/* Create the thread specific data. */
> > +void* cl_thread_data_create(void)
> > +{
> > +  int rc = 0;
> > +
> > +  pthread_key_t *thread_specific_key = CALLOC(pthread_key_t);
> > +  if (thread_specific_key == NULL)
> > +    return NULL;
> > +
> > +  rc = pthread_key_create(thread_specific_key, thread_data_destructor);
> > +
> > +  if (rc != 0)
> > +    return NULL;
> > +
> > +  return thread_specific_key;
> > +}
> > +
> > +/* The destructor for clean the thread specific data. */
> > +void cl_thread_data_destroy(void * data)
> > +{
> > +  pthread_key_t *thread_specific_key = (pthread_key_t *)data;
> > +  pthread_key_delete(*thread_specific_key);
> > +  cl_free(thread_specific_key);
> > +}
> > diff --git a/src/cl_thread.h b/src/cl_thread.h
> > new file mode 100644
> > index 0000000..65f1bcf
> > --- /dev/null
> > +++ b/src/cl_thread.h
> > @@ -0,0 +1,34 @@
> > +/*
> > + * Copyright © 2012 Intel Corporation
> > + *
> > + * This library is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2 of the License, or (at your option) any later version.
> > + *
> > + * This library is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with this library. If not, see <http://www.gnu.org/licenses/>.
> > + *
> > + */
> > +
> > +#ifndef __CL_THREAD_H__
> > +#define __CL_THREAD_H__
> > +
> > +#include <pthread.h>
> > +#include "cl_internals.h"
> > +#include "cl_command_queue.h"
> > +
> > +/* Create the thread specific data. */
> > +void* cl_thread_data_create(void);
> > +
> > +/* The destructor for clean the thread specific data. */
> > +void cl_thread_data_destroy(void * data);
> > +
> > +/* Used to get the gpgpu struct of each thread. */
> > +cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
> > +#endif /* __CL_THREAD_H__ */
> > diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
> > index c87cba8..5e0bc19 100644
> > --- a/utests/CMakeLists.txt
> > +++ b/utests/CMakeLists.txt
> > @@ -53,7 +53,7 @@ set (utests_sources
> >    compiler_if_else.cpp
> >    compiler_integer_division.cpp
> >    compiler_integer_remainder.cpp
> > -	compiler_insert_vector.cpp
> > +  compiler_insert_vector.cpp
> >    compiler_lower_return0.cpp
> >    compiler_lower_return1.cpp
> >    compiler_lower_return2.cpp
> > -- 
> > 1.7.9.5
> > 
> > 
> > 
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/beignet