[Beignet] [PATCH 5/6] Add the printf logic into the run time.

Tue Jun 10 18:07:45 PDT 2014

On Tue, Jun 10, 2014 at 12:53:12PM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  backend/src/gbe_bin_interpreter.cpp |   6 ++
>  src/cl_command_queue.c              |  14 +++++
>  src/cl_command_queue_gen7.c         |  24 ++++++++
>  src/cl_driver.h                     |  28 ++++++++++
>  src/cl_driver_defs.c                |   7 +++
>  src/cl_gbe_loader.cpp               |  25 +++++++++
>  src/intel/intel_gpgpu.c             | 106 ++++++++++++++++++++++++++++++++++++
>  7 files changed, 210 insertions(+)
> 
> diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
> index 64bf5c4..1945b5b 100644
> --- a/backend/src/gbe_bin_interpreter.cpp
> +++ b/backend/src/gbe_bin_interpreter.cpp
> @@ -21,6 +21,7 @@
>  #include "sys/assert.cpp"
>  #include "sys/platform.cpp"
>  #include "ir/constant.cpp"
> +#include "ir/printf.cpp"
>  
>  #pragma GCC diagnostic ignored "-Wunused-function"
>  #pragma GCC diagnostic ignored "-Wunused-variable"
> @@ -62,6 +63,11 @@ struct BinInterpCallBackInitializer
>      gbe_kernel_get_image_data = gbe::kernelGetImageData;
>      gbe_get_image_base_index = gbe::getImageBaseIndex;
>      gbe_set_image_base_index = gbe::setImageBaseIndex;
> +    gbe_get_printf_num = gbe::kernelGetPrintfNum;
> +    gbe_dup_printfset = gbe::kernelDupPrintfSet;
> +    gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
> +    gbe_release_printf_info = gbe::kernelReleasePrintfSet;
> +    gbe_output_printf = gbe::kernelOutputPrintf;
>    }
>  
>    ~BinInterpCallBackInitializer() {
> diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> index a2109d7..9bfc23e 100644
> --- a/src/cl_command_queue.c
> +++ b/src/cl_command_queue.c
> @@ -420,9 +420,23 @@ LOCAL cl_int
>  cl_command_queue_flush(cl_command_queue queue)
>  {
>    GET_QUEUE_THREAD_GPGPU(queue);
> +  size_t global_wk_sz[3];
> +  void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz);
>  
>    cl_gpgpu_flush(gpgpu);
>  
> +  if (printf_info && gbe_get_printf_num(printf_info)) {
> +    void *index_addr = cl_gpgpu_map_printf_buffer(gpgpu, 0);
> +    void *buf_addr = cl_gpgpu_map_printf_buffer(gpgpu, 1);
> +    gbe_output_printf(printf_info, index_addr, buf_addr, global_wk_sz[0],
> +                      global_wk_sz[1], global_wk_sz[2]);

I have two general comments here:

The first is that, according to the spec:
  6.12.13.1 printf output synchronization
    When the event that is associated with a particular kernel invocation is completed, the output of
    all printf() calls executed by this kernel invocation is flushed to the implementation-defined
    output stream. Calling clFinish on a command queue flushes all pending output by printf in
    previously enqueued and completed commands to the implementation-defined output stream.

  The correct time to call gbe_output_printf is in clFinish, or when the event associated with a specific kernel completes.

The second is about the output stream. I think it would be better to add a lock here to make sure
that gbe_output_printf is never called concurrently from more than one thread.

> +    cl_gpgpu_unmap_printf_buffer(gpgpu, 0);
> +    cl_gpgpu_unmap_printf_buffer(gpgpu, 1);
> +    gbe_release_printf_info(printf_info);
> +    global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
> +    cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
> +  }
> +
>    cl_invalid_thread_gpgpu(queue);
>    return CL_SUCCESS;
>  }
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index d875021..84bb41c 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -246,6 +246,19 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
>    cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
>  }
>  
> +static void
> +cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num, size_t global_sz) {
> +  int32_t value = GBE_CURBE_PRINTF_INDEX_POINTER;
> +  int32_t offset = gbe_kernel_get_curbe_offset(ker->opaque, value, 0);
> +  size_t buf_size = global_sz * sizeof(int) * printf_num;
> +  cl_gpgpu_set_printf_buffer(gpgpu, 0, buf_size, offset);
> +
> +  value = GBE_CURBE_PRINTF_BUF_POINTER;
> +  offset = gbe_kernel_get_curbe_offset(ker->opaque, value, 0);
> +  buf_size = gbe_get_printf_sizeof_size(printf_info) * global_sz;
> +  cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset);
> +}
> +
>  LOCAL cl_int
>  cl_command_queue_ND_range_gen7(cl_command_queue queue,
>                                 cl_kernel ker,
> @@ -263,7 +276,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>    size_t cst_sz = ker->curbe_sz= gbe_kernel_get_curbe_size(ker->opaque);
>    int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
>    size_t thread_n = 0u;
> +  int printf_num = 0;
>    cl_int err = CL_SUCCESS;
> +  size_t global_size = global_wk_sz[0] * global_wk_sz[1] * global_wk_sz[2];
> +  void* printf_info = NULL;
>  
>    /* Setup kernel */
>    kernel.name = "KERNEL";
> @@ -291,12 +307,20 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>      }
>    }
>  
> +  printf_info = gbe_dup_printfset(ker->opaque);
> +  cl_gpgpu_set_printf_info(gpgpu, printf_info, (size_t *)global_wk_sz);
> +
>    /* Setup the kernel */
>    if (queue->props & CL_QUEUE_PROFILING_ENABLE)
>      cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
>    else
>      cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);
>  
> +  printf_num = gbe_get_printf_num(printf_info);
> +  if (printf_num) {
> +    cl_bind_printf(gpgpu, ker, printf_info, printf_num, global_size);
> +  }
> +
>    /* Bind user buffers */
>    cl_command_queue_bind_surface(queue, ker);
>    /* Bind user images */
> diff --git a/src/cl_driver.h b/src/cl_driver.h
> index 3e01c92..771fcc2 100644
> --- a/src/cl_driver.h
> +++ b/src/cl_driver.h
> @@ -208,6 +208,34 @@ extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
>  typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
>  extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
>  
> +/* Set the printf buffer */
> +typedef void (cl_gpgpu_set_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, uint32_t);
> +extern cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer;
> +
> +/* get the printf buffer offset in the apeture*/
> +typedef unsigned long (cl_gpgpu_reloc_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t);
> +extern cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer;
> +
> +/* map the printf buffer */
> +typedef void* (cl_gpgpu_map_printf_buffer_cb)(cl_gpgpu, uint32_t);
> +extern cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer;
> +
> +/* unmap the printf buffer */
> +typedef void (cl_gpgpu_unmap_printf_buffer_cb)(cl_gpgpu, uint32_t);
> +extern cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer;
> +
> +/* release the printf buffer */
> +typedef unsigned long (cl_gpgpu_release_printf_buffer_cb)(cl_gpgpu, uint32_t);
> +extern cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer;
> +
> +/* Set the last printfset pointer */
> +typedef void (cl_gpgpu_set_printf_info_cb)(cl_gpgpu, void *, size_t*);
> +extern cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info;
> +
> +/* Get the last printfset pointer */
> +typedef void* (cl_gpgpu_get_printf_info_cb)(cl_gpgpu, size_t*);
> +extern cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info;
> +
>  /* Will spawn all threads */
>  typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
>                                    uint32_t simd_sz,
> diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
> index 95a1a03..0781dbf 100644
> --- a/src/cl_driver_defs.c
> +++ b/src/cl_driver_defs.c
> @@ -85,4 +85,11 @@ LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp =
>  LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
>  LOCAL cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf = NULL;
>  LOCAL cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf = NULL;
> +LOCAL cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer = NULL;
> +LOCAL cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer = NULL;
> +LOCAL cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer = NULL;
> +LOCAL cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer = NULL;
> +LOCAL cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info = NULL;
> +LOCAL cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info = NULL;
> +LOCAL cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer = NULL;
>  
> diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
> index b1b75d6..38f9ab6 100644
> --- a/src/cl_gbe_loader.cpp
> +++ b/src/cl_gbe_loader.cpp
> @@ -59,6 +59,11 @@ gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
>  gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
>  gbe_set_image_base_index_cb *gbe_set_image_base_index_interp = NULL;
>  gbe_get_image_base_index_cb *gbe_get_image_base_index = NULL;
> +gbe_get_printf_num_cb* gbe_get_printf_num = NULL;
> +gbe_dup_printfset_cb* gbe_dup_printfset = NULL;
> +gbe_get_printf_sizeof_size_cb* gbe_get_printf_sizeof_size = NULL;
> +gbe_release_printf_info_cb* gbe_release_printf_info = NULL;
> +gbe_output_printf_cb* gbe_output_printf = NULL;
>  
>  struct GbeLoaderInitializer
>  {
> @@ -200,6 +205,26 @@ struct GbeLoaderInitializer
>      if (gbe_get_image_base_index == NULL)
>        return false;
>  
> +    gbe_get_printf_num = *(gbe_get_printf_num_cb**)dlsym(dlhInterp, "gbe_get_printf_num");
> +    if (gbe_get_printf_num == NULL)
> +      return false;
> +
> +    gbe_dup_printfset = *(gbe_dup_printfset_cb**)dlsym(dlhInterp, "gbe_dup_printfset");
> +    if (gbe_dup_printfset == NULL)
> +      return false;
> +
> +    gbe_get_printf_sizeof_size = *(gbe_get_printf_sizeof_size_cb**)dlsym(dlhInterp, "gbe_get_printf_sizeof_size");
> +    if (gbe_get_printf_sizeof_size == NULL)
> +      return false;
> +
> +    gbe_release_printf_info = *(gbe_release_printf_info_cb**)dlsym(dlhInterp, "gbe_release_printf_info");
> +    if (gbe_release_printf_info == NULL)
> +      return false;
> +
> +    gbe_output_printf = *(gbe_output_printf_cb**)dlsym(dlhInterp, "gbe_output_printf");
> +    if (gbe_output_printf == NULL)
> +      return false;
> +
>      return true;
>    }
>  
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
> index bde9bd5..fa6034b 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -78,6 +78,9 @@ enum {max_sampler_n = 16 };
>  /* Handle GPGPU state */
>  struct intel_gpgpu
>  {
> +  void* ker_opaque;
> +  size_t global_wk_sz[3];
> +  void* printf_info;
>    intel_driver_t *drv;
>    intel_batchbuffer_t *batch;
>    cl_gpgpu_kernel *ker;
> @@ -97,6 +100,8 @@ struct intel_gpgpu
>    struct { drm_intel_bo *bo; } scratch_b;
>    struct { drm_intel_bo *bo; } constant_b;
>    struct { drm_intel_bo *bo; } time_stamp_b;  /* time stamp buffer */
> +  struct { drm_intel_bo *bo;
> +           drm_intel_bo *ibo;} printf_b;      /* the printf buf and index buf*/
>  
>    struct { drm_intel_bo *bo; } aux_buf;
>    struct {
> @@ -152,6 +157,10 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
>      return;
>    if(gpgpu->time_stamp_b.bo)
>      drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
> +  if(gpgpu->printf_b.bo)
> +    drm_intel_bo_unreference(gpgpu->printf_b.bo);
> +  if(gpgpu->printf_b.ibo)
> +    drm_intel_bo_unreference(gpgpu->printf_b.ibo);
>    if (gpgpu->aux_buf.bo)
>      drm_intel_bo_unreference(gpgpu->aux_buf.bo);
>    if (gpgpu->perf_b.bo)
> @@ -532,6 +541,13 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
>    gpgpu->urb.size_cs_entry = size_cs_entry;
>    gpgpu->max_threads = max_threads;
>  
> +  if (gpgpu->printf_b.ibo)
> +    dri_bo_unreference(gpgpu->printf_b.ibo);
> +  gpgpu->printf_b.ibo = NULL;
> +  if (gpgpu->printf_b.bo)
> +    dri_bo_unreference(gpgpu->printf_b.bo);
> +  gpgpu->printf_b.bo = NULL;
> +
>    /* Set the profile buffer*/
>    if(gpgpu->time_stamp_b.bo)
>      dri_bo_unreference(gpgpu->time_stamp_b.bo);
> @@ -1174,6 +1190,90 @@ intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
>    drm_intel_gem_bo_unmap_gtt(event->ts_buf);
>  }
>  
> +static void
> +intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint32_t offset)
> +{
> +  drm_intel_bo *bo = NULL;
> +  if (i == 0) { // the index buffer.
> +    if (gpgpu->printf_b.ibo)
> +      dri_bo_unreference(gpgpu->printf_b.ibo);
> +    gpgpu->printf_b.ibo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf index buffer", size, 4096);
> +    bo = gpgpu->printf_b.ibo;
> +  } else if (i == 1) {
> +    if (gpgpu->printf_b.bo)
> +      dri_bo_unreference(gpgpu->printf_b.bo);
> +    gpgpu->printf_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf output buffer", size, 4096);
> +    bo = gpgpu->printf_b.bo;
> +  } else
> +    assert(0);
> +
> +  drm_intel_bo_map(bo, 1);
> +  memset(bo->virtual, 0, size);
> +  drm_intel_bo_unmap(bo);
> +
> +  intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, 0);
> +}
> +
> +static void*
> +intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
> +{
> +  drm_intel_bo *bo = NULL;
> +  if (i == 0) {
> +    bo = gpgpu->printf_b.ibo;
> +  } else if (i == 1) {
> +    bo = gpgpu->printf_b.bo;
> +  } else
> +    assert(0);
> +
> +  drm_intel_bo_map(bo, 1);
> +  return bo->virtual;
> +}
> +
> +static void
> +intel_gpgpu_unmap_printf_buf_addr(intel_gpgpu_t *gpgpu, uint32_t i)
> +{
> +  drm_intel_bo *bo = NULL;
> +  if (i == 0) {
> +    bo = gpgpu->printf_b.ibo;
> +  } else if (i == 1) {
> +    bo = gpgpu->printf_b.bo;
> +  } else
> +  assert(0);
> +
> +  drm_intel_bo_unmap(bo);
> +}
> +
> +static void
> +intel_gpgpu_release_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
> +{
> +  if (i == 0) {
> +    drm_intel_bo_unreference(gpgpu->printf_b.ibo);
> +    gpgpu->printf_b.ibo = NULL;
> +  } else if (i == 1) {
> +    drm_intel_bo_unreference(gpgpu->printf_b.bo);
> +    gpgpu->printf_b.bo = NULL;
> +  } else
> +    assert(0);
> +}
> +
> +static void
> +intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info, size_t * global_sz)
> +{
> +  gpgpu->printf_info = printf_info;
> +  gpgpu->global_wk_sz[0] = global_sz[0];
> +  gpgpu->global_wk_sz[1] = global_sz[1];
> +  gpgpu->global_wk_sz[2] = global_sz[2];
> +}
> +
> +static void*
> +intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu, size_t * global_sz)
> +{
> +  global_sz[0] = gpgpu->global_wk_sz[0];
> +  global_sz[1] = gpgpu->global_wk_sz[1];
> +  global_sz[2] = gpgpu->global_wk_sz[2];
> +  return gpgpu->printf_info;
> +}
> +
>  LOCAL void
>  intel_set_gpgpu_callbacks(int device_id)
>  {
> @@ -1204,6 +1304,12 @@ intel_set_gpgpu_callbacks(int device_id)
>    cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
>    cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
>    cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
> +  cl_gpgpu_set_printf_buffer = (cl_gpgpu_set_printf_buffer_cb *)intel_gpgpu_set_printf_buf;
> +  cl_gpgpu_map_printf_buffer = (cl_gpgpu_map_printf_buffer_cb *)intel_gpgpu_map_printf_buf;
> +  cl_gpgpu_unmap_printf_buffer = (cl_gpgpu_unmap_printf_buffer_cb *)intel_gpgpu_unmap_printf_buf_addr;
> +  cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf;
> +  cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
> +  cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
>  
>    if (IS_HASWELL(device_id)) {
>      cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
> -- 
> 1.8.3.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet