[Beignet] [PATCH 5/6] Add the printf logic into the run time.
Zhigang Gong
zhigang.gong at linux.intel.com
Tue Jun 10 18:07:45 PDT 2014
On Tue, Jun 10, 2014 at 12:53:12PM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
>
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
> backend/src/gbe_bin_interpreter.cpp | 6 ++
> src/cl_command_queue.c | 14 +++++
> src/cl_command_queue_gen7.c | 24 ++++++++
> src/cl_driver.h | 28 ++++++++++
> src/cl_driver_defs.c | 7 +++
> src/cl_gbe_loader.cpp | 25 +++++++++
> src/intel/intel_gpgpu.c | 106 ++++++++++++++++++++++++++++++++++++
> 7 files changed, 210 insertions(+)
>
> diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
> index 64bf5c4..1945b5b 100644
> --- a/backend/src/gbe_bin_interpreter.cpp
> +++ b/backend/src/gbe_bin_interpreter.cpp
> @@ -21,6 +21,7 @@
> #include "sys/assert.cpp"
> #include "sys/platform.cpp"
> #include "ir/constant.cpp"
> +#include "ir/printf.cpp"
>
> #pragma GCC diagnostic ignored "-Wunused-function"
> #pragma GCC diagnostic ignored "-Wunused-variable"
> @@ -62,6 +63,11 @@ struct BinInterpCallBackInitializer
> gbe_kernel_get_image_data = gbe::kernelGetImageData;
> gbe_get_image_base_index = gbe::getImageBaseIndex;
> gbe_set_image_base_index = gbe::setImageBaseIndex;
> + gbe_get_printf_num = gbe::kernelGetPrintfNum;
> + gbe_dup_printfset = gbe::kernelDupPrintfSet;
> + gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
> + gbe_release_printf_info = gbe::kernelReleasePrintfSet;
> + gbe_output_printf = gbe::kernelOutputPrintf;
> }
>
> ~BinInterpCallBackInitializer() {
> diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> index a2109d7..9bfc23e 100644
> --- a/src/cl_command_queue.c
> +++ b/src/cl_command_queue.c
> @@ -420,9 +420,23 @@ LOCAL cl_int
> cl_command_queue_flush(cl_command_queue queue)
> {
> GET_QUEUE_THREAD_GPGPU(queue);
> + size_t global_wk_sz[3];
> + void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz);
>
> cl_gpgpu_flush(gpgpu);
>
> + if (printf_info && gbe_get_printf_num(printf_info)) {
> + void *index_addr = cl_gpgpu_map_printf_buffer(gpgpu, 0);
> + void *buf_addr = cl_gpgpu_map_printf_buffer(gpgpu, 1);
> + gbe_output_printf(printf_info, index_addr, buf_addr, global_wk_sz[0],
> + global_wk_sz[1], global_wk_sz[2]);
I have two general comments here:
The first is that, according to the spec:
6.12.13.1 printf output synchronization
When the event that is associated with a particular kernel invocation is completed, the output of
all printf() calls executed by this kernel invocation is flushed to the implementation-defined
output stream. Calling clFinish on a command queue flushes all pending output by printf in
previously enqueued and completed commands to the implementation-defined output stream.
So the correct time to call gbe_output_printf is in clFinish, or when the event associated with a specific kernel completes, rather than at flush time.
The second is about the output stream. I think it's better to add a lock here to make sure
that gbe_output_printf is never called concurrently from more than one thread.
> + cl_gpgpu_unmap_printf_buffer(gpgpu, 0);
> + cl_gpgpu_unmap_printf_buffer(gpgpu, 1);
> + gbe_release_printf_info(printf_info);
> + global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
> + cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
> + }
> +
> cl_invalid_thread_gpgpu(queue);
> return CL_SUCCESS;
> }
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index d875021..84bb41c 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -246,6 +246,19 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
> cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
> }
>
> +static void
> +cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num, size_t global_sz) {
> + int32_t value = GBE_CURBE_PRINTF_INDEX_POINTER;
> + int32_t offset = gbe_kernel_get_curbe_offset(ker->opaque, value, 0);
> + size_t buf_size = global_sz * sizeof(int) * printf_num;
> + cl_gpgpu_set_printf_buffer(gpgpu, 0, buf_size, offset);
> +
> + value = GBE_CURBE_PRINTF_BUF_POINTER;
> + offset = gbe_kernel_get_curbe_offset(ker->opaque, value, 0);
> + buf_size = gbe_get_printf_sizeof_size(printf_info) * global_sz;
> + cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset);
> +}
> +
> LOCAL cl_int
> cl_command_queue_ND_range_gen7(cl_command_queue queue,
> cl_kernel ker,
> @@ -263,7 +276,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
> size_t cst_sz = ker->curbe_sz= gbe_kernel_get_curbe_size(ker->opaque);
> int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
> size_t thread_n = 0u;
> + int printf_num = 0;
> cl_int err = CL_SUCCESS;
> + size_t global_size = global_wk_sz[0] * global_wk_sz[1] * global_wk_sz[2];
> + void* printf_info = NULL;
>
> /* Setup kernel */
> kernel.name = "KERNEL";
> @@ -291,12 +307,20 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
> }
> }
>
> + printf_info = gbe_dup_printfset(ker->opaque);
> + cl_gpgpu_set_printf_info(gpgpu, printf_info, (size_t *)global_wk_sz);
> +
> /* Setup the kernel */
> if (queue->props & CL_QUEUE_PROFILING_ENABLE)
> cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
> else
> cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);
>
> + printf_num = gbe_get_printf_num(printf_info);
> + if (printf_num) {
> + cl_bind_printf(gpgpu, ker, printf_info, printf_num, global_size);
> + }
> +
> /* Bind user buffers */
> cl_command_queue_bind_surface(queue, ker);
> /* Bind user images */
> diff --git a/src/cl_driver.h b/src/cl_driver.h
> index 3e01c92..771fcc2 100644
> --- a/src/cl_driver.h
> +++ b/src/cl_driver.h
> @@ -208,6 +208,34 @@ extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
> typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
> extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
>
> +/* Set the printf buffer */
> +typedef void (cl_gpgpu_set_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, uint32_t);
> +extern cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer;
> +
> +/* get the printf buffer offset in the apeture*/
> +typedef unsigned long (cl_gpgpu_reloc_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t);
> +extern cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer;
> +
> +/* map the printf buffer */
> +typedef void* (cl_gpgpu_map_printf_buffer_cb)(cl_gpgpu, uint32_t);
> +extern cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer;
> +
> +/* unmap the printf buffer */
> +typedef void (cl_gpgpu_unmap_printf_buffer_cb)(cl_gpgpu, uint32_t);
> +extern cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer;
> +
> +/* release the printf buffer */
> +typedef unsigned long (cl_gpgpu_release_printf_buffer_cb)(cl_gpgpu, uint32_t);
> +extern cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer;
> +
> +/* Set the last printfset pointer */
> +typedef void (cl_gpgpu_set_printf_info_cb)(cl_gpgpu, void *, size_t*);
> +extern cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info;
> +
> +/* Get the last printfset pointer */
> +typedef void* (cl_gpgpu_get_printf_info_cb)(cl_gpgpu, size_t*);
> +extern cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info;
> +
> /* Will spawn all threads */
> typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
> uint32_t simd_sz,
> diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
> index 95a1a03..0781dbf 100644
> --- a/src/cl_driver_defs.c
> +++ b/src/cl_driver_defs.c
> @@ -85,4 +85,11 @@ LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp =
> LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
> LOCAL cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf = NULL;
> LOCAL cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf = NULL;
> +LOCAL cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer = NULL;
> +LOCAL cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer = NULL;
> +LOCAL cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer = NULL;
> +LOCAL cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer = NULL;
> +LOCAL cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info = NULL;
> +LOCAL cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info = NULL;
> +LOCAL cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer = NULL;
>
> diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
> index b1b75d6..38f9ab6 100644
> --- a/src/cl_gbe_loader.cpp
> +++ b/src/cl_gbe_loader.cpp
> @@ -59,6 +59,11 @@ gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
> gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
> gbe_set_image_base_index_cb *gbe_set_image_base_index_interp = NULL;
> gbe_get_image_base_index_cb *gbe_get_image_base_index = NULL;
> +gbe_get_printf_num_cb* gbe_get_printf_num = NULL;
> +gbe_dup_printfset_cb* gbe_dup_printfset = NULL;
> +gbe_get_printf_sizeof_size_cb* gbe_get_printf_sizeof_size = NULL;
> +gbe_release_printf_info_cb* gbe_release_printf_info = NULL;
> +gbe_output_printf_cb* gbe_output_printf = NULL;
>
> struct GbeLoaderInitializer
> {
> @@ -200,6 +205,26 @@ struct GbeLoaderInitializer
> if (gbe_get_image_base_index == NULL)
> return false;
>
> + gbe_get_printf_num = *(gbe_get_printf_num_cb**)dlsym(dlhInterp, "gbe_get_printf_num");
> + if (gbe_get_printf_num == NULL)
> + return false;
> +
> + gbe_dup_printfset = *(gbe_dup_printfset_cb**)dlsym(dlhInterp, "gbe_dup_printfset");
> + if (gbe_dup_printfset == NULL)
> + return false;
> +
> + gbe_get_printf_sizeof_size = *(gbe_get_printf_sizeof_size_cb**)dlsym(dlhInterp, "gbe_get_printf_sizeof_size");
> + if (gbe_get_printf_sizeof_size == NULL)
> + return false;
> +
> + gbe_release_printf_info = *(gbe_release_printf_info_cb**)dlsym(dlhInterp, "gbe_release_printf_info");
> + if (gbe_release_printf_info == NULL)
> + return false;
> +
> + gbe_output_printf = *(gbe_output_printf_cb**)dlsym(dlhInterp, "gbe_output_printf");
> + if (gbe_output_printf == NULL)
> + return false;
> +
> return true;
> }
>
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
> index bde9bd5..fa6034b 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -78,6 +78,9 @@ enum {max_sampler_n = 16 };
> /* Handle GPGPU state */
> struct intel_gpgpu
> {
> + void* ker_opaque;
> + size_t global_wk_sz[3];
> + void* printf_info;
> intel_driver_t *drv;
> intel_batchbuffer_t *batch;
> cl_gpgpu_kernel *ker;
> @@ -97,6 +100,8 @@ struct intel_gpgpu
> struct { drm_intel_bo *bo; } scratch_b;
> struct { drm_intel_bo *bo; } constant_b;
> struct { drm_intel_bo *bo; } time_stamp_b; /* time stamp buffer */
> + struct { drm_intel_bo *bo;
> + drm_intel_bo *ibo;} printf_b; /* the printf buf and index buf*/
>
> struct { drm_intel_bo *bo; } aux_buf;
> struct {
> @@ -152,6 +157,10 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
> return;
> if(gpgpu->time_stamp_b.bo)
> drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
> + if(gpgpu->printf_b.bo)
> + drm_intel_bo_unreference(gpgpu->printf_b.bo);
> + if(gpgpu->printf_b.ibo)
> + drm_intel_bo_unreference(gpgpu->printf_b.ibo);
> if (gpgpu->aux_buf.bo)
> drm_intel_bo_unreference(gpgpu->aux_buf.bo);
> if (gpgpu->perf_b.bo)
> @@ -532,6 +541,13 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
> gpgpu->urb.size_cs_entry = size_cs_entry;
> gpgpu->max_threads = max_threads;
>
> + if (gpgpu->printf_b.ibo)
> + dri_bo_unreference(gpgpu->printf_b.ibo);
> + gpgpu->printf_b.ibo = NULL;
> + if (gpgpu->printf_b.bo)
> + dri_bo_unreference(gpgpu->printf_b.bo);
> + gpgpu->printf_b.bo = NULL;
> +
> /* Set the profile buffer*/
> if(gpgpu->time_stamp_b.bo)
> dri_bo_unreference(gpgpu->time_stamp_b.bo);
> @@ -1174,6 +1190,90 @@ intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
> drm_intel_gem_bo_unmap_gtt(event->ts_buf);
> }
>
> +static void
> +intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint32_t offset)
> +{
> + drm_intel_bo *bo = NULL;
> + if (i == 0) { // the index buffer.
> + if (gpgpu->printf_b.ibo)
> + dri_bo_unreference(gpgpu->printf_b.ibo);
> + gpgpu->printf_b.ibo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf index buffer", size, 4096);
> + bo = gpgpu->printf_b.ibo;
> + } else if (i == 1) {
> + if (gpgpu->printf_b.bo)
> + dri_bo_unreference(gpgpu->printf_b.bo);
> + gpgpu->printf_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf output buffer", size, 4096);
> + bo = gpgpu->printf_b.bo;
> + } else
> + assert(0);
> +
> + drm_intel_bo_map(bo, 1);
> + memset(bo->virtual, 0, size);
> + drm_intel_bo_unmap(bo);
> +
> + intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, 0);
> +}
> +
> +static void*
> +intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
> +{
> + drm_intel_bo *bo = NULL;
> + if (i == 0) {
> + bo = gpgpu->printf_b.ibo;
> + } else if (i == 1) {
> + bo = gpgpu->printf_b.bo;
> + } else
> + assert(0);
> +
> + drm_intel_bo_map(bo, 1);
> + return bo->virtual;
> +}
> +
> +static void
> +intel_gpgpu_unmap_printf_buf_addr(intel_gpgpu_t *gpgpu, uint32_t i)
> +{
> + drm_intel_bo *bo = NULL;
> + if (i == 0) {
> + bo = gpgpu->printf_b.ibo;
> + } else if (i == 1) {
> + bo = gpgpu->printf_b.bo;
> + } else
> + assert(0);
> +
> + drm_intel_bo_unmap(bo);
> +}
> +
> +static void
> +intel_gpgpu_release_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
> +{
> + if (i == 0) {
> + drm_intel_bo_unreference(gpgpu->printf_b.ibo);
> + gpgpu->printf_b.ibo = NULL;
> + } else if (i == 1) {
> + drm_intel_bo_unreference(gpgpu->printf_b.bo);
> + gpgpu->printf_b.bo = NULL;
> + } else
> + assert(0);
> +}
> +
> +static void
> +intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info, size_t * global_sz)
> +{
> + gpgpu->printf_info = printf_info;
> + gpgpu->global_wk_sz[0] = global_sz[0];
> + gpgpu->global_wk_sz[1] = global_sz[1];
> + gpgpu->global_wk_sz[2] = global_sz[2];
> +}
> +
> +static void*
> +intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu, size_t * global_sz)
> +{
> + global_sz[0] = gpgpu->global_wk_sz[0];
> + global_sz[1] = gpgpu->global_wk_sz[1];
> + global_sz[2] = gpgpu->global_wk_sz[2];
> + return gpgpu->printf_info;
> +}
> +
> LOCAL void
> intel_set_gpgpu_callbacks(int device_id)
> {
> @@ -1204,6 +1304,12 @@ intel_set_gpgpu_callbacks(int device_id)
> cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
> cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
> cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
> + cl_gpgpu_set_printf_buffer = (cl_gpgpu_set_printf_buffer_cb *)intel_gpgpu_set_printf_buf;
> + cl_gpgpu_map_printf_buffer = (cl_gpgpu_map_printf_buffer_cb *)intel_gpgpu_map_printf_buf;
> + cl_gpgpu_unmap_printf_buffer = (cl_gpgpu_unmap_printf_buffer_cb *)intel_gpgpu_unmap_printf_buf_addr;
> + cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf;
> + cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
> + cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
>
> if (IS_HASWELL(device_id)) {
> cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
> --
> 1.8.3.2
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list