[Beignet] [PATCH OCL20 03/11] OCL20: Implement clSetKernelExecInfo api
Xiuli Pan
xiuli.pan at intel.com
Wed Mar 2 03:01:56 UTC 2016
From: Yang Rong <rong.r.yang at intel.com>
The extra exec info needs relocation; otherwise the GPU can't read/write it.
It does not need to be set in the curbe,
so relocate it into the unused binding table entries.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
src/cl_api.c | 31 +++++++++++++++++++++++++++++--
src/cl_command_queue.c | 38 +++++++++++++++++++++++++++++++++-----
src/cl_command_queue.h | 7 +++++--
src/cl_command_queue_gen7.c | 7 +++++--
src/cl_kernel.c | 22 ++++++++++++++++++++++
src/cl_kernel.h | 6 +++++-
src/cl_khr_icd.c | 1 +
src/intel/intel_gpgpu.c | 10 ++++++----
8 files changed, 106 insertions(+), 16 deletions(-)
diff --git a/src/cl_api.c b/src/cl_api.c
index a8a4056..3867261 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -659,7 +659,7 @@ error:
}
void
- clSVMFree (cl_context context, void* svm_pointer)
+clSVMFree (cl_context context, void* svm_pointer)
{
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
@@ -1548,7 +1548,7 @@ error:
}
cl_int
-clSetKernelArgSVMPointer (cl_kernel kernel,
+clSetKernelArgSVMPointer(cl_kernel kernel,
cl_uint arg_index,
const void *arg_value)
{
@@ -1559,6 +1559,33 @@ clSetKernelArgSVMPointer (cl_kernel kernel,
error:
return err;
}
+cl_int
+clSetKernelExecInfo(cl_kernel kernel,
+ cl_kernel_exec_info param_name,
+ size_t param_value_size,
+ const void *param_value)
+{
+
+ cl_int err = CL_SUCCESS;
+ CHECK_KERNEL(kernel);
+
+ if((param_name != CL_KERNEL_EXEC_INFO_SVM_PTRS &&
+ param_name != CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM) ||
+ param_value == NULL || param_value_size == 0) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(param_name == CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM &&
+ *(cl_bool *)param_value == CL_TRUE) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ err = cl_kernel_set_exec_info(kernel, param_value_size, param_value);
+error:
+ return err;
+}
cl_int clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_index, cl_kernel_arg_info param_name,
size_t param_value_size, void *param_value, size_t *param_value_size_ret)
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index e163d73..00f758d 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -123,7 +123,7 @@ set_image_info(char *curbe,
}
LOCAL cl_int
-cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
+cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k, uint32_t *max_bti)
{
uint32_t i;
GET_QUEUE_THREAD_GPGPU(queue);
@@ -135,6 +135,8 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
image = cl_mem_image(k->args[id].mem);
set_image_info(k->curbe, &k->images[i], image);
+ if(*max_bti < k->images[i].idx)
+ *max_bti = k->images[i].idx;
cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
image->intel_fmt, image->image_type, image->bpp,
image->w, image->h, image->depth,
@@ -151,12 +153,12 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
}
LOCAL cl_int
-cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
+cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, uint32_t *max_bti)
{
GET_QUEUE_THREAD_GPGPU(queue);
/* Bind all user buffers (given by clSetKernelArg) */
- uint32_t i;
+ uint32_t i, bti;
enum gbe_arg_type arg_type; /* kind of argument */
for (i = 0; i < k->arg_n; ++i) {
int32_t offset; // location of the address in the curbe
@@ -166,15 +168,41 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
if (offset < 0)
continue;
+ bti = interp_kernel_get_arg_bti(k->opaque, i);
+ if(*max_bti < bti)
+ *max_bti = bti;
if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + buffer->sub_offset, k->args[i].mem->size, bti);
} else {
size_t mem_offset = 0; //
if(k->args[i].is_svm) {
mem_offset = (size_t)k->args[i].ptr - (size_t)k->args[i].mem->host_ptr;
}
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + mem_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + mem_offset, k->args[i].mem->size, bti);
+ }
+ }
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, uint32_t max_bti)
+{
+ uint32_t i;
+ size_t mem_offset, bti = max_bti;
+ cl_mem svm_mem;
+
+ GET_QUEUE_THREAD_GPGPU(queue);
+
+ for (i = 0; i < k->exec_info_n; i++) {
+ void *ptr = k->exec_info[i];
+ if((svm_mem = cl_context_get_svm_from_ptr(k->program->ctx, ptr)) != NULL) {
+ mem_offset = (size_t)ptr - (size_t)svm_mem->host_ptr;
+ /* only need reloc in surface state, don't need reloc in curbe */
+ cl_gpgpu_bind_buf(gpgpu, svm_mem->bo, -1, svm_mem->offset + mem_offset, svm_mem->size, bti++);
+ if(bti == BTI_WORKAROUND_IMAGE_OFFSET)
+ bti = max_bti + BTI_WORKAROUND_IMAGE_OFFSET;
+ assert(bti < BTI_MAX_ID);
}
}
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 2cd6739..bdf1a43 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -84,10 +84,13 @@ extern int cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
extern cl_int cl_command_queue_finish(cl_command_queue);
/* Bind all the surfaces in the GPGPU state */
-extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
+extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel, uint32_t *);
/* Bind all the image surfaces in the GPGPU state */
-extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
+extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel, uint32_t *);
+
+/* Bind all exec info to bind table */
+extern cl_int cl_command_queue_bind_exec_info(cl_command_queue, cl_kernel, uint32_t);
/* Insert a user event to command's wait_events */
extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 63cac6e..38cf56b 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -350,6 +350,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
cl_int err = CL_SUCCESS;
size_t global_size = global_wk_sz[0] * global_wk_sz[1] * global_wk_sz[2];
void* printf_info = NULL;
+ uint32_t max_bti = 0;
/* Setup kernel */
kernel.name = "KERNEL";
@@ -397,9 +398,11 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
}
/* Bind user buffers */
- cl_command_queue_bind_surface(queue, ker);
+ cl_command_queue_bind_surface(queue, ker, &max_bti);
/* Bind user images */
- cl_command_queue_bind_image(queue, ker);
+ cl_command_queue_bind_image(queue, ker, &max_bti);
+ /* Bind all exec infos */
+ cl_command_queue_bind_exec_info(queue, ker, max_bti);
/* Bind all samplers */
cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 723eac3..e67e442 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -57,6 +57,8 @@ cl_kernel_delete(cl_kernel k)
}
if (k->image_sz)
cl_free(k->images);
+ if (k->exec_info)
+ cl_free(k->exec_info);
k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
cl_free(k);
}
@@ -254,6 +256,21 @@ cl_kernel_set_arg_svm_pointer(cl_kernel k, cl_uint index, const void *value)
return 0;
}
+LOCAL cl_int
+cl_kernel_set_exec_info(cl_kernel k, size_t n, const void *value)
+{
+ cl_int err = CL_SUCCESS;
+ assert(k != NULL);
+
+ if (n == 0) return err;
+ TRY_ALLOC(k->exec_info, cl_calloc(n, 1));
+ memcpy(k->exec_info, value, n);
+ k->exec_info_n = n / sizeof(void *);
+
+error:
+ return err;
+}
+
LOCAL int
cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
size_t param_value_size, void *param_value, size_t *param_value_size_ret)
@@ -410,6 +427,7 @@ cl_kernel_dup(cl_kernel from)
to->curbe_sz = from->curbe_sz;
to->sampler_sz = from->sampler_sz;
to->image_sz = from->image_sz;
+ to->exec_info_n = from->exec_info_n;
memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
to->stack_size = from->stack_size;
if (to->sampler_sz)
@@ -419,6 +437,10 @@ cl_kernel_dup(cl_kernel from)
memcpy(to->images, from->images, to->image_sz * sizeof(to->images[0]));
} else
to->images = NULL;
+ if (to->exec_info_n) { /* Must always 0 here */
+ TRY_ALLOC_NO_ERR(to->exec_info, cl_calloc(to->exec_info_n, sizeof(void *)));
+ memcpy(to->exec_info, from->exec_info, to->exec_info_n * sizeof(void *));
+ }
TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 5b3294b..87187bc 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -67,6 +67,8 @@ struct _cl_kernel {
cl_argument *args; /* To track argument setting */
uint32_t arg_n:31; /* Number of arguments */
uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
+ uint32_t exec_info_n; /* The kernel's exec info count */
+ void** exec_info; /* The kernel's exec info */
};
/* Allocate an empty kernel */
@@ -103,7 +105,9 @@ extern int cl_kernel_set_arg(cl_kernel,
extern int cl_kernel_set_arg_svm_pointer(cl_kernel,
uint32_t arg_index,
const void *arg_value);
-
+extern cl_int cl_kernel_set_exec_info(cl_kernel k,
+ size_t n,
+ const void *value);
/* Get the argument information */
extern int cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index,
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
index 3fec069..1e63b73 100644
--- a/src/cl_khr_icd.c
+++ b/src/cl_khr_icd.c
@@ -188,6 +188,7 @@ struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
(void *) clEnqueueSVMUnmap,
(void *) NULL /* clCreateSamplerWithProperties */,
clSetKernelArgSVMPointer,
+ clSetKernelExecInfo,
#endif
};
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 48396e0..ffdd122 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1439,10 +1439,12 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
uint32_t internal_offset, size_t size, uint8_t bti)
{
assert(gpgpu->binded_n < max_buf_n);
- gpgpu->binded_buf[gpgpu->binded_n] = buf;
- gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
- gpgpu->binded_offset[gpgpu->binded_n] = offset;
- gpgpu->binded_n++;
+ if(offset != -1) {
+ gpgpu->binded_buf[gpgpu->binded_n] = buf;
+ gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
+ gpgpu->binded_offset[gpgpu->binded_n] = offset;
+ gpgpu->binded_n++;
+ }
intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti, I965_SURFACEFORMAT_RAW);
}
--
2.5.0
More information about the Beignet
mailing list