[Beignet] [PATCH 4/5] runtime: set CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE to kernel's SIMD_WIDTH.
Zhigang Gong
zhigang.gong at intel.com
Thu Nov 12 00:47:04 PST 2015
It makes sense to set CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE to the
corresponding SIMD size. Then it provides a way for intel's OCL application
to get SIMD width at runtime and make some SIMD width dependant optimization
possible.
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
src/cl_api.c | 3 ++-
src/cl_command_queue_gen7.c | 2 +-
src/cl_device_id.c | 11 ++++++++++-
src/cl_device_id.h | 2 --
src/cl_gt_device.h | 1 -
5 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/src/cl_api.c b/src/cl_api.c
index a18bc99..64206eb 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -3001,6 +3001,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
err = cl_command_queue_flush(command_queue);
}
+error:
if(b_output_kernel_perf)
{
if(kernel->program->build_opts != NULL)
@@ -3008,7 +3009,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
else
time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue);
}
-error:
+
return err;
}
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 2edc3be..f0ee20a 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -329,7 +329,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Compute the number of HW threads we need */
if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz) != CL_SUCCESS)) {
- fprintf(stderr, "Beignet: Work group size exceed Kerne's work group size.\n");
+ fprintf(stderr, "Beignet: Work group size exceed Kernel's work group size.\n");
return err;
}
kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 4551aa8..8186ac8 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -966,7 +966,16 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
return CL_SUCCESS;
}
}
- DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device->preferred_wg_sz_mul)
+ case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
+ {
+ if (param_value && param_value_size < sizeof(size_t))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(size_t);
+ if (param_value)
+ *(size_t*)param_value = interp_kernel_get_simd_width(kernel->opaque);
+ return CL_SUCCESS;
+ }
case CL_KERNEL_LOCAL_MEM_SIZE:
{
size_t local_mem_sz = interp_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 4a923ef..c5f9e57 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -108,8 +108,6 @@ struct _cl_device_id {
size_t driver_version_sz;
size_t spir_versions_sz;
size_t built_in_kernels_sz;
- /* Kernel specific info that we're assigning statically */
- size_t preferred_wg_sz_mul;
/* SubDevice specific info */
cl_device_id parent_device;
cl_uint partition_max_sub_device;
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index de7a636..12987b7 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -39,7 +39,6 @@
.native_vector_width_float = 4,
.native_vector_width_double = 2,
.native_vector_width_half = 8,
-.preferred_wg_sz_mul = 16,
.address_bits = 32,
.max_mem_alloc_size = 512 * 1024 * 1024,
.image_support = CL_TRUE,
--
1.9.1
More information about the Beignet
mailing list