[Beignet] [PATCH v3] runtime: fix max group size calculation issue.
Allan MacKinnon
allanmac at pixel.io
Mon Jun 30 22:00:02 PDT 2014
Does this mean Beignet can launch 560-work-item work groups (7 hardware
threads * 10 EUs * SIMD8) on Haswell, as requested here:
https://software.intel.com/en-us/forums/topic/516373 ?
On Mon, Jun 30, 2014 at 7:32 PM, Song, Ruiling <ruiling.song at intel.com>
wrote:
> Looks good to me
>
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Tuesday, July 01, 2014 8:31 AM
> To: beignet at lists.freedesktop.org
> Cc: Zhigang Gong
> Subject: [Beignet] [PATCH v3] runtime: fix max group size calculation
> issue.
>
> From: Zhigang Gong <zhigang.gong at linux.intel.com>
>
> If the kernel doesn't use slm/barrier, there is no hard limitation for the
> max group size. And if the max work group size is more than 1024, the
> original 64 urb entry count will not be sufficient to hold all the curbe
> payload. Change the entry count to max thread count to fix this potential
> issue.
>
> I found this bug when I tried to run phoronix test suite's juliagpu test
> case on my MBA.
>
> v2:
> refine the max kernel work group size calculation mechanism.
> the wg_sz should not be a device's member variable; it should be a
> variable derived from the kernel's and device's attributes at runtime.
> also fix wrong configuration for IVB GT1.
>
> v3:
> Add an important max thread limitation in the GPGPU_WALKER command.
> For non-Baytrail, the max thread depth * max thread height * max thread
> width must not exceed 64 (under either simd16 or simd8), no matter whether
> SLM/barrier is used. We overlooked that limitation before; thus a
> simd8 kernel which uses work group size 1024 will exceed this limitation and
> half of the threads will not be executed at all.
>
> Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
> ---
> src/cl_command_queue_gen7.c | 6 -----
> src/cl_device_id.c | 63
> +++++++++++++++++++++++++++++++--------------
> src/cl_device_id.h | 12 ++++-----
> src/cl_kernel.c | 2 +-
> src/intel/intel_gpgpu.c | 3 ++-
> 5 files changed, 53 insertions(+), 33 deletions(-)
>
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index 9af4829..5b80d74 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -304,12 +304,6 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
> kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
> kernel.curbe_sz = cst_sz;
>
> - /* Barrier and SLM must fit into a single half slice */
> - if(kernel.use_slm > 0 && simd_sz == 8 && local_sz >
> MAX_GROUP_SIZE_IN_HALFSLICE){
> - fprintf(stderr, "Beignet: Work group CAN NOT large than %d when using
> barrier or local momery.\n", MAX_GROUP_SIZE_IN_HALFSLICE);
> - return CL_OUT_OF_RESOURCES;
> - }
> -
> if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
> fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz);
> return CL_OUT_OF_RESOURCES;
> diff --git a/src/cl_device_id.c b/src/cl_device_id.c index
> 0263f02..4db580e 100644
> --- a/src/cl_device_id.c
> +++ b/src/cl_device_id.c
> @@ -40,21 +40,19 @@ static struct _cl_device_id intel_ivb_gt2_device = {
> INIT_ICD(dispatch)
> .max_compute_unit = 16,
> .max_thread_per_unit = 8,
> - .max_work_item_sizes = {512, 512, 512},
> + .max_work_item_sizes = {1024, 1024, 1024},
> .max_work_group_size = 1024,
> .max_clock_frequency = 1000,
> - .wg_sz = 1024,
> #include "cl_gen7_device.h"
> };
>
> static struct _cl_device_id intel_ivb_gt1_device = {
> INIT_ICD(dispatch)
> - .max_compute_unit = 8,
> - .max_thread_per_unit = 8,
> + .max_compute_unit = 6,
> + .max_thread_per_unit = 6,
> .max_work_item_sizes = {512, 512, 512},
> .max_work_group_size = 512,
> .max_clock_frequency = 1000,
> - .wg_sz = 512,
> #include "cl_gen7_device.h"
> };
>
> @@ -63,9 +61,8 @@ static struct _cl_device_id intel_baytrail_t_device = {
> .max_compute_unit = 4,
> .max_thread_per_unit = 8,
> .max_work_item_sizes = {512, 512, 512},
> - .max_work_group_size = 256,
> + .max_work_group_size = 512,
> .max_clock_frequency = 1000,
> - .wg_sz = 256,
> #include "cl_gen7_device.h"
> };
>
> @@ -74,10 +71,9 @@ static struct _cl_device_id intel_hsw_gt1_device = {
> INIT_ICD(dispatch)
> .max_compute_unit = 10,
> .max_thread_per_unit = 7,
> - .max_work_item_sizes = {512, 512, 512},
> - .max_work_group_size = 512,
> + .max_work_item_sizes = {1024, 1024, 1024}, .max_work_group_size =
> + 1024,
> .max_clock_frequency = 1000,
> - .wg_sz = 512,
> #include "cl_gen75_device.h"
> };
>
> @@ -85,10 +81,9 @@ static struct _cl_device_id intel_hsw_gt2_device = {
> INIT_ICD(dispatch)
> .max_compute_unit = 20,
> .max_thread_per_unit = 7,
> - .max_work_item_sizes = {512, 512, 512},
> + .max_work_item_sizes = {1024, 1024, 1024},
> .max_work_group_size = 1024,
> .max_clock_frequency = 1000,
> - .wg_sz = 1024,
> #include "cl_gen75_device.h"
> };
>
> @@ -96,10 +91,9 @@ static struct _cl_device_id intel_hsw_gt3_device = {
> INIT_ICD(dispatch)
> .max_compute_unit = 40,
> .max_thread_per_unit = 7,
> - .max_work_item_sizes = {512, 512, 512},
> + .max_work_item_sizes = {1024, 1024, 1024},
> .max_work_group_size = 1024,
> .max_clock_frequency = 1000,
> - .wg_sz = 2048,
> #include "cl_gen75_device.h"
> };
>
> @@ -465,6 +459,26 @@ cl_device_get_version(cl_device_id device, cl_int
> *ver)
> _DECL_FIELD(FIELD)
>
> #include "cl_kernel.h"
> +#include "cl_program.h"
> +
> +LOCAL size_t
> +cl_get_kernel_max_wg_sz(cl_kernel kernel) {
> + size_t work_group_size;
> + int simd_width = interp_kernel_get_simd_width(kernel->opaque);
> + int vendor_id = kernel->program->ctx->device->vendor_id;
> + if (!interp_kernel_use_slm(kernel->opaque)) {
> + if (!IS_BAYTRAIL_T(vendor_id) || simd_width == 16)
> + work_group_size = simd_width * 64;
> + else
> + work_group_size = kernel->program->ctx->device->max_compute_unit *
> +
> +kernel->program->ctx->device->max_thread_per_unit * simd_width;
> + } else
> + work_group_size = kernel->program->ctx->device->max_work_group_size /
> + (16 / simd_width);
> + return work_group_size;
> +}
> +
> LOCAL cl_int
> cl_get_kernel_workgroup_info(cl_kernel kernel,
> cl_device_id device, @@ -484,13 +498,24 @@
> cl_get_kernel_workgroup_info(cl_kernel kernel,
>
> CHECK_KERNEL(kernel);
> switch (param_name) {
> - DECL_FIELD(WORK_GROUP_SIZE, device->wg_sz)
> + case CL_KERNEL_WORK_GROUP_SIZE:
> + {
> + if (param_value && param_value_size < sizeof(size_t))
> + return CL_INVALID_VALUE;
> + if (param_value_size_ret != NULL)
> + *param_value_size_ret = sizeof(size_t);
> + if (param_value) {
> + size_t work_group_size = cl_get_kernel_max_wg_sz(kernel);
> + *(size_t*)param_value = work_group_size;
> + return CL_SUCCESS;
> + }
> + }
> DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
> device->preferred_wg_sz_mul)
> case CL_KERNEL_LOCAL_MEM_SIZE:
> - {
> - size_t local_mem_sz = interp_kernel_get_slm_size(kernel->opaque)
> + kernel->local_mem_sz;
> - _DECL_FIELD(local_mem_sz)
> - }
> + {
> + size_t local_mem_sz = interp_kernel_get_slm_size(kernel->opaque) +
> kernel->local_mem_sz;
> + _DECL_FIELD(local_mem_sz)
> + }
> DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
> DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
> default:
> diff --git a/src/cl_device_id.h b/src/cl_device_id.h index
> 422ef37..c4f8227 100644
> --- a/src/cl_device_id.h
> +++ b/src/cl_device_id.h
> @@ -25,11 +25,11 @@ struct _cl_device_id {
> DEFINE_ICD(dispatch)
> cl_device_type device_type;
> cl_uint vendor_id;
> - cl_uint max_compute_unit;
> - cl_uint max_thread_per_unit;
> - cl_uint max_work_item_dimensions;
> - size_t max_work_item_sizes[3];
> - size_t max_work_group_size;
> + cl_uint max_compute_unit; // maximum EU number
> + cl_uint max_thread_per_unit; // maximum EU threads per EU.
> + cl_uint max_work_item_dimensions; // should be 3.
> + size_t max_work_item_sizes[3]; // equal to maximum work group
> size.
> + size_t max_work_group_size; // maximum work group size
> under simd16 mode.
> cl_uint preferred_vector_width_char;
> cl_uint preferred_vector_width_short;
> cl_uint preferred_vector_width_int;
> @@ -101,7 +101,6 @@ struct _cl_device_id {
> size_t driver_version_sz;
> size_t built_in_kernels_sz;
> /* Kernel specific info that we're assigning statically */
> - size_t wg_sz;
> size_t preferred_wg_sz_mul;
> /* SubDevice specific info */
> cl_device_id parent_device;
> @@ -137,6 +136,7 @@ extern cl_int cl_get_kernel_workgroup_info(cl_kernel
> kernel,
> size_t *
> param_value_size_ret);
> /* Returns the Gen device ID */
> extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
> +extern size_t cl_get_kernel_max_wg_sz(cl_kernel);
>
> #endif /* __CL_DEVICE_ID_H__ */
>
> diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 5d0b36e..aad3c04
> 100644
> --- a/src/cl_kernel.c
> +++ b/src/cl_kernel.c
> @@ -410,7 +410,7 @@ cl_kernel_work_group_sz(cl_kernel ker,
> for (i = 1; i < wk_dim; ++i)
> sz *= local_wk_sz[i];
>
> - if (sz > ker->program->ctx->device->max_work_group_size) {
> + if (sz > cl_get_kernel_max_wg_sz(ker)) {
> err = CL_INVALID_WORK_ITEM_SIZE;
> goto error;
> }
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index
> 3b89539..d403aa0 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -577,7 +577,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
> gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
>
> /* URB */
> - gpgpu->urb.num_cs_entries = 64;
> + gpgpu->urb.num_cs_entries = max_threads;
> gpgpu->urb.size_cs_entry = size_cs_entry;
> gpgpu->max_threads = max_threads;
>
> @@ -1113,6 +1113,7 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
> BEGIN_BATCH(gpgpu->batch, 11);
> OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9);
> OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0
> */
> + assert(thread_n <= 64);
> if (simd_sz == 16)
> OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread
> max */
> else
> --
> 1.8.3.2
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
>
--
Allan MacKinnon :: allanmac at pixel.io
Pixel I/O, LLC :: www.pixel.io :: @pixelio
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/beignet/attachments/20140630/38a1cd1f/attachment-0001.html>
More information about the Beignet
mailing list