[Beignet] [PATCH] Change the IVB/HSW's max_work_group_size to 512, and BYT to 256.

Zhigang Gong zhigang.gong at linux.intel.com
Tue Nov 25 20:22:21 PST 2014


LGTM, pushed, thanks.

On Fri, Nov 21, 2014 at 01:39:10PM +0800, Yang Rong wrote:
> To decide a kernel's work group size, an application should first query
> CL_DEVICE_MAX_WORK_GROUP_SIZE, and then query CL_KERNEL_WORK_GROUP_SIZE
> after clBuildProgram.
> But some applications only check CL_DEVICE_MAX_WORK_GROUP_SIZE, and if the kernel runs
> in SIMD8 mode, or for other reasons, the chosen size may exceed CL_KERNEL_WORK_GROUP_SIZE.
> So change CL_DEVICE_MAX_WORK_GROUP_SIZE to the minimum CL_KERNEL_WORK_GROUP_SIZE.
> 
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
>  src/cl_device_id.c | 30 +++++++++++++++---------------
>  1 file changed, 15 insertions(+), 15 deletions(-)
> 
> diff --git a/src/cl_device_id.c b/src/cl_device_id.c
> index 522c3c5..5ef0bde 100644
> --- a/src/cl_device_id.c
> +++ b/src/cl_device_id.c
> @@ -42,8 +42,8 @@ static struct _cl_device_id intel_ivb_gt2_device = {
>    .max_compute_unit = 16,
>    .max_thread_per_unit = 8,
>    .sub_slice_count = 2,
> -  .max_work_item_sizes = {1024, 1024, 1024},
> -  .max_work_group_size = 1024,
> +  .max_work_item_sizes = {512, 512, 512},
> +  .max_work_group_size = 512,
>    .max_clock_frequency = 1000,
>  #include "cl_gen7_device.h"
>  };
> @@ -64,8 +64,8 @@ static struct _cl_device_id intel_baytrail_t_device = {
>    .max_compute_unit = 4,
>    .max_thread_per_unit = 8,
>    .sub_slice_count = 1,
> -  .max_work_item_sizes = {512, 512, 512},
> -  .max_work_group_size = 512,
> +  .max_work_item_sizes = {256, 256, 256},
> +  .max_work_group_size = 256,
>    .max_clock_frequency = 1000,
>  #include "cl_gen7_device.h"
>  };
> @@ -76,8 +76,8 @@ static struct _cl_device_id intel_hsw_gt1_device = {
>    .max_compute_unit = 10,
>    .max_thread_per_unit = 7,
>    .sub_slice_count = 1,
> -  .max_work_item_sizes = {1024, 1024, 1024},
> -  .max_work_group_size = 1024,
> +  .max_work_item_sizes = {512, 512, 512},
> +  .max_work_group_size = 512,
>    .max_clock_frequency = 1000,
>  #include "cl_gen75_device.h"
>  };
> @@ -87,8 +87,8 @@ static struct _cl_device_id intel_hsw_gt2_device = {
>    .max_compute_unit = 20,
>    .max_thread_per_unit = 7,
>    .sub_slice_count = 2,
> -  .max_work_item_sizes = {1024, 1024, 1024},
> -  .max_work_group_size = 1024,
> +  .max_work_item_sizes = {512, 512, 512},
> +  .max_work_group_size = 512,
>    .max_clock_frequency = 1000,
>  #include "cl_gen75_device.h"
>  };
> @@ -98,8 +98,8 @@ static struct _cl_device_id intel_hsw_gt3_device = {
>    .max_compute_unit = 40,
>    .max_thread_per_unit = 7,
>    .sub_slice_count = 4,
> -  .max_work_item_sizes = {1024, 1024, 1024},
> -  .max_work_group_size = 1024,
> +  .max_work_item_sizes = {512, 512, 512},
> +  .max_work_group_size = 512,
>    .max_clock_frequency = 1000,
>  #include "cl_gen75_device.h"
>  };
> @@ -110,7 +110,7 @@ static struct _cl_device_id intel_brw_gt1_device = {
>    .max_compute_unit = 12,
>    .max_thread_per_unit = 7,
>    .sub_slice_count = 2,
> -  .max_work_item_sizes = {1024, 1024, 1024},
> +  .max_work_item_sizes = {512, 512, 512},
>    .max_work_group_size = 512,
>    .max_clock_frequency = 1000,
>  #include "cl_gen75_device.h"
> @@ -121,7 +121,7 @@ static struct _cl_device_id intel_brw_gt2_device = {
>    .max_compute_unit = 24,
>    .max_thread_per_unit = 7,
>    .sub_slice_count = 3,
> -  .max_work_item_sizes = {1024, 1024, 1024},
> +  .max_work_item_sizes = {512, 512, 512},
>    .max_work_group_size = 512,
>    .max_clock_frequency = 1000,
>  #include "cl_gen75_device.h"
> @@ -132,7 +132,7 @@ static struct _cl_device_id intel_brw_gt3_device = {
>    .max_compute_unit = 48,
>    .max_thread_per_unit = 7,
>    .sub_slice_count = 6,
> -  .max_work_item_sizes = {1024, 1024, 1024},
> +  .max_work_item_sizes = {512, 512, 512},
>    .max_work_group_size = 512,
>    .max_clock_frequency = 1000,
>  #include "cl_gen75_device.h"
> @@ -669,9 +669,9 @@ cl_get_kernel_max_wg_sz(cl_kernel kernel)
>      if(thread_cnt > 64)
>        thread_cnt = 64;
>      work_group_size = thread_cnt * simd_width;
> -    if(work_group_size > kernel->program->ctx->device->max_work_group_size)
> -      work_group_size = kernel->program->ctx->device->max_work_group_size;
>    }
> +  if(work_group_size > kernel->program->ctx->device->max_work_group_size)
> +    work_group_size = kernel->program->ctx->device->max_work_group_size;
>    return work_group_size;
>  }
>  
> -- 
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list