[Beignet] [PATCH v2 2/4] HSW: Set correct max work group size for GT2 and GT3.

Yang Rong rong.r.yang at intel.com
Mon Jun 9 08:29:48 PDT 2014


v2: Return an error when the work group can't fit into a single half slice.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 src/cl_command_queue_gen7.c | 7 +++++++
 src/cl_device_id.c          | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index d875021..c9818e6 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -30,6 +30,7 @@
 #include <stdio.h>
 #include <string.h>
 
+#define MAX_GROUP_SIZE_IN_HALFSLICE   512
 static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+32; }
 
 /* "Varing" payload is the part of the curbe that changes accross threads in the
@@ -278,6 +279,12 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
   kernel.curbe_sz = cst_sz;
 
+  /* Barrier and SLM must fit into a single half slice */
+  if(kernel.use_slm > 0 && simd_sz == 8 && local_sz > MAX_GROUP_SIZE_IN_HALFSLICE){
+    fprintf(stderr, "Beignet: Work group CAN NOT large than %d when using barrier or local momery.\n", MAX_GROUP_SIZE_IN_HALFSLICE);
+    return CL_OUT_OF_RESOURCES;
+  }
+
   if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
     fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz);
     return CL_OUT_OF_RESOURCES;
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index d2b3bed..c435307 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -86,7 +86,7 @@ static struct _cl_device_id intel_hsw_gt2_device = {
   .max_compute_unit = 140,
   .max_thread_per_unit = 7,
   .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
+  .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
   .wg_sz = 1024,
 #include "cl_gen75_device.h"
@@ -97,7 +97,7 @@ static struct _cl_device_id intel_hsw_gt3_device = {
   .max_compute_unit = 280,
   .max_thread_per_unit = 7,
   .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
+  .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
   .wg_sz = 2048,
 #include "cl_gen75_device.h"
-- 
1.8.3.2



More information about the Beignet mailing list