[Beignet] [PATCH] runtime: choose the actual EU number as the max compute units.

Fri Jun 20 03:09:05 PDT 2014

Using the EU number as the compute unit count makes more sense.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 src/cl_command_queue_gen7.c |  6 +++---
 src/cl_device_id.c          | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 978650a..7f00a87 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -243,7 +243,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
    */
   assert(offset >= 0);
   stack_sz *= interp_kernel_get_simd_width(ker->opaque);
-  stack_sz *= device->max_compute_unit;
+  stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
   /* Because HSW calc stack offset per thread is relative with half slice, when
      thread schedule in half slice is not balance, would out of bound. Because
      the max half slice is 4 in GT4, multiply stack size with 4 for safe.
@@ -326,9 +326,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
 
   /* Setup the kernel */
   if (queue->props & CL_QUEUE_PROFILING_ENABLE)
-    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
+    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 1);
   else
-    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);
+    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 0);
 
   printf_num = interp_get_printf_num(printf_info);
   if (printf_num) {
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 1a2565c..f8c5c86 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -38,7 +38,7 @@
 
 static struct _cl_device_id intel_ivb_gt2_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 128,
+  .max_compute_unit = 16,
   .max_thread_per_unit = 8,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 1024,
@@ -49,7 +49,7 @@ static struct _cl_device_id intel_ivb_gt2_device = {
 
 static struct _cl_device_id intel_ivb_gt1_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 64,
+  .max_compute_unit = 8,
   .max_thread_per_unit = 8,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
@@ -60,7 +60,7 @@ static struct _cl_device_id intel_ivb_gt1_device = {
 
 static struct _cl_device_id intel_baytrail_t_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 32,
+  .max_compute_unit = 4,
   .max_thread_per_unit = 8,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 256,
@@ -72,7 +72,7 @@ static struct _cl_device_id intel_baytrail_t_device = {
 /* XXX we clone IVB for HSW now */
 static struct _cl_device_id intel_hsw_gt1_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 70,
+  .max_compute_unit = 10,
   .max_thread_per_unit = 7,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
@@ -83,7 +83,7 @@ static struct _cl_device_id intel_hsw_gt1_device = {
 
 static struct _cl_device_id intel_hsw_gt2_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 140,
+  .max_compute_unit = 20,
   .max_thread_per_unit = 7,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 1024,
@@ -94,7 +94,7 @@ static struct _cl_device_id intel_hsw_gt2_device = {
 
 static struct _cl_device_id intel_hsw_gt3_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 280,
+  .max_compute_unit = 40,
   .max_thread_per_unit = 7,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 1024,
-- 
1.8.3.2