[Beignet] [PATCH 2/2] Runtime: Add support for non-uniform group size

Xiuli Pan xiuli.pan at intel.com
Tue Mar 15 23:52:46 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

Enqueue multiple times if the work group size is not uniform: at most 2
times for 1D, 4 times for 2D and 8 times for 3D. The per-dimension group
offset of the GPGPU_WALKER command in the batch buffer is used to keep the
work group IDs in series across the sub-enqueues.

TODO: handle events for the flush between multiple enqueues
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 src/cl_api.c                |  8 ------
 src/cl_command_queue.c      | 60 +++++++++++++++++++++++++++++++++++++++++++--
 src/cl_command_queue_gen7.c | 19 ++++++++------
 src/cl_driver.h             |  1 +
 src/intel/intel_gpgpu.c     | 14 ++++++-----
 5 files changed, 78 insertions(+), 24 deletions(-)
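
A minimal standalone sketch (illustration only, not code from this patch; the
100x30 global and 16x8 local sizes are made-up values) of the decomposition
that the new cl_command_queue_ND_range_wrap performs for a 2D range: each
dimension is split into the part covered by full work groups and the
remainder, which yields up to four uniform sub-launches, each with its own
effective local size and group offset.

/*
 * Standalone example, assumes a 2D range with global = {100, 30} and an
 * enqueued local size of {16, 8}. The four launches it prints are:
 *   96x24 with local 16x8, group offset (0,0)
 *    4x24 with local  4x8, group offset (6,0)
 *   96x6  with local 16x6, group offset (0,3)
 *    4x6  with local  4x6, group offset (6,3)
 */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
  const size_t global[2] = {100, 30};
  const size_t local[2]  = {16, 8};
  /* Per dimension: the span covered by full work groups and the remainder. */
  const size_t full[2] = {global[0] / local[0] * local[0],
                          global[1] / local[1] * local[1]};
  const size_t rem[2]  = {global[0] % local[0], global[1] % local[1]};
  const size_t *part[2] = {full, rem};
  int i, j;

  for (j = 0; j < 2; j++) {     /* y: full groups first, then the remainder */
    for (i = 0; i < 2; i++) {   /* x: full groups first, then the remainder */
      size_t use[2]  = {part[i][0], part[j][1]};
      size_t loc[2]  = {i ? rem[0] : local[0], j ? rem[1] : local[1]};
      size_t goff[2] = {i * (full[0] / local[0]), j * (full[1] / local[1])};
      if (use[0] == 0 || use[1] == 0)
        continue;               /* this corner is empty, nothing to enqueue */
      printf("size %zux%zu, local %zux%zu, group offset (%zu,%zu)\n",
             use[0], use[1], loc[0], loc[1], goff[0], goff[1]);
    }
  }
  return 0;
}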

diff --git a/src/cl_api.c b/src/cl_api.c
index 298a9ab..f45bd25 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -3394,14 +3394,6 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
       }
     }
 
-  /* Local sizes must be non-null and divide global sizes */
-  if (local_work_size != NULL)
-    for (i = 0; i < work_dim; ++i)
-      if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
-        err = CL_INVALID_WORK_GROUP_SIZE;
-        goto error;
-      }
-
   /* Queue and kernel must share the same context */
   assert(kernel->program);
   if (command_queue->ctx != kernel->program->ctx) {
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 24094c8..6572c47 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -209,7 +209,7 @@ cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, uint32_t ma
   return CL_SUCCESS;
 }
 
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *, const size_t *, const size_t *, const size_t *);
 
 static cl_int
 cl_kernel_check_args(cl_kernel k)
@@ -222,6 +222,61 @@ cl_kernel_check_args(cl_kernel k)
 }
 
 LOCAL cl_int
+cl_command_queue_ND_range_wrap(cl_command_queue queue,
+                               cl_kernel ker,
+                               const uint32_t work_dim,
+                               const size_t *global_wk_off,
+                               const size_t *global_wk_sz,
+                               const size_t *local_wk_sz)
+{
+  /* Used for non-uniform work group sizes */
+  cl_int err = CL_SUCCESS;
+  int i, j, k;
+  const size_t global_wk_sz_div[3] = {
+    global_wk_sz[0]/local_wk_sz[0]*local_wk_sz[0],
+    global_wk_sz[1]/local_wk_sz[1]*local_wk_sz[1],
+    global_wk_sz[2]/local_wk_sz[2]*local_wk_sz[2]
+  };
+
+  const size_t global_wk_sz_rem[3] = {
+    global_wk_sz[0]%local_wk_sz[0],
+    global_wk_sz[1]%local_wk_sz[1],
+    global_wk_sz[2]%local_wk_sz[2]
+  };
+
+  const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+  /* Go through at most 8 cases and enqueue only when there are work items left */
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      for (k = 0; k < 2; k++) {
+        size_t global_wk_sz_use[3] = {global_wk_all[k][0], global_wk_all[j][1], global_wk_all[i][2]};
+        size_t global_dim_off[3] = {
+          k * global_wk_sz_div[0] / local_wk_sz[0],
+          j * global_wk_sz_div[1] / local_wk_sz[1],
+          i * global_wk_sz_div[2] / local_wk_sz[2]
+        };
+        size_t local_wk_sz_use[3] = {
+          k ? global_wk_sz_rem[0] : local_wk_sz[0],
+          j ? global_wk_sz_rem[1] : local_wk_sz[1],
+          i ? global_wk_sz_rem[2] : local_wk_sz[2]
+        };
+        if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0) continue;
+        TRY (cl_command_queue_ND_range_gen7, queue, ker, work_dim, global_wk_off, global_dim_off, global_wk_sz, global_wk_sz_use, local_wk_sz, local_wk_sz_use);
+        /* TODO: handle events for multiple enqueues; the flush below is a workaround so the uniform group size case keeps its previous behavior */
+        if (!(global_wk_sz_rem[0] == 0 && global_wk_sz_rem[1] == 0 && global_wk_sz_rem[2] == 0))
+          err = cl_command_queue_flush(queue);
+      }
+      if (work_dim < 2)
+        break;
+    }
+    if (work_dim < 3)
+      break;
+  }
+error:
+  return err;
+}
+
+LOCAL cl_int
 cl_command_queue_ND_range(cl_command_queue queue,
                           cl_kernel k,
                           const uint32_t work_dim,
@@ -238,7 +293,8 @@ cl_command_queue_ND_range(cl_command_queue queue,
   TRY (cl_kernel_check_args, k);
 
   if (ver == 7 || ver == 75 || ver == 8 || ver == 9)
-    TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+    //TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+    TRY (cl_command_queue_ND_range_wrap, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
   else
     FATAL ("Unknown Gen Device");
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 6bfacbf..b00e383 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -240,9 +240,9 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
   UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
   UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
-  UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0]/local_wk_sz[0]);
-  UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
-  UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
+  UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0] / enqueued_local_wk_sz[0] + (global_wk_sz[0] % enqueued_local_wk_sz[0] ? 1 : 0));
+  UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1] / enqueued_local_wk_sz[1] + (global_wk_sz[1] % enqueued_local_wk_sz[1] ? 1 : 0));
+  UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2] / enqueued_local_wk_sz[2] + (global_wk_sz[2] % enqueued_local_wk_sz[2] ? 1 : 0));
   UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
 #undef UPLOAD
@@ -338,8 +338,11 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
                                cl_kernel ker,
                                const uint32_t work_dim,
                                const size_t *global_wk_off,
+                               const size_t *global_dim_off,
                                const size_t *global_wk_sz,
-                               const size_t *local_wk_sz)
+                               const size_t *global_wk_sz_use,
+                               const size_t *local_wk_sz,
+                               const size_t *local_wk_sz_use)
 {
   GET_QUEUE_THREAD_GPGPU(queue);
   cl_context ctx = queue->ctx;
@@ -365,7 +368,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   kernel.use_slm = interp_kernel_use_slm(ker->opaque);
 
   /* Compute the number of HW threads we need */
-  if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz) != CL_SUCCESS)) {
+  if (UNLIKELY((err = cl_kernel_work_group_sz(ker, local_wk_sz_use, 3, &local_sz)) != CL_SUCCESS)) {
     fprintf(stderr, "Beignet: Work group size exceed Kerne's work group size.\n");
     return err;
   }
@@ -378,7 +381,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   }
   /* Curbe step 1: fill the constant urb buffer data shared by all threads */
   if (ker->curbe) {
-    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz,local_wk_sz ,local_wk_sz, thread_n);
+    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz_use, local_wk_sz, thread_n);
     if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
       fprintf(stderr, "Beignet: Out of shared local memory %d.\n", kernel.slm_sz);
       return CL_OUT_OF_RESOURCES;
@@ -428,7 +431,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
     for (i = 0; i < thread_n; ++i) {
         memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
     }
-    TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
+    TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz_use, simd_sz, cst_sz, thread_n);
     if (cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz) != 0)
       goto error;
   }
@@ -441,7 +444,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   cl_gpgpu_batch_start(gpgpu);
 
   /* Issue the GPGPU_WALKER command */
-  cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+  cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_dim_off, global_wk_sz_use, local_wk_sz_use);
 
   /* Close the batch buffer and submit it */
   cl_gpgpu_batch_end(gpgpu, 0);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 39c5f30..25323ac 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -285,6 +285,7 @@ typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
                                   uint32_t simd_sz,
                                   uint32_t thread_n,
                                   const size_t global_wk_off[3],
+                                  const size_t global_dim_off[3],
                                   const size_t global_wk_sz[3],
                                   const size_t local_wk_sz[3]);
 extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index ffdd122..727c0fb 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1840,6 +1840,7 @@ intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu,
                    uint32_t simd_sz,
                    uint32_t thread_n,
                    const size_t global_wk_off[3],
+                   const size_t global_dim_off[3],
                    const size_t global_wk_sz[3],
                    const size_t local_wk_sz[3])
 {
@@ -1889,6 +1890,7 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
                    uint32_t simd_sz,
                    uint32_t thread_n,
                    const size_t global_wk_off[3],
+                   const size_t global_dim_off[3],
                    const size_t global_wk_sz[3],
                    const size_t local_wk_sz[3])
 {
@@ -1916,14 +1918,14 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
     OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
   else
     OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8  | thread max */
+  OUT_BATCH(gpgpu->batch, global_dim_off[0]);
   OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[0]+global_dim_off[0]);
+  OUT_BATCH(gpgpu->batch, global_dim_off[1]);
   OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
-  OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
-  OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[1]+global_dim_off[1]);
+  OUT_BATCH(gpgpu->batch, global_dim_off[2]);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[2]+global_dim_off[2]);
   OUT_BATCH(gpgpu->batch, right_mask);
   OUT_BATCH(gpgpu->batch, ~0x0);                     /* we always set height as 1, so set bottom mask as all 1*/
   ADVANCE_BATCH(gpgpu->batch);
-- 
2.5.0