[Beignet] [PATCH v2] runtime: fix potential curbe allocation issue.

Mon Jun 30 23:54:07 PDT 2014

According to the spec, different platforms have different curbe
allocation restrictions. The previous code statically set the curbe
allocated size to 480, which is not correct.

This patch changes the code to always set the curbe entry num to 64,
which is the maximum work group size, and to set a proper curbe
allocation size according to the platform's hard limitation
and a relatively reasonable kernel argument usage limitation.

v2:
When we call load_vfe_state, we already know the exact constant URB
size used by the current kernel. We can choose the smallest valid curbe
size for this kernel. If the size exceeds the hardware limitation,
we report it as a warning here.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 src/cl_gt_device.h      |  2 +-
 src/intel/intel_gpgpu.c | 41 ++++++++++++++++++++++++++++-------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 63c9047..97ba7e2 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -39,7 +39,7 @@
 .address_bits = 32,
 .max_mem_alloc_size = 256 * 1024 * 1024,
 .image_support = CL_TRUE,
-.max_read_image_args = 128,
+.max_read_image_args = 16,
 .max_write_image_args = 8,
 .image_max_array_size = 2048,
 .image2d_max_width = 8192,
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index d403aa0..48e2769 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -116,7 +116,7 @@ struct intel_gpgpu
   struct {
     uint32_t num_cs_entries;
     uint32_t size_cs_entry;  /* size of one entry in 512bit elements */
-  } urb;
+  } curb;
 
   uint32_t max_threads;      /* max threads requested by the user */
 };
@@ -275,6 +275,22 @@ uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
     return index;
 }
 
+#define MAX_KERNEL_ARG_SIZE (32 * 4 + 24 * 4 + 5 * 64) * 64 // 32 integer arguments, 24 uniform special register and 5 vector special register.
+
+LOCAL cl_int
+cl_get_max_curbe_size(uint32_t device_id)
+{
+  int max_curbe_size;
+  if (IS_BAYTRAIL_T(device_id) ||
+      IS_IVB_GT1(device_id))
+    max_curbe_size = 992;
+  else
+    max_curbe_size = 2016;
+
+  return (max_curbe_size*32) > MAX_KERNEL_ARG_SIZE ?
+         (MAX_KERNEL_ARG_SIZE / 32) : max_curbe_size;
+}
+
 static void
 intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
 {
@@ -293,10 +309,10 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
     OUT_BATCH(gpgpu->batch, 0);
   }
   /* max_thread | urb entries | (reset_gateway|bypass_gate_way | gpgpu_mode) */
-  OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (64 << 8) | 0xc4);
+  OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (0 << 8) | 0xc4);
   OUT_BATCH(gpgpu->batch, 0);
   /* curbe_size */
-  OUT_BATCH(gpgpu->batch, 480);
+  OUT_BATCH(gpgpu->batch, cl_get_max_curbe_size(gpgpu->drv->device_id));
   OUT_BATCH(gpgpu->batch, 0);
   OUT_BATCH(gpgpu->batch, 0);
   OUT_BATCH(gpgpu->batch, 0);
@@ -306,17 +322,16 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
 static void
 intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
 {
+  int curbe_size = gpgpu->curb.size_cs_entry * gpgpu->curb.num_cs_entries;
+  if (curbe_size > cl_get_max_curbe_size(gpgpu->drv->device_id)) {
+    curbe_size = cl_get_max_curbe_size(gpgpu->drv->device_id);
+    fprintf(stderr, "warning, curbe size exceed limitation.\n");
+  }
   BEGIN_BATCH(gpgpu->batch, 4);
   OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
   OUT_BATCH(gpgpu->batch, 0);                     /* mbz */
-// XXX
-#if 1
   OUT_BATCH(gpgpu->batch,
-            gpgpu->urb.size_cs_entry*
-            gpgpu->urb.num_cs_entries*32);
-#else
-  OUT_BATCH(gpgpu->batch, 5120);
-#endif
+            curbe_size * 32);
   OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.curbe_offset);
   ADVANCE_BATCH(gpgpu->batch);
 }
@@ -577,8 +592,8 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
 
   /* URB */
-  gpgpu->urb.num_cs_entries = max_threads;
-  gpgpu->urb.size_cs_entry = size_cs_entry;
+  gpgpu->curb.num_cs_entries = 64;
+  gpgpu->curb.size_cs_entry = size_cs_entry;
   gpgpu->max_threads = max_threads;
 
   if (gpgpu->printf_b.ibo)
@@ -616,7 +631,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   //curbe must be 32 bytes aligned
   size_aux = ALIGN(size_aux, 32);
   gpgpu->aux_offset.curbe_offset = size_aux;
-  size_aux += gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
+  size_aux += gpgpu->curb.num_cs_entries * gpgpu->curb.size_cs_entry * 32;
 
   //idrt must be 32 bytes aligned
   size_aux = ALIGN(size_aux, 32);
-- 
1.8.3.2