[Beignet] [PATCH v2 3/4] HSW: enable the surface cache on HSW.

Yang Rong rong.r.yang at intel.com
Mon Jun 9 08:29:49 PDT 2014


HSW's surface cache control has changed, so correct it. Also correct the scratch
size calculation, and disable the exec flag for SLM for now; once the kernel
command parsing work is finished, the flag should be removed entirely.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
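A note for reviewers, not part of the commit: on HSW the surface state cache
control field is split into LLC coherency control bits (bits 2:1) and an L3
cache control bit (bit 0), so the single gen7 cc_llc_l3 value no longer
applies. Below is a minimal standalone sketch of the per-gen defaults, using
the enum values this patch introduces; the main() harness is illustrative
only:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* gen7: one cache control field, LLC + L3 cacheable. */
  enum { cc_llc_l3 = 0x3 };
  /* gen75: separate L3 and LLC coherency controls (values from
   * cl_driver.h in this patch). */
  enum { l3cc_ec  = 0x1 };        /* L3 control, bit 0      */
  enum { llccc_ec = 0x2 << 1 };   /* LLC control, bits 2:1  */

  static uint32_t get_cache_ctrl_gen7(void)  { return cc_llc_l3; }
  static uint32_t get_cache_ctrl_gen75(void) { return llccc_ec | l3cc_ec; }

  int main(void)
  {
    assert(get_cache_ctrl_gen7()  == 0x3);
    assert(get_cache_ctrl_gen75() == 0x5);  /* (0x2 << 1) | 0x1 */
    printf("gen7 0x%x, gen75 0x%x\n",
           get_cache_ctrl_gen7(), get_cache_ctrl_gen75());
    return 0;
  }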
 src/cl_command_queue.c      |  4 +--
 src/cl_command_queue_gen7.c |  4 +--
 src/cl_driver.h             | 19 +++++++++++++-
 src/cl_driver_defs.c        |  1 +
 src/intel/intel_gpgpu.c     | 61 ++++++++++++++++++++++++++++-----------------
 5 files changed, 61 insertions(+), 28 deletions(-)
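On the scratch fix: gen7 encodes per-thread scratch space linearly in 1KB
steps, while HSW encodes it as a power-of-two multiple of 2KB, which is why
the shift changes from 12 to 11. A small sketch mirroring the two index
functions in this patch (it assumes power-of-two sizes; the test values are
illustrative):

  #include <assert.h>
  #include <stdint.h>

  /* gen7: linear encoding, 1KB granularity. */
  static uint32_t scratch_index_gen7(uint32_t size)
  {
    return size / 1024 - 1;
  }

  /* gen75: log2 encoding, 2KB granularity (hence >> 11, not >> 12). */
  static uint32_t scratch_index_gen75(uint32_t size)
  {
    uint32_t index = 0;
    size >>= 11;
    while ((size >>= 1) > 0)
      index++;              /* position of the leading one */
    return index;
  }

  int main(void)
  {
    assert(scratch_index_gen7(8 * 1024)  == 7);
    assert(scratch_index_gen75(2 * 1024) == 0);
    assert(scratch_index_gen75(8 * 1024) == 2);   /* log2(8K / 2K) */
    return 0;
  }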

diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index a2109d7..e6553ec 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -157,9 +157,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
     if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
       struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cl_gpgpu_get_cache_ctrl());
     } else {
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cl_gpgpu_get_cache_ctrl());
     }
   }
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index c9818e6..3401baa 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -31,7 +31,7 @@
 #include <string.h>
 
 #define MAX_GROUP_SIZE_IN_HALFSLICE   512
-static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+32; }
+static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+128; }
 
 /* "Varing" payload is the part of the curbe that changes accross threads in the
  *  same work group. Right now, it consists in local IDs and block IPs
@@ -244,7 +244,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   assert(offset >= 0);
   stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
   stack_sz *= device->max_compute_unit;
-  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
+  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl());
 }
 
 LOCAL cl_int
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 3e01c92..2bca443 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -59,7 +59,7 @@ typedef enum cl_gpgpu_tiling {
   GPGPU_TILE_Y  = 2,
 } cl_gpgpu_tiling;
 
-/* Cache control options */
+/* Cache control options for gen7 */
 typedef enum cl_cache_control {
   cc_gtt      = 0x0,
   cc_l3       = 0x1,
@@ -67,6 +67,20 @@ typedef enum cl_cache_control {
   cc_llc_l3   = 0x3
 } cl_cache_control;
 
+/* L3 Cache control options for gen75 */
+typedef enum cl_l3_cache_control {
+  l3cc_uc      = 0x0,
+  l3cc_ec       = 0x1
+} cl_l3_cache_control;
+
+/* LLCCC Cache control options for gen75 */
+typedef enum cl_llccc_cache_control {
+  llccc_pte      = 0x0<<1,
+  llccc_uc       = 0x1<<1,
+  llccc_ec       = 0x2<<1,
+  llccc_ucllc    = 0x3<<1
+} cl_llccc_cache_control;
+
 typedef enum gpu_command_status {
   command_queued    = 3,
   command_submitted = 2,
@@ -106,6 +120,9 @@ extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
 extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
 
+/* get the default cache control value. */
+typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
+extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
 /* Set a 2d texture */
 typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
                                       uint32_t id,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 95a1a03..ab3af49 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -64,6 +64,7 @@ LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
 LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
 LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
 LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
 LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index bde9bd5..20b832a 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -121,8 +121,8 @@ typedef struct intel_gpgpu intel_gpgpu_t;
 typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
 intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
 
-typedef uint32_t (get_scratch_index_t)(uint32_t size);
-get_scratch_index_t *get_scratch_index = NULL;
+typedef uint32_t (intel_gpgpu_get_scratch_index_t)(uint32_t size);
+intel_gpgpu_get_scratch_index_t *intel_gpgpu_get_scratch_index = NULL;
 
 static void
 intel_gpgpu_sync(void *buf)
@@ -194,10 +194,22 @@ intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
   ADVANCE_BATCH(gpgpu->batch);
 }
 
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen7()
+{
+  return cc_llc_l3;
+}
+
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen75()
+{
+  return llccc_ec | l3cc_ec;
+}
+
 static void
 intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
 {
-  const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */
+  const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
   BEGIN_BATCH(gpgpu->batch, 10);
   OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
   /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
@@ -233,12 +245,12 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
   ADVANCE_BATCH(gpgpu->batch);
 }
 
-uint32_t get_scratch_index_gen7(uint32_t size) {
+uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
   return size / 1024 - 1;
 }
 
-uint32_t get_scratch_index_gen75(uint32_t size) {
-    size = size >> 12;
+uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
+    size = size >> 11;
     uint32_t index = 0;
     while((size >>= 1) > 0)
       index++;   //get leading one
@@ -256,7 +268,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
   OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
 
   if(gpgpu->per_thread_scratch > 0) {
-    scratch_index = get_scratch_index(gpgpu->per_thread_scratch);
+    scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch);
     OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
               I915_GEM_DOMAIN_RENDER,
               I915_GEM_DOMAIN_RENDER,
@@ -356,11 +368,9 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
 static void
 intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 {
-  /* still set L3 in batch buffer for fulsim. */
   BEGIN_BATCH(gpgpu->batch, 9);
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
-
   OUT_BATCH(gpgpu->batch, 0x00730000);
 
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
@@ -377,7 +387,7 @@ intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
     OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
   else
     OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
-    ADVANCE_BATCH(gpgpu->batch);
+  ADVANCE_BATCH(gpgpu->batch);
 
   intel_gpgpu_pipe_control(gpgpu);
 }
@@ -411,25 +421,29 @@ static void
 intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 {
   /* still set L3 in batch buffer for fulsim. */
-  BEGIN_BATCH(gpgpu->batch, 6);
+  BEGIN_BATCH(gpgpu->batch, 9);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0x00610000);
+
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
   if (use_slm)
-    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[8]);
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
   else
     OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
 
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
   if (use_slm)
-    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[8]);
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
   else
     OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
     ADVANCE_BATCH(gpgpu->batch);
 
-  //To set L3 in HSW, enable the flag I915_EXEC_ENABLE_SLM flag when exec
-  if(use_slm)
-    gpgpu->batch->enable_slm = 1;
+  //if(use_slm)
+  //  gpgpu->batch->enable_slm = 1;
   intel_gpgpu_pipe_control(gpgpu);
 }
 
@@ -614,7 +628,7 @@ intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
   ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
   ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
   ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
-  ss2->ss5.cache_control = cc_llc_l3;
+  ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
   heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
 
   if(gpgpu->constant_b.bo)
@@ -652,7 +666,7 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
   ss1->ss2.height = ss0->ss2.height = 16383; /* bits 20:7 of sz */
   ss0->ss3.depth  = 1023; /* bits 30:21 of sz */
   ss1->ss3.depth  = 1023;  /* bits 30:21 of sz */
-  ss1->ss5.cache_control = ss0->ss5.cache_control = cc_llc_l3;
+  ss1->ss5.cache_control = ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
   heap->binding_table[0] = offsetof(surface_heap_t, surface);
   heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
 }
@@ -702,7 +716,7 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
   ss->ss4.not_str_buf.rt_view_extent = depth - 1;
   ss->ss4.not_str_buf.min_array_element = 0;
   ss->ss3.pitch = pitch - 1;
-  ss->ss5.cache_control = cc_llc_l3;
+  ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
   if (tiling == GPGPU_TILE_X) {
     ss->ss0.tiled_surface = 1;
     ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
@@ -743,7 +757,7 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
   ss->ss4.not_str_buf.rt_view_extent = depth - 1;
   ss->ss4.not_str_buf.min_array_element = 0;
   ss->ss3.pitch = pitch - 1;
-  ss->ss5.cache_control = cc_llc_l3;
+  ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
   ss->ss7.shader_r = I965_SURCHAN_SELECT_RED;
   ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
   ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
@@ -1208,7 +1222,8 @@ intel_set_gpgpu_callbacks(int device_id)
   if (IS_HASWELL(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
     intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
-    get_scratch_index = get_scratch_index_gen75;
+    cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75;
+    intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75;
   }
   else if (IS_IVYBRIDGE(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
@@ -1216,9 +1231,9 @@ intel_set_gpgpu_callbacks(int device_id)
       intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
     else
       intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7;
-    get_scratch_index = get_scratch_index_gen7;
+    cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7;
+    intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7;
   }
   else
     assert(0);
 }
-
-- 
1.8.3.2


