[Beignet] [PATCH 4/5] HSW: enable the surface cache on HSW.
Yang Rong
rong.r.yang at intel.com
Thu May 29 09:37:33 PDT 2014
HSW's surface cache control has changed; program it correctly. Also disable
the SLM exec flag for now: once the kernel command parser work is finished,
it should be removed entirely.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
src/cl_command_queue.c | 4 +--
src/cl_command_queue_gen7.c | 4 +--
src/cl_device_id.c | 2 +-
src/cl_driver.h | 19 +++++++++++++-
src/cl_driver_defs.c | 1 +
src/intel/intel_gpgpu.c | 61 ++++++++++++++++++++++++++++-----------------
6 files changed, 62 insertions(+), 29 deletions(-)
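A note for reviewers on the new cache-control layout, with a small standalone
sketch (not part of the patch): on gen7 the surface-state cache-control field
is a single 2-bit value, while on gen75 it splits into an L3 control bit
(bit 0) and an LLC/eLLC control field (bits 2:1). The enums below mirror the
ones this patch adds to src/cl_driver.h, so the HSW default becomes
llccc_ec | l3cc_ec (0x5) where gen7 used cc_llc_l3 (0x3):

#include <stdio.h>

/* Values mirror the enums added to src/cl_driver.h by this patch. */
enum gen7_cc     { cc_gtt = 0x0, cc_l3 = 0x1, cc_llc = 0x2, cc_llc_l3 = 0x3 };
enum gen75_l3cc  { l3cc_uc = 0x0, l3cc_ec = 0x1 };                  /* bit 0 */
enum gen75_llccc { llccc_pte = 0x0 << 1, llccc_uc = 0x1 << 1,
                   llccc_ec  = 0x2 << 1, llccc_ucllc = 0x3 << 1 };  /* bits 2:1 */

int main(void)
{
  printf("gen7  default cache control: 0x%x\n", cc_llc_l3);          /* 0x3 */
  printf("gen75 default cache control: 0x%x\n", llccc_ec | l3cc_ec); /* 0x5 */
  return 0;
}

This is why every hard-coded cc_llc_l3 in the diff below is replaced by the
new per-generation cl_gpgpu_get_cache_ctrl() hook.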
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 6a699c0..d4da269 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -157,9 +157,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cl_gpgpu_get_cache_ctrl());
} else {
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cl_gpgpu_get_cache_ctrl());
}
}
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 891d6f1..f1eb7aa 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -30,7 +30,7 @@
#include <stdio.h>
#include <string.h>
-static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+32; }
+static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+128; }
/* "Varing" payload is the part of the curbe that changes accross threads in the
* same work group. Right now, it consists in local IDs and block IPs
@@ -243,7 +243,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
assert(offset >= 0);
stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
stack_sz *= device->max_compute_unit;
- cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
+ cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl());
}
LOCAL cl_int
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 018da95..538c88a 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -86,7 +86,7 @@ static struct _cl_device_id intel_hsw_gt2_device = {
.max_compute_unit = 140,
.max_thread_per_unit = 7,
.max_work_item_sizes = {512, 512, 512},
- .max_work_group_size = 512,
+ .max_work_group_size = 1024,
.max_clock_frequency = 1000,
.wg_sz = 1024,
#include "cl_gen75_device.h"
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 9dc2330..ba2ab0d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -59,7 +59,7 @@ typedef enum cl_gpgpu_tiling {
GPGPU_TILE_Y = 2,
} cl_gpgpu_tiling;
-/* Cache control options */
+/* Cache control options for gen7 */
typedef enum cl_cache_control {
cc_gtt = 0x0,
cc_l3 = 0x1,
@@ -67,6 +67,20 @@ typedef enum cl_cache_control {
cc_llc_l3 = 0x3
} cl_cache_control;
+/* L3 Cache control options for gen75 */
+typedef enum cl_l3_cache_control {
+ l3cc_uc = 0x0,
+ l3cc_ec = 0x1
+} cl_l3_cache_control;
+
+/* LLCCC Cache control options for gen75 */
+typedef enum cl_llccc_cache_control {
+ llccc_pte = 0x0<<1,
+ llccc_uc = 0x1<<1,
+ llccc_ec = 0x2<<1,
+ llccc_ucllc = 0x3<<1
+} cl_llccc_cache_control;
+
typedef enum gpu_command_status {
command_queued = 3,
command_submitted = 2,
@@ -106,6 +120,9 @@ extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
+/* get the default cache control value. */
+typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
+extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
/* Set a 2d texture */
typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
uint32_t id,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 95a1a03..ab3af49 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -64,6 +64,7 @@ LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index fba480c..2cde179 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -121,8 +121,8 @@ typedef struct intel_gpgpu intel_gpgpu_t;
typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
-typedef uint32_t (get_scratch_index_t)(uint32_t size);
-get_scratch_index_t *get_scratch_index = NULL;
+typedef uint32_t (intel_gpgpu_get_scratch_index_t)(uint32_t size);
+intel_gpgpu_get_scratch_index_t *intel_gpgpu_get_scratch_index = NULL;
static void
intel_gpgpu_sync(void *buf)
@@ -194,10 +194,22 @@ intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
ADVANCE_BATCH(gpgpu->batch);
}
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen7()
+{
+ return cc_llc_l3;
+}
+
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen75()
+{
+ return llccc_ec | l3cc_ec;
+}
+
static void
intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
{
- const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */
+ const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
BEGIN_BATCH(gpgpu->batch, 10);
OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
/* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
@@ -233,12 +245,12 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
ADVANCE_BATCH(gpgpu->batch);
}
-uint32_t get_scratch_index_gen7(uint32_t size) {
+uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
return size / 1024 - 1;
}
-uint32_t get_scratch_index_gen75(uint32_t size) {
- size = size >> 12;
+uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
+ size = size >> 11;
uint32_t index = 0;
while((size >>= 1) > 0)
index++; //get leading one
@@ -256,7 +268,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
if(gpgpu->per_thread_scratch > 0) {
- scratch_index = get_scratch_index(gpgpu->per_thread_scratch);
+ scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch);
OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
@@ -356,11 +368,9 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
static void
intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
{
- /* still set L3 in batch buffer for fulsim. */
BEGIN_BATCH(gpgpu->batch, 9);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
-
OUT_BATCH(gpgpu->batch, 0x00730000);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
@@ -377,7 +387,7 @@ intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
- ADVANCE_BATCH(gpgpu->batch);
+ ADVANCE_BATCH(gpgpu->batch);
intel_gpgpu_pipe_control(gpgpu);
}
@@ -411,25 +421,29 @@ static void
intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
{
/* still set L3 in batch buffer for fulsim. */
- BEGIN_BATCH(gpgpu->batch, 6);
+ BEGIN_BATCH(gpgpu->batch, 9);
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x00610000);
+
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
if (use_slm)
- OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[8]);
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
if (use_slm)
- OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[8]);
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
ADVANCE_BATCH(gpgpu->batch);
- //To set L3 in HSW, enable the flag I915_EXEC_ENABLE_SLM flag when exec
- if(use_slm)
- gpgpu->batch->enable_slm = 1;
+ //if(use_slm)
+ // gpgpu->batch->enable_slm = 1;
intel_gpgpu_pipe_control(gpgpu);
}
@@ -614,7 +628,7 @@ intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */
ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
- ss2->ss5.cache_control = cc_llc_l3;
+ ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
if(gpgpu->constant_b.bo)
@@ -652,7 +666,7 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
ss1->ss2.height = ss0->ss2.height = 16383; /* bits 20:7 of sz */
ss0->ss3.depth = 1023; /* bits 30:21 of sz */
ss1->ss3.depth = 1023; /* bits 30:21 of sz */
- ss1->ss5.cache_control = ss0->ss5.cache_control = cc_llc_l3;
+ ss1->ss5.cache_control = ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
heap->binding_table[0] = offsetof(surface_heap_t, surface);
heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
}
@@ -702,7 +716,7 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
ss->ss4.not_str_buf.rt_view_extent = depth - 1;
ss->ss4.not_str_buf.min_array_element = 0;
ss->ss3.pitch = pitch - 1;
- ss->ss5.cache_control = cc_llc_l3;
+ ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
if (tiling == GPGPU_TILE_X) {
ss->ss0.tiled_surface = 1;
ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
@@ -743,7 +757,7 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
ss->ss4.not_str_buf.rt_view_extent = depth - 1;
ss->ss4.not_str_buf.min_array_element = 0;
ss->ss3.pitch = pitch - 1;
- ss->ss5.cache_control = cc_llc_l3;
+ ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
ss->ss7.shader_r = I965_SURCHAN_SELECT_RED;
ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
@@ -1200,7 +1214,8 @@ intel_set_gpgpu_callbacks(int device_id)
if (IS_HASWELL(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
- get_scratch_index = get_scratch_index_gen75;
+ cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75;
+ intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75;
}
else if (IS_IVYBRIDGE(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
@@ -1208,9 +1223,9 @@ intel_set_gpgpu_callbacks(int device_id)
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
else
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7;
- get_scratch_index = get_scratch_index_gen7;
+ cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7;
+ intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7;
}
else
assert(0);
}
-
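For reference, a standalone sketch (not part of the patch) of the gen75
scratch-size encoding fixed in intel_gpgpu_get_scratch_index_gen75 above.
Assuming, as the code implies, that the field holds log2(size / 2KB) for
power-of-two sizes starting at 2KB, the old ">> 12" was off by one (both
2KB and 4KB mapped to index 0):

#include <assert.h>
#include <stdint.h>

/* Same logic as intel_gpgpu_get_scratch_index_gen75 in the patch. */
static uint32_t scratch_index_gen75(uint32_t size)
{
  uint32_t index = 0;
  size >>= 11;              /* express the size in 2KB units */
  while ((size >>= 1) > 0)  /* position of the leading one: log2 */
    index++;
  return index;
}

int main(void)
{
  assert(scratch_index_gen75(2 * 1024) == 0);
  assert(scratch_index_gen75(4 * 1024) == 1);
  assert(scratch_index_gen75(8 * 1024) == 2);
  return 0;
}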
--
1.8.3.2