[Beignet] [PATCH 4/5] HSW: enable the surface cache on HSW.
Yang Rong
rong.r.yang at intel.com
Thu May 29 09:37:33 PDT 2014
HSW's surface cache control has changed; program it correctly. Also disable
the SLM exec flag for now: once the kernel command parser work is finished,
it should be removed entirely.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
src/cl_command_queue.c | 4 +--
src/cl_command_queue_gen7.c | 4 +--
src/cl_device_id.c | 2 +-
src/cl_driver.h | 19 +++++++++++++-
src/cl_driver_defs.c | 1 +
src/intel/intel_gpgpu.c | 61 ++++++++++++++++++++++++++++-----------------
6 files changed, 62 insertions(+), 29 deletions(-)
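A note for reviewers on the new cache-control layout, with a small standalone
sketch (not part of the patch): on gen7 the surface-state cache-control field
is a single 2-bit value, while on gen75 it splits into an L3 control bit
(bit 0) and an LLC/eLLC control field (bits 2:1). The enums below mirror the
ones this patch adds to src/cl_driver.h, so the HSW default becomes
llccc_ec | l3cc_ec (0x5) where gen7 used cc_llc_l3 (0x3):

#include <stdio.h>

/* Values mirror the enums added to src/cl_driver.h by this patch. */
enum gen7_cc     { cc_gtt = 0x0, cc_l3 = 0x1, cc_llc = 0x2, cc_llc_l3 = 0x3 };
enum gen75_l3cc  { l3cc_uc = 0x0, l3cc_ec = 0x1 };                  /* bit 0 */
enum gen75_llccc { llccc_pte = 0x0 << 1, llccc_uc = 0x1 << 1,
                   llccc_ec  = 0x2 << 1, llccc_ucllc = 0x3 << 1 };  /* bits 2:1 */

int main(void)
{
  printf("gen7  default cache control: 0x%x\n", cc_llc_l3);          /* 0x3 */
  printf("gen75 default cache control: 0x%x\n", llccc_ec | l3cc_ec); /* 0x5 */
  return 0;
}

This is why every hard-coded cc_llc_l3 in the diff below is replaced by the
new per-generation cl_gpgpu_get_cache_ctrl() hook.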
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 6a699c0..d4da269 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -157,9 +157,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cl_gpgpu_get_cache_ctrl());
} else {
- cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
+ cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cl_gpgpu_get_cache_ctrl());
}
}
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 891d6f1..f1eb7aa 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -30,7 +30,7 @@
#include <stdio.h>
#include <string.h>
-static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+32; }
+static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+128; }
/* "Varing" payload is the part of the curbe that changes accross threads in the
* same work group. Right now, it consists in local IDs and block IPs
@@ -243,7 +243,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
assert(offset >= 0);
stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
stack_sz *= device->max_compute_unit;
- cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
+ cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl());
}
LOCAL cl_int
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 018da95..538c88a 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -86,7 +86,7 @@ static struct _cl_device_id intel_hsw_gt2_device = {
.max_compute_unit = 140,
.max_thread_per_unit = 7,
.max_work_item_sizes = {512, 512, 512},
- .max_work_group_size = 512,
+ .max_work_group_size = 1024,
.max_clock_frequency = 1000,
.wg_sz = 1024,
#include "cl_gen75_device.h"
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 9dc2330..ba2ab0d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -59,7 +59,7 @@ typedef enum cl_gpgpu_tiling {
GPGPU_TILE_Y = 2,
} cl_gpgpu_tiling;
-/* Cache control options */
+/* Cache control options for gen7 */
typedef enum cl_cache_control {
cc_gtt = 0x0,
cc_l3 = 0x1,
@@ -67,6 +67,20 @@ typedef enum cl_cache_control {
cc_llc_l3 = 0x3
} cl_cache_control;
+/* L3 Cache control options for gen75 */
+typedef enum cl_l3_cache_control {
+ l3cc_uc = 0x0,
+ l3cc_ec = 0x1
+} cl_l3_cache_control;
+
+/* LLCCC Cache control options for gen75 */
+typedef enum cl_llccc_cache_control {
+ llccc_pte = 0x0<<1,
+ llccc_uc = 0x1<<1,
+ llccc_ec = 0x2<<1,
+ llccc_ucllc = 0x3<<1
+} cl_llccc_cache_control;
+
typedef enum gpu_command_status {
command_queued = 3,
command_submitted = 2,
@@ -106,6 +120,9 @@ extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
+/* get the default cache control value. */
+typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
+extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
/* Set a 2d texture */
typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
uint32_t id,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 95a1a03..ab3af49 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -64,6 +64,7 @@ LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index fba480c..2cde179 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -121,8 +121,8 @@ typedef struct intel_gpgpu intel_gpgpu_t;
typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
-typedef uint32_t (get_scratch_index_t)(uint32_t size);
-get_scratch_index_t *get_scratch_index = NULL;
+typedef uint32_t (intel_gpgpu_get_scratch_index_t)(uint32_t size);
+intel_gpgpu_get_scratch_index_t *intel_gpgpu_get_scratch_index = NULL;
static void
intel_gpgpu_sync(void *buf)
@@ -194,10 +194,22 @@ intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
ADVANCE_BATCH(gpgpu->batch);
}
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen7()
+{
+ return cc_llc_l3;
+}
+
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen75()
+{
+ return llccc_ec | l3cc_ec;
+}
+
static void
intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
{
- const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */
+ const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
BEGIN_BATCH(gpgpu->batch, 10);
OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
/* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
@@ -233,12 +245,12 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
ADVANCE_BATCH(gpgpu->batch);
}
-uint32_t get_scratch_index_gen7(uint32_t size) {
+uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
return size / 1024 - 1;
}
-uint32_t get_scratch_index_gen75(uint32_t size) {
- size = size >> 12;
+uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
+ size = size >> 11;
uint32_t index = 0;
while((size >>= 1) > 0)
index++; //get leading one
@@ -256,7 +268,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
if(gpgpu->per_thread_scratch > 0) {
- scratch_index = get_scratch_index(gpgpu->per_thread_scratch);
+ scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch);
OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
@@ -356,11 +368,9 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
static void
intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
{
- /* still set L3 in batch buffer for fulsim. */
BEGIN_BATCH(gpgpu->batch, 9);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
-
OUT_BATCH(gpgpu->batch, 0x00730000);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
@@ -377,7 +387,7 @@ intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
- ADVANCE_BATCH(gpgpu->batch);
+ ADVANCE_BATCH(gpgpu->batch);
intel_gpgpu_pipe_control(gpgpu);
}
@@ -411,25 +421,29 @@ static void
intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
{
/* still set L3 in batch buffer for fulsim. */
- BEGIN_BATCH(gpgpu->batch, 6);
+ BEGIN_BATCH(gpgpu->batch, 9);
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, 0x00610000);
+
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
if (use_slm)
- OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[8]);
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
if (use_slm)
- OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[8]);
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
else
OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
ADVANCE_BATCH(gpgpu->batch);
- //To set L3 in HSW, enable the flag I915_EXEC_ENABLE_SLM flag when exec
- if(use_slm)
- gpgpu->batch->enable_slm = 1;
+ //if(use_slm)
+ // gpgpu->batch->enable_slm = 1;
intel_gpgpu_pipe_control(gpgpu);
}
@@ -614,7 +628,7 @@ intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */
ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
- ss2->ss5.cache_control = cc_llc_l3;
+ ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
if(gpgpu->constant_b.bo)
@@ -652,7 +666,7 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
ss1->ss2.height = ss0->ss2.height = 16383; /* bits 20:7 of sz */
ss0->ss3.depth = 1023; /* bits 30:21 of sz */
ss1->ss3.depth = 1023; /* bits 30:21 of sz */
- ss1->ss5.cache_control = ss0->ss5.cache_control = cc_llc_l3;
+ ss1->ss5.cache_control = ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
heap->binding_table[0] = offsetof(surface_heap_t, surface);
heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
}
@@ -702,7 +716,7 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
ss->ss4.not_str_buf.rt_view_extent = depth - 1;
ss->ss4.not_str_buf.min_array_element = 0;
ss->ss3.pitch = pitch - 1;
- ss->ss5.cache_control = cc_llc_l3;
+ ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
if (tiling == GPGPU_TILE_X) {
ss->ss0.tiled_surface = 1;
ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
@@ -743,7 +757,7 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
ss->ss4.not_str_buf.rt_view_extent = depth - 1;
ss->ss4.not_str_buf.min_array_element = 0;
ss->ss3.pitch = pitch - 1;
- ss->ss5.cache_control = cc_llc_l3;
+ ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
ss->ss7.shader_r = I965_SURCHAN_SELECT_RED;
ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
@@ -1200,7 +1214,8 @@ intel_set_gpgpu_callbacks(int device_id)
if (IS_HASWELL(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
- get_scratch_index = get_scratch_index_gen75;
+ cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75;
+ intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75;
}
else if (IS_IVYBRIDGE(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
@@ -1208,9 +1223,9 @@ intel_set_gpgpu_callbacks(int device_id)
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
else
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7;
- get_scratch_index = get_scratch_index_gen7;
+ cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7;
+ intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7;
}
else
assert(0);
}
-
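For reference, a standalone sketch (not part of the patch) of the gen75
scratch-size encoding fixed in intel_gpgpu_get_scratch_index_gen75 above.
Assuming, as the code implies, that the field holds log2(size / 2KB) for
power-of-two sizes starting at 2KB, the old ">> 12" was off by one (both
2KB and 4KB mapped to index 0):

#include <assert.h>
#include <stdint.h>

/* Same logic as intel_gpgpu_get_scratch_index_gen75 in the patch. */
static uint32_t scratch_index_gen75(uint32_t size)
{
  uint32_t index = 0;
  size >>= 11;              /* express the size in 2KB units */
  while ((size >>= 1) > 0)  /* position of the leading one: log2 */
    index++;
  return index;
}

int main(void)
{
  assert(scratch_index_gen75(2 * 1024) == 0);
  assert(scratch_index_gen75(4 * 1024) == 1);
  assert(scratch_index_gen75(8 * 1024) == 2);
  return 0;
}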
--
1.8.3.2