<div dir="ltr"><br><div class="gmail_extra"><br><div class="gmail_quote">On Wed, Jun 1, 2016 at 3:04 PM, Jordan Justen <span dir="ltr"><<a href="mailto:jordan.l.justen@intel.com" target="_blank">jordan.l.justen@intel.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">The cross thread constant support appears on Haswell. It allows us to<br>
upload a set of uniform data for all threads without duplicating it<br>
per thread.<br>
<br>
We also support per-thread data which allows us to store a per-thread<br>
ID in one of the uniforms that can be used to calculate the<br>
gl_LocalInvocationIndex and gl_LocalInvocationID variables.<br>
<br>
v4:<br>
* Support the old local ID push constant layout as well (Jason)<br>
<br>
Signed-off-by: Jordan Justen <<a href="mailto:jordan.l.justen@intel.com">jordan.l.justen@intel.com</a>><br>
---<br>
src/mesa/drivers/dri/i965/brw_defines.h | 3 +<br>
src/mesa/drivers/dri/i965/gen7_cs_state.c | 99 +++++++++++++++++--------------<br>
2 files changed, 56 insertions(+), 46 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h<br>
index 4eb6b1f..e7d1a9f 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_defines.h<br>
+++ b/src/mesa/drivers/dri/i965/brw_defines.h<br>
@@ -2943,6 +2943,9 @@ enum brw_wm_barycentric_interp_mode {<br>
# define MEDIA_GPGPU_THREAD_COUNT_MASK INTEL_MASK(7, 0)<br>
# define GEN8_MEDIA_GPGPU_THREAD_COUNT_SHIFT 0<br>
# define GEN8_MEDIA_GPGPU_THREAD_COUNT_MASK INTEL_MASK(9, 0)<br>
+/* GEN7 DW6, GEN8+ DW7 */<br>
+# define CROSS_THREAD_READ_LENGTH_SHIFT 0<br>
+# define CROSS_THREAD_READ_LENGTH_MASK INTEL_MASK(7, 0)<br>
#define MEDIA_STATE_FLUSH 0x7004<br>
#define GPGPU_WALKER 0x7105<br>
/* GEN7 DW0 */<br>
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c<br>
index 619edfb..2fee02d 100644<br>
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c<br>
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c<br>
@@ -42,7 +42,6 @@ brw_upload_cs_state(struct brw_context *brw)<br>
uint32_t offset;<br>
uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,<br>
8 * 4, 64, &offset);<br>
- struct gl_program *prog = (struct gl_program *) brw->compute_program;<br>
struct brw_stage_state *stage_state = &brw->cs.base;<br>
struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;<br>
struct brw_stage_prog_data *prog_data = &cs_prog_data->base;<br>
@@ -59,16 +58,6 @@ brw_upload_cs_state(struct brw_context *brw)<br>
prog_data->binding_table.size_bytes,<br>
32, &stage_state->bind_bo_offset);<br>
<br>
- unsigned local_id_dwords = 0;<br>
-<br>
- if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)<br>
- local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;<br>
-<br>
- unsigned push_constant_data_size =<br>
- (prog_data->nr_params + local_id_dwords) * sizeof(gl_constant_value);<br>
- unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);<br>
- unsigned push_constant_regs = reg_aligned_constant_size / 32;<br>
-<br>
uint32_t dwords = brw->gen < 8 ? 8 : 9;<br>
BEGIN_BATCH(dwords);<br>
OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));<br>
@@ -118,7 +107,8 @@ brw_upload_cs_state(struct brw_context *brw)<br>
* Note: The constant data is built in brw_upload_cs_push_constants below.<br>
*/<br>
const uint32_t vfe_curbe_allocation =<br>
- push_constant_regs * cs_prog_data->threads;<br>
+ ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +<br>
+ cs_prog_data->push.cross_thread.regs, 2);<br>
OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) |<br>
SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC));<br>
OUT_BATCH(0);<br>
@@ -126,11 +116,11 @@ brw_upload_cs_state(struct brw_context *brw)<br>
OUT_BATCH(0);<br>
ADVANCE_BATCH();<br>
<br>
- if (reg_aligned_constant_size > 0) {<br>
+ if (cs_prog_data->push.total.size > 0) {<br>
BEGIN_BATCH(4);<br>
OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));<br>
OUT_BATCH(0);<br>
- OUT_BATCH(ALIGN(reg_aligned_constant_size * cs_prog_data->threads, 64));<br>
+ OUT_BATCH(ALIGN(cs_prog_data->push.total.size, 64));<br>
OUT_BATCH(stage_state->push_const_offset);<br>
ADVANCE_BATCH();<br>
}<br>
@@ -149,7 +139,8 @@ brw_upload_cs_state(struct brw_context *brw)<br>
desc[dw++] = stage_state->sampler_offset |<br>
((stage_state->sampler_count + 3) / 4);<br>
desc[dw++] = stage_state->bind_bo_offset;<br>
- desc[dw++] = SET_FIELD(push_constant_regs, MEDIA_CURBE_READ_LENGTH);<br>
+ desc[dw++] = SET_FIELD(cs_prog_data->push.per_thread.regs,<br>
+ MEDIA_CURBE_READ_LENGTH);<br>
const uint32_t media_threads =<br>
brw->gen >= 8 ?<br>
SET_FIELD(cs_prog_data->threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :<br>
@@ -171,6 +162,10 @@ brw_upload_cs_state(struct brw_context *brw)<br>
SET_FIELD(slm_size, MEDIA_SHARED_LOCAL_MEMORY_SIZE) |<br>
media_threads;<br>
<br>
+ desc[dw++] =<br>
+ SET_FIELD(cs_prog_data->push.cross_thread.regs,<br>
+ CROSS_THREAD_READ_LENGTH);<br></blockquote><div><br></div><div>I don't think this needs 3 lines.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+<br>
BEGIN_BATCH(4);<br>
OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));<br>
OUT_BATCH(0);<br>
@@ -213,10 +208,6 @@ brw_upload_cs_push_constants(struct brw_context *brw,<br>
struct gl_context *ctx = &brw->ctx;<br>
const struct brw_stage_prog_data *prog_data =<br>
(struct brw_stage_prog_data*) cs_prog_data;<br>
- unsigned local_id_dwords = 0;<br>
-<br>
- if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)<br>
- local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;<br>
<br>
/* Updates the ParamaterValues[i] pointers for all parameters of the<br>
* basic type of PROGRAM_STATE_VAR.<br>
@@ -224,41 +215,57 @@ brw_upload_cs_push_constants(struct brw_context *brw,<br>
/* XXX: Should this happen somewhere before to get our state flag set? */<br>
_mesa_load_state_parameters(ctx, prog->Parameters);<br>
<br>
- if (prog_data->nr_params == 0 && local_id_dwords == 0) {<br>
+ if (cs_prog_data->push.total.size == 0) {<br>
stage_state->push_const_size = 0;<br>
- } else {<br>
- gl_constant_value *param;<br>
- unsigned i, t;<br>
-<br>
- const unsigned push_constant_data_size =<br>
- (local_id_dwords + prog_data->nr_params) * sizeof(gl_constant_value);<br>
- const unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32);<br>
- const unsigned param_aligned_count =<br>
- reg_aligned_constant_size / sizeof(*param);<br>
+ return;<br>
+ }<br>
<br>
- param = (gl_constant_value*)<br>
- brw_state_batch(brw, type,<br>
- ALIGN(reg_aligned_constant_size *<br>
- cs_prog_data->threads, 64),<br>
- 64, &stage_state->push_const_offset);<br>
- assert(param);<br>
<br>
- STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));<br>
+ gl_constant_value *param;<br>
+ param = (gl_constant_value*)<br></blockquote><div><br></div><div>These could go on the same line.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ brw_state_batch(brw, type,<br>
+ ALIGN(cs_prog_data->push.total.size, 64),<br>
+ 64, &stage_state->push_const_offset);<br>
+ assert(param);<br>
+<br>
+ STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));<br>
+<br>
+ if (cs_prog_data->push.cross_thread.size > 0) {<br>
+ gl_constant_value *param_copy = param;<br>
+ assert(cs_prog_data->thread_local_id_index < 0 ||<br>
+ cs_prog_data->thread_local_id_index >=<br>
+ cs_prog_data->push.cross_thread.dwords);<br>
+ for (unsigned i = 0;<br>
+ i < cs_prog_data->push.cross_thread.dwords;<br>
+ i++) {<br>
+ param_copy[i] = *prog_data->param[i];<br>
+ }<br>
+ }<br>
<br>
+ gl_constant_value thread_id;<br>
+ if (cs_prog_data->push.per_thread.size > 0) {<br>
brw_cs_fill_local_id_payload(cs_prog_data, param, cs_prog_data->threads,<br>
- reg_aligned_constant_size);<br>
-<br>
- /* _NEW_PROGRAM_CONSTANTS */<br>
- for (t = 0; t < cs_prog_data->threads; t++) {<br>
- gl_constant_value *next_param =<br>
- &amp;param[t * param_aligned_count + local_id_dwords];<br>
- for (i = 0; i < prog_data->nr_params; i++) {<br>
- next_param[i] = *prog_data->param[i];<br>
+ cs_prog_data->push.per_thread.size);<br>
+ for (unsigned t = 0; t < cs_prog_data->threads; t++) {<br>
+ unsigned dst =<br>
+ 8 * (cs_prog_data->push.per_thread.regs * t +<br>
+ cs_prog_data->push.cross_thread.regs +<br>
+ cs_prog_data->local_invocation_id_regs);<br>
+ unsigned src = cs_prog_data->push.cross_thread.dwords;<br>
+ for ( ; src < prog_data->nr_params; src++, dst++) {<br>
+ if (src != cs_prog_data->thread_local_id_index)<br>
+ param[dst] = *prog_data->param[src];<br>
+ else {<br>
+ thread_id.u = t * cs_prog_data->simd_size;<br>
+ param[dst] = thread_id;<br>
+ }<br>
}<br>
}<br>
-<br>
- stage_state->push_const_size = ALIGN(prog_data->nr_params, 8) / 8;<br>
}<br>
+<br>
+ stage_state->push_const_size =<br>
+ cs_prog_data->push.cross_thread.regs +<br>
+ cs_prog_data->push.per_thread.regs;<br>
}<br>
<span class="HOEnZb"><font color="#888888"><br>
<br>
--<br>
2.8.1<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div></div>