[Mesa-dev] [PATCH v4 12/12] i965: Remove old CS local ID handling
Jordan Justen
jordan.l.justen at intel.com
Wed Jun 1 22:04:19 UTC 2016
The old method pushed the uvec3 gl_LocalInvocationID data for each
channel.
The new method pushes 1 dword of data that is a 'thread local ID'
value. Based on that value, we can generate gl_LocalInvocationIndex
and gl_LocalInvocationID with some calculations.
Signed-off-by: Jordan Justen <jordan.l.justen at intel.com>
---
src/intel/vulkan/anv_cmd_buffer.c | 5 +-
src/mesa/drivers/dri/i965/brw_compiler.h | 8 ---
src/mesa/drivers/dri/i965/brw_fs.cpp | 94 +-------------------------
src/mesa/drivers/dri/i965/brw_fs.h | 1 -
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 7 --
src/mesa/drivers/dri/i965/brw_nir_intrinsics.c | 7 --
src/mesa/drivers/dri/i965/gen7_cs_state.c | 5 +-
7 files changed, 3 insertions(+), 124 deletions(-)
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index edaaa3d..3d37de2 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -1094,13 +1094,10 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
}
if (cs_prog_data->push.per_thread.size > 0) {
- brw_cs_fill_local_id_payload(cs_prog_data, u32_map, cs_prog_data->threads,
- cs_prog_data->push.per_thread.size);
for (unsigned t = 0; t < cs_prog_data->threads; t++) {
unsigned dst =
8 * (cs_prog_data->push.per_thread.regs * t +
- cs_prog_data->push.cross_thread.regs +
- cs_prog_data->local_invocation_id_regs);
+ cs_prog_data->push.cross_thread.regs);
unsigned src = cs_prog_data->push.cross_thread.dwords;
for ( ; src < prog_data->nr_params; src++, dst++) {
if (src != cs_prog_data->thread_local_id_index) {
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index dda6297..6e6d20c 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -439,7 +439,6 @@ struct brw_cs_prog_data {
unsigned threads;
bool uses_barrier;
bool uses_num_work_groups;
- unsigned local_invocation_id_regs;
int thread_local_id_index;
struct {
@@ -831,13 +830,6 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
unsigned *final_assembly_size,
char **error_str);
-/**
- * Fill out local id payload for compute shader according to cs_prog_data.
- */
-void
-brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
- void *buffer, uint32_t threads, uint32_t stride);
-
#ifdef __cplusplus
} /* extern "C" */
#endif
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 3dd795e..55d600a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5573,31 +5573,6 @@ fs_visitor::setup_vs_payload()
payload.num_regs = 2;
}
-/**
- * We are building the local ID push constant data using the simplest possible
- * method. We simply push the local IDs directly as they should appear in the
- * registers for the uvec3 gl_LocalInvocationID variable.
- *
- * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
- * registers worth of push constant space.
- *
- * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
- * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
- * to coordinated.
- *
- * FINISHME: There are a few easy optimizations to consider.
- *
- * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
- * no need for using push constant space for that dimension.
- *
- * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
- * easily use 16-bit words rather than 32-bit dwords in the push constant
- * data.
- *
- * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
- * conveying the data, and thereby reduce push constant usage.
- *
- */
void
fs_visitor::setup_gs_payload()
{
@@ -5641,16 +5616,7 @@ void
fs_visitor::setup_cs_payload()
{
assert(devinfo->gen >= 7);
- brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
-
payload.num_regs = 1;
-
- if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID &&
- prog_data->thread_local_id_index < 0) {
- prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
- payload.local_invocation_id_reg = payload.num_regs;
- payload.num_regs += prog_data->local_invocation_id_regs;
- }
}
void
@@ -6525,25 +6491,6 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
}
fs_reg *
-fs_visitor::emit_cs_local_invocation_id_setup()
-{
- assert(stage == MESA_SHADER_COMPUTE);
-
- fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
-
- struct brw_reg src =
- brw_vec8_grf(payload.local_invocation_id_reg, 0);
- src = retype(src, BRW_REGISTER_TYPE_UD);
- bld.MOV(*reg, src);
- src.nr += dispatch_width / 8;
- bld.MOV(offset(*reg, bld, 1), src);
- src.nr += dispatch_width / 8;
- bld.MOV(offset(*reg, bld, 2), src);
-
- return reg;
-}
-
-fs_reg *
fs_visitor::emit_cs_work_group_id_setup()
{
assert(stage == MESA_SHADER_COMPUTE);
@@ -6589,9 +6536,7 @@ cs_fill_push_const_info(const struct brw_device_info *devinfo,
unsigned cross_thread_dwords, per_thread_dwords;
if (!cross_thread_supported) {
cross_thread_dwords = 0u;
- per_thread_dwords =
- 8 * cs_prog_data->local_invocation_id_regs +
- prog_data->nr_params;
+ per_thread_dwords = prog_data->nr_params;
} else if (fill_thread_id) {
/* Fill all but the last register with cross-thread payload */
cross_thread_dwords = 8 * (cs_prog_data->thread_local_id_index / 8);
@@ -6615,7 +6560,6 @@ cs_fill_push_const_info(const struct brw_device_info *devinfo,
cs_prog_data->push.per_thread.size == 0);
assert(cs_prog_data->push.cross_thread.dwords +
cs_prog_data->push.per_thread.dwords ==
- 8 * cs_prog_data->local_invocation_id_regs +
prog_data->nr_params);
}
@@ -6760,39 +6704,3 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
return g.get_assembly(final_assembly_size);
}
-
-void
-brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
- void *buffer, uint32_t threads, uint32_t stride)
-{
- if (prog_data->local_invocation_id_regs == 0)
- return;
-
- /* 'stride' should be an integer number of registers, that is, a multiple
- * of 32 bytes.
- */
- assert(stride % 32 == 0);
-
- unsigned x = 0, y = 0, z = 0;
- for (unsigned t = 0; t < threads; t++) {
- uint32_t *param = (uint32_t *) buffer + stride * t / 4;
-
- for (unsigned i = 0; i < prog_data->simd_size; i++) {
- param[0 * prog_data->simd_size + i] = x;
- param[1 * prog_data->simd_size + i] = y;
- param[2 * prog_data->simd_size + i] = z;
-
- x++;
- if (x == prog_data->local_size[0]) {
- x = 0;
- y++;
- if (y == prog_data->local_size[1]) {
- y = 0;
- z++;
- if (z == prog_data->local_size[2])
- z = 0;
- }
- }
- }
- }
-}
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 4c1ac9c..4237197 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -267,7 +267,6 @@ public:
unsigned base_offset, const nir_src &offset_src,
unsigned num_components);
void emit_cs_terminate();
- fs_reg *emit_cs_local_invocation_id_setup();
fs_reg *emit_cs_work_group_id_setup();
void emit_barrier();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 81c7204..7fc43b5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -272,13 +272,6 @@ emit_system_values_block(nir_block *block, fs_visitor *v)
*reg = *v->emit_samplemaskin_setup();
break;
- case nir_intrinsic_load_local_invocation_id:
- assert(v->stage == MESA_SHADER_COMPUTE);
- reg = &v->nir_system_values[SYSTEM_VALUE_LOCAL_INVOCATION_ID];
- if (reg->file == BAD_FILE)
- *reg = *v->emit_cs_local_invocation_id_setup();
- break;
-
case nir_intrinsic_load_work_group_id:
assert(v->stage == MESA_SHADER_COMPUTE);
reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
diff --git a/src/mesa/drivers/dri/i965/brw_nir_intrinsics.c b/src/mesa/drivers/dri/i965/brw_nir_intrinsics.c
index 972b117..00155fb 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_intrinsics.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_intrinsics.c
@@ -161,13 +161,6 @@ brw_nir_lower_intrinsics(nir_shader *nir, struct brw_stage_prog_data *prog_data)
state.nir = nir;
state.prog_data = prog_data;
- /* Currently this pass only lowers intrinsics using the uniform specified
- * by thread_local_id_index.
- */
- if (nir->stage == MESA_SHADER_COMPUTE &&
- state.cs_prog_data->thread_local_id_index < 0)
- return false;
-
do {
state.progress = false;
nir_foreach_function(function, nir) {
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index 2fee02d..fe1a617 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -244,13 +244,10 @@ brw_upload_cs_push_constants(struct brw_context *brw,
gl_constant_value thread_id;
if (cs_prog_data->push.per_thread.size > 0) {
- brw_cs_fill_local_id_payload(cs_prog_data, param, cs_prog_data->threads,
- cs_prog_data->push.per_thread.size);
for (unsigned t = 0; t < cs_prog_data->threads; t++) {
unsigned dst =
8 * (cs_prog_data->push.per_thread.regs * t +
- cs_prog_data->push.cross_thread.regs +
- cs_prog_data->local_invocation_id_regs);
+ cs_prog_data->push.cross_thread.regs);
unsigned src = cs_prog_data->push.cross_thread.dwords;
for ( ; src < prog_data->nr_params; src++, dst++) {
if (src != cs_prog_data->thread_local_id_index)
--
2.8.1
More information about the mesa-dev
mailing list