[Mesa-dev] [PATCH 9/9] i965/fs: Add support for uniform array access with a variable index.
Eric Anholt
eric at anholt.net
Mon Nov 12 10:53:14 PST 2012
Serious Sam 3 had a shader hitting this path, but it's used rarely so it
didn't show a significant performance difference (n=7). It does reduce
compile time massively, though -- one shader goes from 14s compile time
and 11723 instructions generated to .44s and 499 instructions.
Note that some shaders lose 16-wide mode because we don't support
16-wide and pull constants at the moment (generally, things looping over
a few-element array where the loop isn't getting unrolled). Given that
those shaders are being generated with 15-20% fewer instructions, it
probably outweighs the increased cost of doing pull constants (which
have a latency of about 4 chained instructions, less than what those
conditional moves create) and doing 8-wide only.
---
src/mesa/drivers/dri/i965/brw_fs.cpp | 162 +++++++++++++++++++++++---
src/mesa/drivers/dri/i965/brw_fs.h | 8 ++
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 68 +++++++++--
src/mesa/drivers/dri/i965/brw_shader.cpp | 2 +-
4 files changed, 216 insertions(+), 24 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c0bad60..98e6407 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -219,6 +219,45 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
return inst;
}
+exec_list
+fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
+ fs_reg offset)
+{
+ exec_list instructions;
+ fs_inst *inst;
+
+ if (intel->gen >= 7) {
+ inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
+ dst, surf_index, offset);
+ instructions.push_tail(inst);
+ } else {
+ int base_mrf = 13;
+ bool header_present = true;
+
+ fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
+ mrf.type = BRW_REGISTER_TYPE_D;
+
+ /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
+ * dword-aligned byte offset.
+ */
+ if (intel->gen == 6) {
+ instructions.push_tail(MOV(mrf, offset));
+ } else {
+ instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
+ }
+ inst = MOV(mrf, offset);
+ inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
+ dst, surf_index);
+ inst->header_present = header_present;
+ inst->base_mrf = base_mrf;
+ inst->mlen = header_present + c->dispatch_width / 8;
+
+ instructions.push_tail(inst);
+ }
+
+ return instructions;
+}
+
bool
fs_inst::equals(fs_inst *inst)
{
@@ -365,6 +404,7 @@ fs_reg::equals(const fs_reg &r) const
type == r.type &&
negate == r.negate &&
abs == r.abs &&
+ !reladdr && !r.reladdr &&
memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
sizeof(fixed_hw_reg)) == 0 &&
smear == r.smear &&
@@ -1342,6 +1382,81 @@ fs_visitor::remove_dead_constants()
return true;
}
+/*
+ * Implements array access of uniforms by inserting a
+ * PULL_CONSTANT_LOAD instruction.
+ *
+ * Unlike temporary GRF array access (where we don't support it due to
+ * the difficulty of doing relative addressing on instruction
+ * destinations), we could potentially do array access of uniforms
+ * that were loaded in GRF space as push constants. In real-world
+ * usage we've seen, though, the arrays being used are always larger
+ * than we could load as push constants, so just always move all
+ * uniform array access out to a pull constant buffer.
+ */
+void
+fs_visitor::move_uniform_array_access_to_pull_constants()
+{
+ int pull_constant_loc[c->prog_data.nr_params];
+
+ for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
+ pull_constant_loc[i] = -1;
+ }
+
+ /* Walk through and find array access of uniforms. Put a copy of that
+ * uniform in the pull constant buffer.
+ *
+ * Note that we don't move constant-indexed accesses to arrays. No
+ * testing has been done of the performance impact of this choice.
+ */
+ foreach_list_safe(node, &this->instructions) {
+ fs_inst *inst = (fs_inst *)node;
+
+ for (int i = 0 ; i < 3; i++) {
+ if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
+ continue;
+
+ int uniform = inst->src[i].reg;
+
+ /* If this array isn't already present in the pull constant buffer,
+ * add it.
+ */
+ if (pull_constant_loc[uniform] == -1) {
+ const float **values = &c->prog_data.param[uniform];
+
+ pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
+
+ assert(param_size[uniform]);
+
+ for (int j = 0; j < param_size[uniform]; j++) {
+ c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
+ values[j];
+ }
+ }
+
+ /* Set up the annotation tracking for new generated instructions. */
+ base_ir = inst->ir;
+ current_annotation = inst->annotation;
+
+ fs_reg offset = fs_reg(this, glsl_type::int_type);
+ inst->insert_before(ADD(offset, *inst->src[i].reladdr,
+ fs_reg(pull_constant_loc[uniform] +
+ inst->src[i].reg_offset)));
+
+ fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
+ fs_reg temp = fs_reg(this, glsl_type::float_type);
+ exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
+ surf_index, offset);
+ inst->insert_before(&list);
+
+ inst->src[i].file = temp.file;
+ inst->src[i].reg = temp.reg;
+ inst->src[i].reg_offset = temp.reg_offset;
+ inst->src[i].reladdr = NULL;
+ }
+ }
+}
+
/**
* Choose accesses from the UNIFORM file to demote to using the pull
* constant buffer.
@@ -1368,8 +1483,31 @@ fs_visitor::setup_pull_constants()
/* Just demote the end of the list. We could probably do better
* here, demoting things that are rarely used in the program first.
*/
- int pull_uniform_base = max_uniform_components;
- int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
+ unsigned int pull_uniform_base = max_uniform_components;
+
+ int pull_constant_loc[c->prog_data.nr_params];
+ for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
+ if (i < pull_uniform_base) {
+ pull_constant_loc[i] = -1;
+ } else {
+ pull_constant_loc[i] = -1;
+ /* If our constant is already being uploaded for reladdr purposes,
+ * reuse it.
+ */
+ for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
+ if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
+ pull_constant_loc[i] = j;
+ break;
+ }
+ }
+ if (pull_constant_loc[i] == -1) {
+ int pull_index = c->prog_data.nr_pull_params++;
+ c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
+ pull_constant_loc[i] = pull_index;;
+ }
+ }
+ }
+ c->prog_data.nr_params = pull_uniform_base;
foreach_list(node, &this->instructions) {
fs_inst *inst = (fs_inst *)node;
@@ -1378,14 +1516,16 @@ fs_visitor::setup_pull_constants()
if (inst->src[i].file != UNIFORM)
continue;
- int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
- if (uniform_nr < pull_uniform_base)
+ int pull_index = pull_constant_loc[inst->src[i].reg +
+ inst->src[i].reg_offset];
+ if (pull_index == -1)
continue;
+ assert(!inst->src[i].reladdr);
+
fs_reg dst = fs_reg(this, glsl_type::float_type);
fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
- fs_reg offset = fs_reg((unsigned)(((uniform_nr -
- pull_uniform_base) * 4) & ~15));
+ fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
fs_inst *pull =
new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
dst, index, offset);
@@ -1399,15 +1539,9 @@ fs_visitor::setup_pull_constants()
inst->src[i].file = GRF;
inst->src[i].reg = dst.reg;
inst->src[i].reg_offset = 0;
- inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
+ inst->src[i].smear = pull_index & 3;
}
}
-
- for (int i = 0; i < pull_uniform_count; i++) {
- c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
- }
- c->prog_data.nr_params -= pull_uniform_count;
- c->prog_data.nr_pull_params = pull_uniform_count;
}
bool
@@ -1959,6 +2093,7 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
end->predicate ||
end->force_uncompressed ||
end->force_sechalf ||
+ reg.reladdr ||
!reg.equals(end->dst)) {
return NULL;
} else {
@@ -2022,6 +2157,7 @@ fs_visitor::run()
split_virtual_grfs();
setup_paramvalues_refs();
+ move_uniform_array_access_to_pull_constants();
setup_pull_constants();
bool progress;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 3ebbf93..0968ebd 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -119,6 +119,8 @@ public:
uint32_t u;
float f;
} imm;
+
+ fs_reg *reladdr;
};
static const fs_reg reg_undef;
@@ -217,6 +219,7 @@ public:
fs_inst *emit(fs_inst inst);
fs_inst *emit(fs_inst *inst);
+ void emit(exec_list list);
fs_inst *emit(enum opcode opcode);
fs_inst *emit(enum opcode opcode, fs_reg dst);
@@ -251,6 +254,9 @@ public:
fs_inst *end,
fs_reg reg);
+ exec_list VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
+ fs_reg offset);
+
bool run();
void setup_paramvalues_refs();
void assign_curb_setup();
@@ -266,6 +272,7 @@ public:
void spill_reg(int spill_reg);
void split_virtual_grfs();
void compact_virtual_grfs();
+ void move_uniform_array_access_to_pull_constants();
void setup_pull_constants();
void calculate_live_intervals();
bool opt_algebraic();
@@ -406,6 +413,7 @@ public:
*/
int param_index[MAX_UNIFORMS * 4];
int param_offset[MAX_UNIFORMS * 4];
+ int param_size[MAX_UNIFORMS * 4];
int *virtual_grf_sizes;
int virtual_grf_count;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d37da47..ca1e3ac 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -117,6 +117,7 @@ fs_visitor::visit(ir_variable *ir)
return;
}
+ param_size[param_index] = type_size(ir->type);
if (!strncmp(ir->name, "gl_", 3)) {
setup_builtin_uniform_values(ir);
} else {
@@ -160,21 +161,41 @@ fs_visitor::visit(ir_dereference_record *ir)
void
fs_visitor::visit(ir_dereference_array *ir)
{
- ir_constant *index;
- int element_size;
+ ir_constant *constant_index;
+ fs_reg src;
+ int element_size = type_size(ir->type);
- ir->array->accept(this);
- index = ir->array_index->as_constant();
+ constant_index = ir->array_index->as_constant();
- element_size = type_size(ir->type);
- this->result.type = brw_type_for_base_type(ir->type);
+ ir->array->accept(this);
+ src = this->result;
+ src.type = brw_type_for_base_type(ir->type);
- if (index) {
- assert(this->result.file == UNIFORM || this->result.file == GRF);
- this->result.reg_offset += index->value.i[0] * element_size;
+ if (constant_index) {
+ assert(src.file == UNIFORM || src.file == GRF);
+ src.reg_offset += constant_index->value.i[0] * element_size;
} else {
- assert(!"FINISHME: non-constant array element");
+ /* Variable index array dereference. We attach the variable index
+ * component to the reg as a pointer to a register containing the
+ * offset. Currently only uniform arrays are supported in this patch,
+ * and that reladdr pointer is resolved by
+ * move_uniform_array_access_to_pull_constants(). All other array types
+ * are lowered by lower_variable_index_to_cond_assign().
+ */
+ ir->array_index->accept(this);
+
+ fs_reg index_reg;
+ index_reg = fs_reg(this, glsl_type::int_type);
+ emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
+
+ if (src.reladdr) {
+ emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
+ }
+
+ src.reladdr = ralloc(mem_ctx, fs_reg);
+ memcpy(src.reladdr, &index_reg, sizeof(index_reg));
}
+ this->result = src;
}
void
@@ -599,6 +620,21 @@ fs_visitor::visit(ir_expression *ir)
*/
assert(packed_consts.smear < 8);
}
+ } else {
+ /* Turn the byte offset into a dword offset. */
+ fs_reg base_offset = fs_reg(this, glsl_type::int_type);
+ emit(SHR(base_offset, op[1], fs_reg(2)));
+
+ for (int i = 0; i < ir->type->vector_elements; i++) {
+ fs_reg offset = fs_reg(this, glsl_type::int_type);
+ emit(ADD(offset, base_offset, fs_reg(i)));
+ emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index, offset));
+
+ if (ir->type->base_type == GLSL_TYPE_BOOL)
+ emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
+
+ result.reg_offset++;
+ }
}
result.reg_offset = 0;
@@ -1814,6 +1850,16 @@ fs_visitor::emit(fs_inst *inst)
return inst;
}
+void
+fs_visitor::emit(exec_list list)
+{
+ foreach_list_safe(node, &list) {
+ fs_inst *inst = (fs_inst *)node;
+ inst->remove();
+ emit(inst);
+ }
+}
+
/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
@@ -2236,6 +2282,8 @@ fs_visitor::fs_visitor(struct brw_wm_compile *c, struct gl_shader_program *prog,
this->force_uncompressed_stack = 0;
this->force_sechalf_stack = 0;
+
+ memset(&this->param_size, 0, sizeof(this->param_size));
}
fs_visitor::~fs_visitor()
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index f8482e1..0672bc1 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -141,7 +141,7 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
bool input = true;
bool output = stage == MESA_SHADER_FRAGMENT;
bool temp = stage == MESA_SHADER_FRAGMENT;
- bool uniform = stage == MESA_SHADER_FRAGMENT;
+ bool uniform = false;
bool lowered_variable_indexing =
lower_variable_index_to_cond_assign(shader->ir,
--
1.7.10.4
More information about the mesa-dev
mailing list