[Mesa-dev] [PATCH 3/3] i965/fs: Try a different pre-scheduling heuristic if the first spills.
Eric Anholt
eric at anholt.net
Wed Nov 6 18:47:31 PST 2013
Since LIFO fails on some shaders in one particular way, and non-LIFO
systematically fails in another way on different kinds of shaders, try
them both, and pick whichever one successfully register allocates first.
Slightly prefer non-LIFO in case we produce extra dependencies in register
allocation, since it should start out with fewer stalls than LIFO.
This is madness, but I haven't come up with another way to get unigine
tropics to not spill while keeping other programs from spilling and
retaining the non-unigine performance wins from texture-grf.
total instructions in shared programs: 1626728 -> 1626288 (-0.03%)
instructions in affected programs: 1015 -> 575 (-43.35%)
GAINED: 50
LOST: 0
---
src/mesa/drivers/dri/i965/brw_fs.cpp | 25 +++++--
src/mesa/drivers/dri/i965/brw_fs.h | 4 +-
src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 10 +--
.../drivers/dri/i965/brw_schedule_instructions.cpp | 85 ++++++++++++----------
src/mesa/drivers/dri/i965/brw_shader.h | 6 ++
5 files changed, 76 insertions(+), 54 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 65fedfb..5d76666 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3283,15 +3283,28 @@ fs_visitor::run()
assign_curb_setup();
assign_urb_setup();
- schedule_instructions(false);
+ schedule_instructions(SCHEDULE_PRE_NON_LIFO);
if (0)
assign_regs_trivial();
else {
- while (!assign_regs()) {
- if (failed)
- break;
- }
+ if (!assign_regs(false)) {
+ /* Try a non-spilling register allocation again with a different
+ * scheduling heuristic.
+ */
+ schedule_instructions(SCHEDULE_PRE_LIFO);
+ if (!assign_regs(false)) {
+ if (dispatch_width == 16) {
+ fail("Failure to register allocate. Reduce number of "
+ "live scalar values to avoid this.");
+ } else {
+ while (!assign_regs(true)) {
+ if (failed)
+ break;
+ }
+ }
+ }
+ }
}
}
assert(force_uncompressed_stack == 0);
@@ -3306,7 +3319,7 @@ fs_visitor::run()
if (failed)
return false;
- schedule_instructions(true);
+ schedule_instructions(SCHEDULE_POST);
if (dispatch_width == 8) {
c->prog_data.reg_blocks = brw_register_blocks(grf_used);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 4f97a67..be86b15 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -291,7 +291,7 @@ public:
void assign_curb_setup();
void calculate_urb_setup();
void assign_urb_setup();
- bool assign_regs();
+ bool assign_regs(bool allow_spilling);
void assign_regs_trivial();
void get_used_mrfs(bool *mrf_used);
void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
@@ -322,7 +322,7 @@ public:
bool remove_dead_constants();
bool remove_duplicate_mrf_writes();
bool virtual_grf_interferes(int a, int b);
- void schedule_instructions(bool post_reg_alloc);
+ void schedule_instructions(instruction_scheduler_mode mode);
void insert_gen4_send_dependency_workarounds();
void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index d9e80d0..8567afd 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -417,7 +417,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
}
bool
-fs_visitor::assign_regs()
+fs_visitor::assign_regs(bool allow_spilling)
{
/* Most of this allocation was written for a reg_width of 1
* (dispatch_width == 8). In extending to 16-wide, the code was
@@ -496,14 +496,10 @@ fs_visitor::assign_regs()
if (reg == -1) {
fail("no register to spill:\n");
dump_instructions();
- } else if (dispatch_width == 16) {
- fail("Failure to register allocate. Reduce number of live scalar "
- "values to avoid this.");
- } else {
- spill_reg(reg);
+ } else if (allow_spilling) {
+ spill_reg(reg);
}
-
ralloc_free(g);
return false;
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 5710380..befea0a 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -391,14 +391,16 @@ schedule_node::set_latency_gen7(bool is_haswell)
class instruction_scheduler {
public:
- instruction_scheduler(backend_visitor *v, int grf_count, bool post_reg_alloc)
+ instruction_scheduler(backend_visitor *v, int grf_count,
+ instruction_scheduler_mode mode)
{
this->bv = v;
this->mem_ctx = ralloc_context(NULL);
this->grf_count = grf_count;
this->instructions.make_empty();
this->instructions_to_schedule = 0;
- this->post_reg_alloc = post_reg_alloc;
+ this->post_reg_alloc = (mode == SCHEDULE_POST);
+ this->mode = mode;
this->time = 0;
if (!post_reg_alloc) {
this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
@@ -447,6 +449,8 @@ public:
exec_list instructions;
backend_visitor *bv;
+ instruction_scheduler_mode mode;
+
/**
* Number of instructions left to schedule that reference each vgrf.
*
@@ -467,7 +471,8 @@ public:
class fs_instruction_scheduler : public instruction_scheduler
{
public:
- fs_instruction_scheduler(fs_visitor *v, int grf_count, bool post_reg_alloc);
+ fs_instruction_scheduler(fs_visitor *v, int grf_count,
+ instruction_scheduler_mode mode);
void calculate_deps();
bool is_compressed(fs_inst *inst);
schedule_node *choose_instruction_to_schedule();
@@ -481,8 +486,8 @@ public:
fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
int grf_count,
- bool post_reg_alloc)
- : instruction_scheduler(v, grf_count, post_reg_alloc),
+ instruction_scheduler_mode mode)
+ : instruction_scheduler(v, grf_count, mode),
v(v)
{
}
@@ -569,7 +574,7 @@ public:
vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
int grf_count)
- : instruction_scheduler(v, grf_count, true),
+ : instruction_scheduler(v, grf_count, SCHEDULE_POST),
v(v)
{
}
@@ -1179,40 +1184,42 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
continue;
}
- /* Prefer instructions that recently became available for scheduling.
- * These are the things that are most likely to (eventually) make a
- * variable dead and reduce register pressure. Typical register
- * pressure estimates don't work for us because most of our pressure
- * comes from texturing, where no single instruction to schedule will
- * make a vec4 value dead.
- */
- if (n->cand_generation > chosen->cand_generation) {
- chosen = n;
- continue;
- } else if (n->cand_generation < chosen->cand_generation) {
- continue;
- }
-
- /* On MRF-using chips, prefer non-SEND instructions. If we don't do
- * this, then because we prefer instructions that just became
- * candidates, we'll end up in a pattern of scheduling a SEND, then
- * the MRFs for the next SEND, then the next SEND, then the MRFs,
- * etc., without ever consuming the results of a send.
- */
- if (v->brw->gen < 7) {
- fs_inst *chosen_inst = (fs_inst *)chosen->inst;
-
- /* We use regs_written > 1 as our test for the kind of send
- * instruction to avoid -- only sends generate many regs, and a
- * single-result send is probably actually reducing register
- * pressure.
+ if (mode == SCHEDULE_PRE_LIFO) {
+ /* Prefer instructions that recently became available for
+ * scheduling. These are the things that are most likely to
+ * (eventually) make a variable dead and reduce register pressure.
+ * Typical register pressure estimates don't work for us because
+ * most of our pressure comes from texturing, where no single
+ * instruction to schedule will make a vec4 value dead.
*/
- if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+ if (n->cand_generation > chosen->cand_generation) {
chosen = n;
continue;
- } else if (inst->regs_written > chosen_inst->regs_written) {
+ } else if (n->cand_generation < chosen->cand_generation) {
continue;
}
+
+ /* On MRF-using chips, prefer non-SEND instructions. If we don't
+ * do this, then because we prefer instructions that just became
+ * candidates, we'll end up in a pattern of scheduling a SEND,
+ * then the MRFs for the next SEND, then the next SEND, then the
+ * MRFs, etc., without ever consuming the results of a send.
+ */
+ if (v->brw->gen < 7) {
+ fs_inst *chosen_inst = (fs_inst *)chosen->inst;
+
+ /* We use regs_written > 1 as our test for the kind of send
+ * instruction to avoid -- only sends generate many regs, and a
+ * single-result send is probably actually reducing register
+ * pressure.
+ */
+ if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+ chosen = n;
+ continue;
+ } else if (inst->regs_written > chosen_inst->regs_written) {
+ continue;
+ }
+ }
}
/* For instructions pushed on the cands list at the same time, prefer
@@ -1407,18 +1414,18 @@ instruction_scheduler::run(exec_list *all_instructions)
}
void
-fs_visitor::schedule_instructions(bool post_reg_alloc)
+fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
{
int grf_count;
- if (post_reg_alloc)
+ if (mode == SCHEDULE_POST)
grf_count = grf_used;
else
grf_count = virtual_grf_count;
- fs_instruction_scheduler sched(this, grf_count, post_reg_alloc);
+ fs_instruction_scheduler sched(this, grf_count, mode);
sched.run(&instructions);
- if (unlikely(INTEL_DEBUG & DEBUG_WM) && post_reg_alloc) {
+ if (unlikely(INTEL_DEBUG & DEBUG_WM) && mode == SCHEDULE_POST) {
printf("fs%d estimated execution time: %d cycles\n",
dispatch_width, sched.time);
}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 88c2311..aba24c5 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -59,6 +59,12 @@ public:
bool predicate_inverse;
};
+enum instruction_scheduler_mode {
+ SCHEDULE_PRE_NON_LIFO,
+ SCHEDULE_PRE_LIFO,
+ SCHEDULE_POST,
+};
+
class backend_visitor : public ir_visitor {
public:
--
1.8.4.rc3
More information about the mesa-dev
mailing list