[Mesa-dev] [PATCH] i965/fs: In the pre-regalloc schedule, try harder at reducing reg pressure.
Chia-I Wu
olvaffe at gmail.com
Fri Oct 18 10:25:44 CEST 2013
On Tue, Oct 15, 2013 at 7:14 AM, Eric Anholt <eric at anholt.net> wrote:
> Previously, the best thing we had was to schedule the things unblocked by
> the current instruction, on the hope that it would be consuming two values
> at the end of their live intervals while only producing one new value.
> Sometimes that wasn't the case.
>
> Now, when an instruction is the first user of a GRF we schedule (i.e. it
> will probably be the virtual_grf_def[] instruction after computing live
> intervals again), penalize it by how many regs it would take up. When an
> instruction is the last user of a GRF we have to schedule (when it will
> probably be the virtual_grf_end[] instruction), give it a boost by how
> many regs it would free.
texture2D() takes up 4 regs and at best frees 2 regs. It will always
be scheduled last. When there are more than ~60 texture2D() calls (which
could happen because of loop unrolling), the message payloads could
take up all available registers.
I wonder if it would help to take into consideration how long an
instruction has been in the available queue. After a couple of
texture2D()s are scheduled, the instructions that use their results may
become available and free those registers up.
>
> The new functions are made virtual (only 1 of 2 really needs to be
> virtual) because I expect we'll soon lift the pre-regalloc scheduling
> heuristic over to the vec4 backend.
>
> shader-db:
> total instructions in shared programs: 1512756 -> 1511604 (-0.08%)
> instructions in affected programs: 10292 -> 9140 (-11.19%)
> GAINED: 121
> LOST: 38
>
> Improves tropics performance at my current settings by 4.50602% +/-
> 2.60694% (n=5). No difference on Lightsmark (n=5). No difference on
> GLB2.7 (n=11).
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70445
> ---
> .../drivers/dri/i965/brw_schedule_instructions.cpp | 125 ++++++++++++++++++---
> 1 file changed, 111 insertions(+), 14 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> index b24c38c..7cb0265 100644
> --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> @@ -353,6 +353,13 @@ public:
> this->instructions_to_schedule = 0;
> this->post_reg_alloc = post_reg_alloc;
> this->time = 0;
> + if (!post_reg_alloc) {
> + this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
> + this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
> + } else {
> + this->remaining_grf_uses = NULL;
> + this->grf_active = NULL;
> + }
> }
>
> ~instruction_scheduler()
> @@ -377,6 +384,9 @@ public:
> */
> virtual int issue_time(backend_instruction *inst) = 0;
>
> + virtual void mod_remaining_grf_uses(backend_instruction *inst, int mod) = 0;
> + virtual int get_grf_pressure_benefit(backend_instruction *inst) = 0;
> +
> void schedule_instructions(backend_instruction *next_block_header);
>
> void *mem_ctx;
> @@ -387,6 +397,17 @@ public:
> int time;
> exec_list instructions;
> backend_visitor *bv;
> +
> + /** Number of instructions left to schedule that reference each vgrf. */
> + int *remaining_grf_uses;
> +
> + /**
> + * Tracks whether each VGRF has had an instruction scheduled that uses it.
> + *
> + * This is used to estimate whether scheduling a new instruction will
> + * increase register pressure.
> + */
> + bool *grf_active;
> };
>
> class fs_instruction_scheduler : public instruction_scheduler
> @@ -398,6 +419,9 @@ public:
> schedule_node *choose_instruction_to_schedule();
> int issue_time(backend_instruction *inst);
> fs_visitor *v;
> +
> + void mod_remaining_grf_uses(backend_instruction *inst, int mod);
> + int get_grf_pressure_benefit(backend_instruction *inst);
> };
>
> fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
> @@ -408,6 +432,57 @@ fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
> {
> }
>
> +void
> +fs_instruction_scheduler::mod_remaining_grf_uses(backend_instruction *be,
> + int mod)
> +{
> + fs_inst *inst = (fs_inst *)be;
> +
> + if (!remaining_grf_uses)
> + return;
> +
> + if (inst->dst.file == GRF) {
> + remaining_grf_uses[inst->dst.reg] += mod;
> + if (mod < 0 && !grf_active[inst->dst.reg])
> + grf_active[inst->dst.reg] = true;
> + }
> +
> + for (int i = 0; i < 3; i++) {
> + if (inst->src[i].file != GRF)
> + continue;
> +
> + remaining_grf_uses[inst->src[i].reg] += mod;
> + if (mod < 0 && !grf_active[inst->src[i].reg])
> + grf_active[inst->src[i].reg] = true;
> + }
> +}
> +
> +int
> +fs_instruction_scheduler::get_grf_pressure_benefit(backend_instruction *be)
> +{
> + fs_inst *inst = (fs_inst *)be;
> + int benefit = 0;
> +
> + if (inst->dst.file == GRF) {
> + if (remaining_grf_uses[inst->dst.reg] == 1)
> + benefit += v->virtual_grf_sizes[inst->dst.reg];
> + if (!grf_active[inst->dst.reg])
> + benefit -= v->virtual_grf_sizes[inst->dst.reg];
> + }
> +
> + for (int i = 0; i < 3; i++) {
> + if (inst->src[i].file != GRF)
> + continue;
> +
> + if (remaining_grf_uses[inst->src[i].reg] == 1)
> + benefit += v->virtual_grf_sizes[inst->src[i].reg];
> + if (!grf_active[inst->src[i].reg])
> + benefit -= v->virtual_grf_sizes[inst->src[i].reg];
> + }
> +
> + return benefit;
> +}
> +
> class vec4_instruction_scheduler : public instruction_scheduler
> {
> public:
> @@ -416,6 +491,9 @@ public:
> schedule_node *choose_instruction_to_schedule();
> int issue_time(backend_instruction *inst);
> vec4_visitor *v;
> +
> + void mod_remaining_grf_uses(backend_instruction *inst, int mod);
> + int get_grf_pressure_benefit(backend_instruction *inst);
> };
>
> vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
> @@ -426,6 +504,18 @@ vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
> }
>
> void
> +vec4_instruction_scheduler::mod_remaining_grf_uses(backend_instruction *be,
> + int mod)
> +{
> +}
> +
> +int
> +vec4_instruction_scheduler::get_grf_pressure_benefit(backend_instruction *be)
> +{
> + return 0;
> +}
> +
> +void
> instruction_scheduler::add_inst(backend_instruction *inst)
> {
> schedule_node *n = new(mem_ctx) schedule_node(inst, bv->brw);
> @@ -946,23 +1036,18 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
> }
> }
> } else {
> + int chosen_score = -1000000; /* Any instruction is better than nothing */
> +
> /* Before register allocation, we don't care about the latencies of
> * instructions. All we care about is reducing live intervals of
> * variables so that we can avoid register spilling, or get 16-wide
> * shaders which naturally do a better job of hiding instruction
> * latency.
> *
> - * To do so, schedule our instructions in a roughly LIFO/depth-first
> - * order: when new instructions become available as a result of
> - * scheduling something, choose those first so that our result
> - * hopefully is consumed quickly.
> - *
> - * The exception is messages that generate more than one result
> - * register (AKA texturing). In those cases, the LIFO search would
> - * normally tend to choose them quickly (because scheduling the
> - * previous message not only unblocked the children using its result,
> - * but also the MRF setup for the next sampler message, which in turn
> - * unblocks the next sampler message).
> + * If this instruction would be the last use of any GRFs, we bump up its
> + * score since it means it should be reducing register pressure. If
> + * it's the first use of a GRF, reduce its score since it means it
> + * should be increasing register pressure.
> */
> for (schedule_node *node = (schedule_node *)instructions.get_tail();
> node != instructions.get_head()->prev;
> @@ -970,9 +1055,12 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
> schedule_node *n = (schedule_node *)node;
> fs_inst *inst = (fs_inst *)n->inst;
>
> - chosen = n;
> - if (inst->regs_written <= 1)
> - break;
> + int this_score = get_grf_pressure_benefit(inst);
> +
> + if (this_score > chosen_score) {
> + chosen = n;
> + chosen_score = this_score;
> + }
> }
> }
>
> @@ -1036,6 +1124,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
> chosen->remove();
> next_block_header->insert_before(chosen->inst);
> instructions_to_schedule--;
> + mod_remaining_grf_uses(chosen->inst, -1);
>
> /* Update the clock for how soon an instruction could start after the
> * chosen one.
> @@ -1105,6 +1194,14 @@ instruction_scheduler::run(exec_list *all_instructions)
> bv->dump_instructions();
> }
>
> + /* Populate the remaining GRF uses array to improve the pre-regalloc
> + * scheduling.
> + */
> + if (remaining_grf_uses) {
> + foreach_list(node, &instructions)
> + mod_remaining_grf_uses((backend_instruction *)node, 1);
> + }
> +
> while (!next_block_header->is_tail_sentinel()) {
> /* Add things to be scheduled until we get to a new BB. */
> while (!next_block_header->is_tail_sentinel()) {
> --
> 1.8.4.rc3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
--
olv at LunarG.com
More information about the mesa-dev
mailing list