[Mesa-dev] [PATCH 6/6] i965/sched: use liveness analysis for computing register pressure
Jason Ekstrand
jason at jlekstrand.net
Wed Oct 21 16:01:39 PDT 2015
On Fri, Oct 2, 2015 at 2:37 PM, Connor Abbott <cwabbott0 at gmail.com> wrote:
> Previously, we were using some heuristics to try to detect when a write
> was about to begin a live range, or when a read was about to end a live
> range. We never used the liveness analysis information used by the
> register allocator, though, which meant that the scheduler's and the
> allocator's ideas of when a live range began and ended were different.
> Not only did this make our estimate of the register pressure benefit of
> scheduling an instruction wrong in some cases, but it also prevented us
> from knowing the actual register pressure while scheduling each
> instruction, which we need in order to switch to register pressure
> scheduling only when the pressure is too high.
>
> This commit rewrites the register pressure tracking code to use the same
> model that our register allocator currently uses. We use the results of
> liveness analysis, as well as the calculate_payload_ranges() function that
> we split out in the last commit. This means that we compute live ranges
> twice on each round through the register allocator, although we could
> speed it up by only recomputing the ranges and not the live in/live out
> sets after scheduling, since we only shuffle around instructions within
> a single basic block when we schedule.
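
For anyone skimming the patch, the new bookkeeping condenses to the
following. This is just a restatement of the hunks further down
(setup_liveness(), get_register_pressure_benefit() and
schedule_instructions()), with duplicate sources counted only once and
with the fixed-GRF/payload handling left out (it works the same way via
hw_liveout and hw_reads_remaining):

   /* Each block's starting pressure is seeded from its live-in set. */
   reg_pressure = reg_pressure_in[block->num];

   /* When an instruction is pulled out of the DAG, adjust the pressure
    * by its "benefit": GRFs whose live ranges it ends minus GRFs whose
    * live ranges it starts.
    */
   int benefit = 0;

   /* The first write to a VGRF that isn't live into the block starts a
    * live range, so scheduling it costs registers.
    */
   if (inst->dst.file == GRF &&
       !BITSET_TEST(livein[block->num], inst->dst.reg) &&
       !written[inst->dst.reg])
      benefit -= v->alloc.sizes[inst->dst.reg];

   /* The last unscheduled read of a VGRF that isn't live out of the
    * block ends a live range, so scheduling it frees registers.
    */
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == GRF &&
          !BITSET_TEST(liveout[block->num], inst->src[i].reg) &&
          reads_remaining[inst->src[i].reg] == 1)
         benefit += v->alloc.sizes[inst->src[i].reg];
   }

   reg_pressure -= benefit;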
>
> Shader-db results on bdw:
>
> total instructions in shared programs: 7130187 -> 7129880 (-0.00%)
> instructions in affected programs: 1744 -> 1437 (-17.60%)
> helped: 1
> HURT: 1
>
> total cycles in shared programs: 172535126 -> 172473226 (-0.04%)
> cycles in affected programs: 11338636 -> 11276736 (-0.55%)
> helped: 876
> HURT: 873
>
> LOST: 8
> GAINED: 0
> Signed-off-by: Connor Abbott <cwabbott0 at gmail.com>
> ---
> The results are a wash, but this is needed for a lot of the more
> experimental things I want to do. I can drop this if there are any
> objections.
>
> .../drivers/dri/i965/brw_schedule_instructions.cpp | 300 +++++++++++++++++----
> 1 file changed, 244 insertions(+), 56 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> index 22a493f..6b8792b 100644
> --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> @@ -26,6 +26,7 @@
> */
>
> #include "brw_fs.h"
> +#include "brw_fs_live_variables.h"
> #include "brw_vec4.h"
> #include "brw_cfg.h"
> #include "brw_shader.h"
> @@ -400,22 +401,49 @@ schedule_node::set_latency_gen7(bool is_haswell)
> class instruction_scheduler {
> public:
> instruction_scheduler(backend_shader *s, int grf_count,
> + int hw_reg_count, int block_count,
> instruction_scheduler_mode mode)
> {
> this->bs = s;
> this->mem_ctx = ralloc_context(NULL);
> this->grf_count = grf_count;
> + this->hw_reg_count = hw_reg_count;
> this->instructions.make_empty();
> this->instructions_to_schedule = 0;
> this->post_reg_alloc = (mode == SCHEDULE_POST);
> this->mode = mode;
> this->time = 0;
> if (!post_reg_alloc) {
> - this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
> - this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
> + this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);
> +
> + this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
> + for (int i = 0; i < block_count; i++)
> + this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD,
> + BITSET_WORDS(grf_count));
> +
> + this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
> + for (int i = 0; i < block_count; i++)
> + this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
> + BITSET_WORDS(grf_count));
> +
> + this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
> + for (int i = 0; i < block_count; i++)
> + this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
> + BITSET_WORDS(hw_reg_count));
> +
> + this->written = rzalloc_array(mem_ctx, bool, grf_count);
> +
> + this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);
> +
> + this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
> } else {
> - this->remaining_grf_uses = NULL;
> - this->grf_active = NULL;
> + this->reg_pressure_in = NULL;
> + this->livein = NULL;
> + this->liveout = NULL;
> + this->hw_liveout = NULL;
> + this->written = NULL;
> + this->reads_remaining = NULL;
> + this->hw_reads_remaining = NULL;
> }
> }
>
> @@ -442,7 +470,8 @@ public:
> */
> virtual int issue_time(backend_instruction *inst) = 0;
>
> - virtual void count_remaining_grf_uses(backend_instruction *inst) = 0;
> + virtual void count_reads_remaining(backend_instruction *inst) = 0;
> + virtual void setup_liveness(cfg_t *cfg) = 0;
> virtual void update_register_pressure(backend_instruction *inst) = 0;
> virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
>
> @@ -453,33 +482,63 @@ public:
> bool post_reg_alloc;
> int instructions_to_schedule;
> int grf_count;
> + int hw_reg_count;
> int time;
> + int reg_pressure;
> + int block_idx;
> exec_list instructions;
> backend_shader *bs;
>
> instruction_scheduler_mode mode;
>
> - /**
> - * Number of instructions left to schedule that reference each vgrf.
> - *
> - * Used so that we can prefer scheduling instructions that will end the
> - * live intervals of multiple variables, to reduce register pressure.
> + /*
> + * The register pressure at the beginning of each basic block.
> */
> - int *remaining_grf_uses;
>
> - /**
> - * Tracks whether each VGRF has had an instruction scheduled that uses it.
> - *
> - * This is used to estimate whether scheduling a new instruction will
> - * increase register pressure.
> + int *reg_pressure_in;
> +
> + /*
> + * The virtual GRFs whose ranges overlap the beginning of each basic block.
> */
> - bool *grf_active;
> +
> + BITSET_WORD **livein;
> +
> + /*
> + * The virtual GRFs whose ranges overlap the end of each basic block.
> + */
> +
> + BITSET_WORD **liveout;
> +
> + /*
> + * The hardware GRFs whose ranges overlap the end of each basic block.
> + */
> +
> + BITSET_WORD **hw_liveout;
> +
> + /*
> + * Whether we've scheduled a write for this virtual GRF yet.
> + */
> +
> + bool *written;
> +
> + /*
> + * How many reads we haven't scheduled for this virtual GRF yet.
> + */
> +
> + int *reads_remaining;
> +
> + /*
> + * How many reads we haven't scheduled for this hardware GRF yet.
> + */
> +
> + int *hw_reads_remaining;
> };
>
> class fs_instruction_scheduler : public instruction_scheduler
> {
> public:
> - fs_instruction_scheduler(fs_visitor *v, int grf_count,
> + fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count,
> + int block_count,
> instruction_scheduler_mode mode);
> void calculate_deps();
> bool is_compressed(fs_inst *inst);
> @@ -487,35 +546,115 @@ public:
> int issue_time(backend_instruction *inst);
> fs_visitor *v;
>
> - void count_remaining_grf_uses(backend_instruction *inst);
> + void count_reads_remaining(backend_instruction *inst);
> + void setup_liveness(cfg_t *cfg);
> void update_register_pressure(backend_instruction *inst);
> int get_register_pressure_benefit(backend_instruction *inst);
> };
>
> fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
> - int grf_count,
> + int grf_count, int hw_reg_count,
> + int block_count,
> instruction_scheduler_mode mode)
> - : instruction_scheduler(v, grf_count, mode),
> + : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
> v(v)
> {
> }
>
> +static bool
> +is_src_duplicate(fs_inst *inst, int src)
> +{
> + for (int i = 0; i < src; i++)
> + if (inst->src[i].equals(inst->src[src]))
> + return true;
> +
> + return false;
> +}
> +
> void
> -fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
> +fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
> {
> fs_inst *inst = (fs_inst *)be;
>
> - if (!remaining_grf_uses)
> + if (!reads_remaining)
> return;
>
> - if (inst->dst.file == GRF)
> - remaining_grf_uses[inst->dst.reg]++;
> -
> for (int i = 0; i < inst->sources; i++) {
> - if (inst->src[i].file != GRF)
> + if (is_src_duplicate(inst, i))
> + continue;
> +
> + if (inst->src[i].file == GRF) {
> + reads_remaining[inst->src[i].reg]++;
> + } else if (inst->src[i].file == HW_REG &&
> + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
> + if (inst->src[i].fixed_hw_reg.nr >= hw_reg_count)
> + continue;
> +
> + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr]++;
> +
> + if (v->devinfo->gen >= 6 &&
> + inst->opcode == FS_OPCODE_LINTERP && i == 0) {
> + for (int j = 1; j < 4; j++) {
We should use regs_read() here
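Something along these lines, I think (untested; assuming
fs_inst::regs_read() returns the right count for the LINTERP payload
source, or gets taught to):

   for (unsigned j = 0; j < inst->regs_read(i); j++)
      hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + j]++;

which would replace both the single increment above and the gen >= 6
LINTERP special case.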
> + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + j]++;
> + }
> + }
> + }
> + }
> +}
> +
> +void
> +fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
> +{
> + /* First, compute liveness on a per-GRF level using the in/out sets from
> + * liveness calculation.
> + */
> + for (int block = 0; block < cfg->num_blocks; block++) {
> + for (int i = 0; i < v->live_intervals->num_vars; i++) {
> + if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) {
> + int vgrf = v->live_intervals->vgrf_from_var[i];
> + if (!BITSET_TEST(livein[block], vgrf)) {
> + reg_pressure_in[block] += v->alloc.sizes[vgrf];
> + BITSET_SET(livein[block], vgrf);
> + }
> + }
> +
> + if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i))
> + BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]);
> + }
> + }
> +
> + /* Now, extend the live in/live out sets for when a range crosses a block
> + * boundary, which matches what our register allocator/interference code
> + * does to account for force_writemask_all and incompatible exec_mask's.
> + */
> + for (int block = 0; block < cfg->num_blocks - 1; block++) {
> + for (int i = 0; i < grf_count; i++) {
> + if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip &&
> + v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) {
> + if (!BITSET_TEST(livein[block + 1], i)) {
> + reg_pressure_in[block + 1] += v->alloc.sizes[i];
> + BITSET_SET(livein[block + 1], i);
> + }
> +
> + BITSET_SET(liveout[block], i);
> + }
> + }
> + }
> +
> + int payload_last_use_ip[hw_reg_count];
> + v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);
> +
> + for (int i = 0; i < hw_reg_count; i++) {
> + if (payload_last_use_ip[i] == -1)
> continue;
>
> - remaining_grf_uses[inst->src[i].reg]++;
> + for (int block = 0; block < cfg->num_blocks; block++) {
> + if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
> + reg_pressure_in[block]++;
> +
> + if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
> + BITSET_SET(hw_liveout[block], i);
> + }
> }
> }
>
> @@ -524,18 +663,29 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
> {
> fs_inst *inst = (fs_inst *)be;
>
> - if (!remaining_grf_uses)
> + if (!reads_remaining)
> return;
>
> if (inst->dst.file == GRF) {
> - remaining_grf_uses[inst->dst.reg]--;
> - grf_active[inst->dst.reg] = true;
> + written[inst->dst.reg] = true;
> }
>
> for (int i = 0; i < inst->sources; i++) {
> + if (is_src_duplicate(inst, i))
> + continue;
> +
> if (inst->src[i].file == GRF) {
> - remaining_grf_uses[inst->src[i].reg]--;
> - grf_active[inst->src[i].reg] = true;
> + reads_remaining[inst->src[i].reg]--;
> + } else if (inst->src[i].file == HW_REG &&
> + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE &&
> + inst->src[i].fixed_hw_reg.nr < hw_reg_count) {
> + int regs_read = (v->devinfo->gen >= 6 &&
> + inst->opcode == FS_OPCODE_LINTERP &&
> + i == 0) ? 4 : 1;
and here
> +
> + for (int off = 0; off < regs_read; off++) {
> + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + off]--;
> + }
> }
> }
> }
> @@ -547,20 +697,35 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
> int benefit = 0;
>
> if (inst->dst.file == GRF) {
> - if (remaining_grf_uses[inst->dst.reg] == 1)
> - benefit += v->alloc.sizes[inst->dst.reg];
> - if (!grf_active[inst->dst.reg])
> + if (!BITSET_TEST(livein[block_idx], inst->dst.reg) &&
> + !written[inst->dst.reg])
> benefit -= v->alloc.sizes[inst->dst.reg];
> }
>
> for (int i = 0; i < inst->sources; i++) {
> - if (inst->src[i].file != GRF)
> + if (is_src_duplicate(inst, i))
> continue;
>
> - if (remaining_grf_uses[inst->src[i].reg] == 1)
> + if (inst->src[i].file == GRF &&
> + !BITSET_TEST(liveout[block_idx], inst->src[i].reg) &&
> + reads_remaining[inst->src[i].reg] == 1)
> benefit += v->alloc.sizes[inst->src[i].reg];
> - if (!grf_active[inst->src[i].reg])
> - benefit -= v->alloc.sizes[inst->src[i].reg];
> +
> + if (inst->src[i].file == HW_REG &&
> + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE &&
> + inst->src[i].fixed_hw_reg.nr < hw_reg_count) {
> + int regs_read = (v->devinfo->gen >= 6 &&
> + inst->opcode == FS_OPCODE_LINTERP &&
> + i == 0) ? 4 : 1;
and here
Other than that, it seems reasonable
Reviewed-by: Jason Ekstrand <jason.ekstrand at intel.com>
> +
> + for (int off = 0; off < regs_read; off++) {
> + int reg = inst->src[i].fixed_hw_reg.nr + off;
> + if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
> + hw_reads_remaining[reg] == 1) {
> + benefit++;
> + }
> + }
> + }
> }
>
> return benefit;
> @@ -575,20 +740,26 @@ public:
> int issue_time(backend_instruction *inst);
> vec4_visitor *v;
>
> - void count_remaining_grf_uses(backend_instruction *inst);
> + void count_reads_remaining(backend_instruction *inst);
> + void setup_liveness(cfg_t *cfg);
> void update_register_pressure(backend_instruction *inst);
> int get_register_pressure_benefit(backend_instruction *inst);
> };
>
> vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
> int grf_count)
> - : instruction_scheduler(v, grf_count, SCHEDULE_POST),
> + : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST),
> v(v)
> {
> }
>
> void
> -vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
> +vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
> +{
> +}
> +
> +void
> +vec4_instruction_scheduler::setup_liveness(cfg_t *cfg)
> {
> }
>
> @@ -1387,6 +1558,9 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
> const struct brw_device_info *devinfo = bs->devinfo;
> backend_instruction *inst = block->end();
> time = 0;
> + if (!post_reg_alloc)
> + reg_pressure = reg_pressure_in[block->num];
> + block_idx = block->num;
>
> /* Remove non-DAG heads from the list. */
> foreach_in_list_safe(schedule_node, n, &instructions) {
> @@ -1403,7 +1577,11 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
> chosen->remove();
> inst->insert_before(block, chosen->inst);
> instructions_to_schedule--;
> - update_register_pressure(chosen->inst);
> +
> + if (!post_reg_alloc) {
> + reg_pressure -= get_register_pressure_benefit(chosen->inst);
> + update_register_pressure(chosen->inst);
> + }
>
> /* If we expected a delay for scheduling, then bump the clock to reflect
> * that. In reality, the hardware will switch to another hyperthread
> @@ -1421,6 +1599,8 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
> if (debug) {
> fprintf(stderr, "clock %4d, scheduled: ", time);
> bs->dump_instruction(chosen->inst);
> + if (!post_reg_alloc)
> + fprintf(stderr, "(register pressure %d)\n", reg_pressure);
> }
>
> /* Now that we've scheduled a new instruction, some of its
> @@ -1490,25 +1670,30 @@ static unsigned get_cycle_count(cfg_t *cfg)
> void
> instruction_scheduler::run(cfg_t *cfg)
> {
> - if (debug) {
> + if (debug && !post_reg_alloc) {
> fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
> post_reg_alloc);
> - bs->dump_instructions();
> + bs->dump_instructions();
> }
>
> - /* Populate the remaining GRF uses array to improve the pre-regalloc
> - * scheduling.
> - */
> - if (remaining_grf_uses) {
> - foreach_block_and_inst(block, backend_instruction, inst, cfg) {
> - count_remaining_grf_uses(inst);
> - }
> - }
> + if (!post_reg_alloc)
> + setup_liveness(cfg);
>
> foreach_block(block, cfg) {
> if (block->end_ip - block->start_ip <= 1)
> continue;
>
> + if (reads_remaining) {
> + memset(reads_remaining, 0,
> + grf_count * sizeof(*reads_remaining));
> + memset(hw_reads_remaining, 0,
> + hw_reg_count * sizeof(*hw_reads_remaining));
> + memset(written, 0, grf_count * sizeof(*written));
> +
> + foreach_inst_in_block(fs_inst, inst, block)
> + count_reads_remaining(inst);
> + }
> +
> add_insts_from_block(block);
>
> calculate_deps();
> @@ -1520,7 +1705,7 @@ instruction_scheduler::run(cfg_t *cfg)
> schedule_instructions(block);
> }
>
> - if (debug) {
> + if (debug && !post_reg_alloc) {
> fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
> post_reg_alloc);
> bs->dump_instructions();
> @@ -1532,13 +1717,16 @@ instruction_scheduler::run(cfg_t *cfg)
> void
> fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
> {
> + calculate_live_intervals();
> +
> int grf_count;
> if (mode == SCHEDULE_POST)
> grf_count = grf_used;
> else
> grf_count = alloc.count;
>
> - fs_instruction_scheduler sched(this, grf_count, mode);
> + fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf,
> + cfg->num_blocks, mode);
> sched.run(cfg);
>
> if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {
> --
> 2.1.0
>