[Mesa-dev] [PATCH 10/13] i965/fs: Bake regs_written into the IR instead of recomputing it later.

Sun Mar 31 21:41:14 PDT 2013

On 03/20/2013 05:36 PM, Eric Anholt wrote:
> For sampler messages, it depends on the target gen, and on gen4
> SIMD16-sampler-on-SIMD8-execution we were returning 4 instead of 8 like we
> should.
>
> NOTE: This is a candidate for the 9.1 branch.
> ---
>   src/mesa/drivers/dri/i965/brw_fs.cpp               |   29 +++++++-------------
>   src/mesa/drivers/dri/i965/brw_fs.h                 |    2 +-
>   src/mesa/drivers/dri/i965/brw_fs_cse.cpp           |    6 ++--
>   .../drivers/dri/i965/brw_fs_live_variables.cpp     |    2 +-
>   src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp  |    8 +++---
>   .../dri/i965/brw_fs_schedule_instructions.cpp      |    6 ++--
>   src/mesa/drivers/dri/i965/brw_fs_visitor.cpp       |    7 +++--
>   7 files changed, 27 insertions(+), 33 deletions(-)

Ugh...I'm not a huge fan of this, but I think it's better than the 
alternative (which is passing "intel" into random functions to handle 
the case you mentioned.)

The reason I'm concerned is that we sometimes change the opcode of 
instructions, and we'll need to make sure to update this too.  But 
that's probably fine.  For CSE, you emit new instructions, rather than 
editing it, so that works.

> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index f4aa9f7..c128175 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -60,6 +60,9 @@ fs_inst::init()
>      this->src[0] = reg_undef;
>      this->src[1] = reg_undef;
>      this->src[2] = reg_undef;
> +
> +   /* This will be the case for almost all instructions. */
> +   this->regs_written = 1;
>   }
>
>   fs_inst::fs_inst()
> @@ -254,6 +257,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
>         fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type);
>         inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
>                                     vec4_result, surf_index, vec4_offset);
> +      inst->regs_written = 4;
>         instructions.push_tail(inst);
>
>         vec4_result.reg_offset += const_offset & 3;
> @@ -329,26 +333,13 @@ fs_inst::equals(fs_inst *inst)
>              offset == inst->offset);
>   }
>
> -int
> -fs_inst::regs_written()
> -{
> -   if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7)
> -      return 4;
> -
> -   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
> -    * but we don't currently use them...nor do we have an opcode for them.
> -    */
> -
> -   return 1;
> -}
> -
>   bool
>   fs_inst::overwrites_reg(const fs_reg &reg)
>   {
>      return (reg.file == dst.file &&
>              reg.reg == dst.reg &&
>              reg.reg_offset >= dst.reg_offset  &&
> -           reg.reg_offset < dst.reg_offset + regs_written());
> +           reg.reg_offset < dst.reg_offset + regs_written);
>   }
>
>   bool
> @@ -1388,7 +1379,7 @@ fs_visitor::split_virtual_grfs()
>         /* If there's a SEND message that requires contiguous destination
>          * registers, no splitting is allowed.
>          */
> -      if (inst->regs_written() > 1) {
> +      if (inst->regs_written > 1) {
>   	 split_grf[inst->dst.reg] = false;
>         }
>      }
> @@ -2109,7 +2100,7 @@ fs_visitor::compute_to_mrf()
>               /* Things returning more than one register would need us to
>                * understand coalescing out more than one MOV at a time.
>                */
> -            if (scan_inst->regs_written() > 1)
> +            if (scan_inst->regs_written > 1)
>                  break;
>
>   	    /* SEND instructions can't have MRF as a destination. */
> @@ -2326,7 +2317,7 @@ void
>   fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
>   {
>      int reg_size = dispatch_width / 8;
> -   int write_len = inst->regs_written() * reg_size;
> +   int write_len = inst->regs_written * reg_size;
>      int first_write_grf = inst->dst.reg;
>      bool needs_dep[BRW_MAX_MRF];
>      assert(write_len < (int)sizeof(needs_dep) - 1);
> @@ -2366,7 +2357,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
>          * dependency has more latency than a MOV.
>          */
>         if (scan_inst->dst.file == GRF) {
> -         for (int i = 0; i < scan_inst->regs_written(); i++) {
> +         for (int i = 0; i < scan_inst->regs_written; i++) {
>               int reg = scan_inst->dst.reg + i * reg_size;
>
>               if (reg >= first_write_grf &&
> @@ -2405,7 +2396,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
>   void
>   fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
>   {
> -   int write_len = inst->regs_written() * dispatch_width / 8;
> +   int write_len = inst->regs_written * dispatch_width / 8;
>      int first_write_grf = inst->dst.reg;
>      bool needs_dep[BRW_MAX_MRF];
>      assert(write_len < (int)sizeof(needs_dep) - 1);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index 76130b1..0c5aad1 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -174,7 +174,6 @@ public:
>              fs_reg src0, fs_reg src1,fs_reg src2);
>
>      bool equals(fs_inst *inst);
> -   int regs_written();
>      bool overwrites_reg(const fs_reg &reg);
>      bool is_tex();
>      bool is_math();
> @@ -192,6 +191,7 @@ public:
>      uint8_t flag_subreg;
>
>      int mlen; /**< SEND message length */
> +   int regs_written; /**< Number of vgrfs written by a SEND message, or 1 */
>      int base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
>      uint32_t texture_offset; /**< Texture offset bitfield */
>      int sampler;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> index c89da36..01a64d2 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> @@ -130,7 +130,7 @@ fs_visitor::opt_cse_local(bblock_t *block, exec_list *aeb)
>   	     */
>   	    bool no_existing_temp = entry->tmp.file == BAD_FILE;
>   	    if (no_existing_temp) {
> -               int written = entry->generator->regs_written();
> +               int written = entry->generator->regs_written;
>
>                  fs_reg orig_dst = entry->generator->dst;
>                  fs_reg tmp = fs_reg(GRF, virtual_grf_alloc(written),
> @@ -150,8 +150,8 @@ fs_visitor::opt_cse_local(bblock_t *block, exec_list *aeb)
>   	    }
>
>   	    /* dest <- temp */
> -            int written = inst->regs_written();
> -            assert(written == entry->generator->regs_written());
> +            int written = inst->regs_written;
> +            assert(written == entry->generator->regs_written);
>               assert(inst->dst.type == entry->tmp.type);
>               fs_reg dst = inst->dst;
>               fs_reg tmp = entry->tmp;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
> index 63af148..373aa2d 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
> @@ -77,7 +77,7 @@ fs_live_variables::setup_def_use()
>   	  * variable, and thus qualify for being in def[].
>   	  */
>   	 if (inst->dst.file == GRF &&
> -	     inst->regs_written() == v->virtual_grf_sizes[inst->dst.reg] &&
> +	     inst->regs_written == v->virtual_grf_sizes[inst->dst.reg] &&
>   	     !inst->predicate &&
>   	     !inst->force_uncompressed &&
>   	     !inst->force_sechalf) {
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> index b8936dc..4ee7bbc 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> @@ -553,7 +553,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
>         }
>
>         if (inst->dst.file == GRF) {
> -	 spill_costs[inst->dst.reg] += inst->regs_written() * loop_scale;
> +	 spill_costs[inst->dst.reg] += inst->regs_written * loop_scale;
>
>            if (inst->dst.smear >= 0) {
>               no_spill[inst->dst.reg] = true;
> @@ -622,7 +622,7 @@ fs_visitor::spill_reg(int spill_reg)
>   	  inst->dst.reg == spill_reg) {
>            int subset_spill_offset = (spill_offset +
>                                       REG_SIZE * inst->dst.reg_offset);
> -         inst->dst.reg = virtual_grf_alloc(inst->regs_written());
> +         inst->dst.reg = virtual_grf_alloc(inst->regs_written);
>            inst->dst.reg_offset = 0;
>
>   	 /* If our write is going to affect just part of the
> @@ -631,7 +631,7 @@ fs_visitor::spill_reg(int spill_reg)
>   	  */
>   	 if (inst->predicate || inst->force_uncompressed || inst->force_sechalf) {
>               fs_reg unspill_reg = inst->dst;
> -            for (int chan = 0; chan < inst->regs_written(); chan++) {
> +            for (int chan = 0; chan < inst->regs_written; chan++) {
>                  emit_unspill(inst, unspill_reg,
>                               subset_spill_offset + REG_SIZE * chan);
>                  unspill_reg.reg_offset++;
> @@ -644,7 +644,7 @@ fs_visitor::spill_reg(int spill_reg)
>   	 spill_src.negate = false;
>   	 spill_src.smear = -1;
>
> -	 for (int chan = 0; chan < inst->regs_written(); chan++) {
> +	 for (int chan = 0; chan < inst->regs_written; chan++) {
>   	    fs_inst *spill_inst = new(mem_ctx) fs_inst(FS_OPCODE_SPILL,
>   						       reg_null_f, spill_src);
>   	    spill_src.reg_offset++;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
> index c125928..0d68e3d 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
> @@ -510,7 +510,7 @@ instruction_scheduler::calculate_deps()
>         /* write-after-write deps. */
>         if (inst->dst.file == GRF) {
>            if (post_reg_alloc) {
> -            for (int r = 0; r < inst->regs_written() * reg_width; r++) {
> +            for (int r = 0; r < inst->regs_written * reg_width; r++) {
>                  add_dep(last_grf_write[inst->dst.reg + r], n);
>                  last_grf_write[inst->dst.reg + r] = n;
>               }
> @@ -617,7 +617,7 @@ instruction_scheduler::calculate_deps()
>          */
>         if (inst->dst.file == GRF) {
>            if (post_reg_alloc) {
> -            for (int r = 0; r < inst->regs_written() * reg_width; r++)
> +            for (int r = 0; r < inst->regs_written * reg_width; r++)
>                  last_grf_write[inst->dst.reg + r] = n;
>            } else {
>               last_grf_write[inst->dst.reg] = n;
> @@ -716,7 +716,7 @@ instruction_scheduler::schedule_instructions(fs_inst *next_block_header)
>               schedule_node *n = (schedule_node *)node;
>
>               chosen = n;
> -            if (chosen->inst->regs_written() <= 1)
> +            if (chosen->inst->regs_written <= 1)
>                  break;
>            }
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 48c6df3..19adfc9 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -261,7 +261,7 @@ fs_visitor::try_emit_saturate(ir_expression *ir)
>       * src, generate a saturated MOV
>       */
>      fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
> -   if (!modify || modify->regs_written() != 1) {
> +   if (!modify || modify->regs_written != 1) {
>         this->result = fs_reg(this, ir->type);
>         fs_inst *inst = emit(MOV(this->result, src));
>         inst->saturate = true;
> @@ -746,7 +746,7 @@ fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
>      /* If last_rhs_inst wrote a different number of components than our LHS,
>       * we can't safely rewrite it.
>       */
> -   if (virtual_grf_sizes[dst.reg] != modify->regs_written())
> +   if (virtual_grf_sizes[dst.reg] != modify->regs_written)
>         return false;
>
>      /* Success!  Rewrite the instruction. */
> @@ -948,6 +948,7 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
>      inst->base_mrf = base_mrf;
>      inst->mlen = mlen;
>      inst->header_present = true;
> +   inst->regs_written = simd16 ? 8 : 4;
>
>      if (simd16) {
>         for (int i = 0; i < 4; i++) {
> @@ -1086,6 +1087,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
>      inst->base_mrf = base_mrf;
>      inst->mlen = mlen;
>      inst->header_present = header_present;
> +   inst->regs_written = 4;
>
>      if (mlen > 11) {
>         fail("Message length >11 disallowed by hardware\n");
> @@ -1239,6 +1241,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
>      inst->base_mrf = base_mrf;
>      inst->mlen = mlen;
>      inst->header_present = header_present;
> +   inst->regs_written = 4;
>
>      if (mlen > 11) {
>         fail("Message length >11 disallowed by hardware\n");
>