[Mesa-dev] [PATCH 7/8] i965/fs: Rewrite discards to use a flag subreg to track discarded pixels.

Kenneth Graunke kenneth at whitecape.org
Sun Dec 9 14:53:26 PST 2012


On 12/07/2012 02:08 PM, Eric Anholt wrote:
> This makes much more sense on gen6+, and will also prove useful for
> early exit of shaders on discard.
> ---
>   src/mesa/drivers/dri/i965/brw_defines.h            |    1 -
>   src/mesa/drivers/dri/i965/brw_fs.cpp               |    8 +++
>   src/mesa/drivers/dri/i965/brw_fs.h                 |    1 -
>   src/mesa/drivers/dri/i965/brw_fs_emit.cpp          |   76 +++++---------------
>   src/mesa/drivers/dri/i965/brw_fs_fp.cpp            |   17 ++---
>   src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp  |    3 -
>   .../dri/i965/brw_fs_schedule_instructions.cpp      |    1 -
>   src/mesa/drivers/dri/i965/brw_fs_visitor.cpp       |   12 +++-
>   8 files changed, 46 insertions(+), 73 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 5e00b40..2b77ae6 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -673,7 +673,6 @@ enum opcode {
>      FS_OPCODE_PIXEL_Y,
>      FS_OPCODE_CINTERP,
>      FS_OPCODE_LINTERP,
> -   FS_OPCODE_DISCARD,
>      FS_OPCODE_SPILL,
>      FS_OPCODE_UNSPILL,
>      FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index dbf48f8..b4e8d68 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -2434,6 +2434,14 @@ fs_visitor::run()
>         else
>   	 emit_interpolation_setup_gen6();
>
> +      /* On gen6+, we handle discards by keeping track of the still-live
> +       * pixels in f0.1.  If so, initialize it with the dispatched pixels.

The wording "If so" doesn't make sense to me.  Perhaps just "Initialize 
it with the dispatched pixels."  The fact that we only do so for 
programs that use discards is obvious from the code and makes a lot of 
sense.

> +       */
> +      if (fp->UsesKill) {
> +         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> +         discard_init->flag_subreg = 1;
> +      }

Also: does this new approach work in SIMD16 mode?  (Is the f0.1 wide 
enough?)  It looks like you've removed the code to fail 16-wide compiles 
that use discard.  I wasn't sure if that was intentional.

If you do want to disallow it, this seems like a good place to do it 
(the earlier the better).

Otherwise, I like this.

>         /* Generate FS IR for main().  (the visitor only descends into
>          * functions called "main").
>          */
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index b60a37e..b00755f 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -501,7 +501,6 @@ private:
>      void generate_math_gen4(fs_inst *inst,
>   			   struct brw_reg dst,
>   			   struct brw_reg src);
> -   void generate_discard(fs_inst *inst);
>      void generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src);
>      void generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
>                        bool negate_value);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> index b3d7f81..f185eb5 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> @@ -74,6 +74,17 @@ fs_generator::generate_fb_write(fs_inst *inst)
>      brw_set_mask_control(p, BRW_MASK_DISABLE);
>      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
>
> +   if (fp->UsesKill) {
> +      struct brw_reg pixel_mask;
> +
> +      if (intel->gen >= 6)
> +         pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
> +      else
> +         pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
> +
> +      brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
> +   }
> +
>      if (inst->header_present) {
>         if (intel->gen >= 6) {
>   	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
> @@ -514,58 +525,6 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src
>   }
>
>   void
> -fs_generator::generate_discard(fs_inst *inst)
> -{
> -   struct brw_reg f0 = brw_flag_reg(0, 0);
> -
> -   if (intel->gen >= 6) {
> -      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
> -      struct brw_reg some_register;
> -
> -      /* As of gen6, we no longer have the mask register to look at,
> -       * so life gets a bit more complicated.
> -       */
> -
> -      /* Load the flag register with all ones. */
> -      brw_push_insn_state(p);
> -      brw_set_mask_control(p, BRW_MASK_DISABLE);
> -      brw_MOV(p, f0, brw_imm_uw(0xffff));
> -      brw_pop_insn_state(p);
> -
> -      /* Do a comparison that should always fail, to produce 0s in the flag
> -       * reg where we have active channels.
> -       */
> -      some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
> -      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
> -	      BRW_CONDITIONAL_NZ, some_register, some_register);
> -
> -      /* Undo CMP's whacking of predication*/
> -      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
> -
> -      brw_push_insn_state(p);
> -      brw_set_mask_control(p, BRW_MASK_DISABLE);
> -      brw_AND(p, g1, f0, g1);
> -      brw_pop_insn_state(p);
> -   } else {
> -      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
> -
> -      brw_push_insn_state(p);
> -      brw_set_mask_control(p, BRW_MASK_DISABLE);
> -      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
> -
> -      /* Unlike the 965, we have the mask reg, so we just need
> -       * somewhere to invert that (containing channels to be disabled)
> -       * so it can be ANDed with the mask of pixels still to be
> -       * written. Use the flag reg for consistency with gen6+.
> -       */
> -      brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */
> -      brw_AND(p, g0, f0, g0);
> -
> -      brw_pop_insn_state(p);
> -   }
> -}
> -
> -void
>   fs_generator::generate_spill(fs_inst *inst, struct brw_reg src)
>   {
>      assert(inst->mlen != 0);
> @@ -745,12 +704,16 @@ void
>   fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
>   {
>      struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
> -   struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
> +   struct brw_reg dispatch_mask;
> +
> +   if (intel->gen >= 6)
> +      dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
> +   else
> +      dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
>
> -   assert (intel->gen >= 6);
>      brw_push_insn_state(p);
>      brw_set_mask_control(p, BRW_MASK_DISABLE);
> -   brw_MOV(p, flags, g1);
> +   brw_MOV(p, flags, dispatch_mask);
>      brw_pop_insn_state(p);
>   }
>
> @@ -1083,9 +1046,6 @@ fs_generator::generate_code(exec_list *instructions)
>         case SHADER_OPCODE_TXS:
>   	 generate_tex(inst, dst, src[0]);
>   	 break;
> -      case FS_OPCODE_DISCARD:
> -	 generate_discard(inst);
> -	 break;
>         case FS_OPCODE_DDX:
>   	 generate_ddx(inst, dst, src[0]);
>   	 break;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> index 4be7779..bebf059 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> @@ -252,14 +252,15 @@ fs_visitor::emit_fragment_program_code()
>                  continue;
>               }
>
> -            emit(CMP(null, regoffset(src[0], i), fs_reg(0.0f),
> -                     BRW_CONDITIONAL_L));
> -
> -            if (intel->gen < 6 && dispatch_width == 16)
> -               fail("Can't support (non-uniform) control flow on 16-wide");
> -            emit(IF(BRW_PREDICATE_NORMAL));
> -            emit(FS_OPCODE_DISCARD);
> -            emit(BRW_OPCODE_ENDIF);
> +
> +            /* Emit an instruction that's predicated on the current
> +             * undiscarded pixels, and updates just those pixels to be
> +             * turned off.
> +             */
> +            fs_inst *cmp = emit(CMP(null, regoffset(src[0], i), fs_reg(0.0f),
> +                                    BRW_CONDITIONAL_GE));
> +            cmp->predicate = BRW_PREDICATE_NORMAL;
> +            cmp->flag_subreg = 1;
>            }
>            break;
>         }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> index a4fc032..d1147f5 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> @@ -277,9 +277,6 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
>            payload_last_use_ip[0 / reg_width] = use_ip;
>            payload_last_use_ip[1 / reg_width] = use_ip;
>            break;
> -      case FS_OPCODE_DISCARD:
> -         payload_last_use_ip[1 / reg_width] = use_ip;
> -         break;
>
>         case FS_OPCODE_LINTERP:
>            /* On gen6+ in 16-wide, there are 4 adjacent registers (so 2 nodes)
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
> index 96d1131..e9c25b0 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
> @@ -299,7 +299,6 @@ instruction_scheduler::calculate_deps()
>         }
>
>         if (inst->predicate) {
> -	 assert(last_conditional_mod[inst->flag_subreg]);
>   	 add_dep(last_conditional_mod[inst->flag_subreg], n);
>         }
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index f1c6860..98cd064 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -1435,7 +1435,17 @@ fs_visitor::visit(ir_discard *ir)
>   {
>      assert(ir->condition == NULL); /* FINISHME */
>
> -   emit(FS_OPCODE_DISCARD);
> +   /* We track our discarded pixels in f0.1.  By predicating on it, we can
> +    * update just the flag bits that aren't yet discarded.  By emitting a
> +    * CMP of g0 != g0, all our currently executing channels will get turned
> +    * off.
> +    */
> +   fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
> +                                   BRW_REGISTER_TYPE_UW));
> +   fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
> +                           BRW_CONDITIONAL_NZ));
> +   cmp->predicate = BRW_PREDICATE_NORMAL;
> +   cmp->flag_subreg = 1;
>   }
>
>   void
>



More information about the mesa-dev mailing list