[Mesa-dev] [PATCH 8/8] i965/fs: Improve performance of shaders that start out with a discard.

Sun Dec 9 15:03:44 PST 2012

On 12/07/2012 02:08 PM, Eric Anholt wrote:
> I had tried this in the past, but ran into trouble with applications that
> sample from undiscarded pixels in the same subspan.  To fix that issue, only
> jump to the end for an entire subspan at a time.
>
> Improves GLbenchmark 2.7 (1024x768) performance by 7.9 +/- 1.5% (n=8).
> ---
>   src/mesa/drivers/dri/i965/brw_defines.h      |    1 +
>   src/mesa/drivers/dri/i965/brw_eu.h           |    1 +
>   src/mesa/drivers/dri/i965/brw_eu_emit.c      |   53 ++++++++++++++++++---
>   src/mesa/drivers/dri/i965/brw_fs.h           |   24 ++++++++++
>   src/mesa/drivers/dri/i965/brw_fs_emit.cpp    |   65 ++++++++++++++++++++++++++
>   src/mesa/drivers/dri/i965/brw_fs_visitor.cpp |   14 ++++++
>   6 files changed, 151 insertions(+), 7 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 2b77ae6..40571a4 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -679,6 +679,7 @@ enum opcode {
>      FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
>      FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
>      FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
> +   FS_OPCODE_DISCARD_JUMP,
>
>      VS_OPCODE_URB_WRITE,
>      VS_OPCODE_SCRATCH_READ,
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
> index adefcfd..b2fa448 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.h
> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> @@ -1032,6 +1032,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p);
>   struct brw_instruction *brw_BREAK(struct brw_compile *p);
>   struct brw_instruction *brw_CONT(struct brw_compile *p);
>   struct brw_instruction *gen6_CONT(struct brw_compile *p);
> +struct brw_instruction *gen6_HALT(struct brw_compile *p);
>   /* Forward jumps:
>    */
>   void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx);
> diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
> index fb1255f..dd91a30 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
> +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
> @@ -1461,6 +1461,24 @@ struct brw_instruction *brw_CONT(struct brw_compile *p)
>      return insn;
>   }
>
> +struct brw_instruction *gen6_HALT(struct brw_compile *p)
> +{
> +   struct brw_instruction *insn;
> +
> +   insn = next_insn(p, BRW_OPCODE_HALT);
> +   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
> +   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
> +   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
> +
> +   if (p->compressed) {
> +      insn->header.execution_size = BRW_EXECUTE_16;
> +   } else {
> +      insn->header.compression_control = BRW_COMPRESSION_NONE;
> +      insn->header.execution_size = BRW_EXECUTE_8;
> +   }
> +   return insn;
> +}
> +
>   /* DO/WHILE loop:
>    *
>    * The DO/WHILE is just an unterminated loop -- break or continue are
> @@ -2302,8 +2320,8 @@ brw_find_next_block_end(struct brw_compile *p, int start)
>   	 return ip;
>         }
>      }
> -   assert(!"not reached");
> -   return start + 1;
> +
> +   return 0;
>   }
>
>   /* There is no DO instruction on gen6, so to find the end of the loop
> @@ -2336,7 +2354,7 @@ brw_find_loop_end(struct brw_compile *p, int start)
>   }
>
>   /* After program generation, go back and update the UIP and JIP of
> - * BREAK and CONT instructions to their correct locations.
> + * BREAK, CONT, and HALT instructions to their correct locations.
>    */
>   void
>   brw_set_uip_jip(struct brw_compile *p)
> @@ -2360,24 +2378,45 @@ brw_set_uip_jip(struct brw_compile *p)
>   	 continue;
>         }
>
> +      int block_end_ip = brw_find_next_block_end(p, ip);
>         switch (insn->header.opcode) {
>         case BRW_OPCODE_BREAK:
> -	 insn->bits3.break_cont.jip =
> -            (brw_find_next_block_end(p, ip) - ip) / scale;
> +         assert(block_end_ip != 0);
> +	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
>   	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
>   	 insn->bits3.break_cont.uip =
>   	    (brw_find_loop_end(p, ip) - ip +
>                (intel->gen == 6 ? 16 : 0)) / scale;
>   	 break;
>         case BRW_OPCODE_CONTINUE:
> -	 insn->bits3.break_cont.jip =
> -            (brw_find_next_block_end(p, ip) - ip) / scale;
> +         assert(block_end_ip != 0);
> +	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
>   	 insn->bits3.break_cont.uip =
>               (brw_find_loop_end(p, ip) - ip) / scale;
>
>   	 assert(insn->bits3.break_cont.uip != 0);
>   	 assert(insn->bits3.break_cont.jip != 0);
>   	 break;
> +      case BRW_OPCODE_HALT:
> +	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
> +	  *
> +	  *    "In case of the halt instruction not inside any conditional
> +	  *     code block, the value of <JIP> and <UIP> should be the
> +	  *     same. In case of the halt instruction inside conditional code
> +	  *     block, the <UIP> should be the end of the program, and the
> +	  *     <JIP> should be end of the most inner conditional code block."
> +	  *
> +	  * The uip will have already been set by whoever set up the
> +	  * instruction.
> +	  */
> +	 if (block_end_ip == 0) {
> +	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
> +	 } else {
> +	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
> +	 }
> +	 assert(insn->bits3.break_cont.uip != 0);
> +	 assert(insn->bits3.break_cont.jip != 0);
> +	 break;
>         }
>      }
>   }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index b00755f..d4ddb47 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -129,6 +129,26 @@ static const fs_reg reg_undef;
>   static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F);
>   static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D);
>
> +class ip_record : public exec_node {
> +public:
> +   static void* operator new(size_t size, void *ctx)
> +   {
> +      void *node;
> +
> +      node = rzalloc_size(ctx, size);
> +      assert(node != NULL);
> +
> +      return node;
> +   }
> +
> +   ip_record(int ip)
> +   {
> +      this->ip = ip;
> +   }
> +
> +   int ip;
> +};
> +
>   class fs_inst : public backend_instruction {
>   public:
>      /* Callers of this ralloc-based new need not call delete. It's
> @@ -516,6 +536,9 @@ private:
>                                                    struct brw_reg index,
>                                                    struct brw_reg offset);
>      void generate_mov_dispatch_to_flags(fs_inst *inst);
> +   void generate_discard_jump(fs_inst *inst);
> +
> +   void patch_discard_jumps_to_fb_writes();
>
>      struct brw_context *brw;
>      struct intel_context *intel;
> @@ -530,6 +553,7 @@ private:
>
>      unsigned dispatch_width; /**< 8 or 16 */
>
> +   exec_list discard_halt_patches;
>      bool dual_source_output;
>      void *mem_ctx;
>   };
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> index f185eb5..0e6fbef 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> @@ -61,12 +61,54 @@ fs_generator::~fs_generator()
>   }
>
>   void
> +fs_generator::patch_discard_jumps_to_fb_writes()
> +{
> +   if (intel->gen < 6 || this->discard_halt_patches.is_empty())
> +      return;
> +
> +   /* There is a somewhat strange undocumented requirement of using
> +    * HALT, according to the simulator.  If some channel has HALTed to
> +    * a particular UIP, then by the end of the program, every channel
> +    * must have HALTed to that UIP.  Furthermore, the tracking is a
> +    * stack, so you can't do the final halt of a UIP after starting
> +    * halting to a new UIP.
> +    *
> +    * Symptoms of not emitting this instruction on actual hardware
> +    * included GPU hangs and sparkly rendering on the piglit discard
> +    * tests.
> +    */
> +   struct brw_instruction *last_halt = gen6_HALT(p);
> +   last_halt->bits3.break_cont.uip = 2;
> +   last_halt->bits3.break_cont.jip = 2;
> +
> +   int ip = p->nr_insn;
> +
> +   foreach_list(node, &this->discard_halt_patches) {
> +      ip_record *patch_ip = (ip_record *)node;
> +      struct brw_instruction *patch = &p->store[patch_ip->ip];
> +      int br = (intel->gen >= 5) ? 2 : 1;

You can just do int br = 2; here...you already know gen >= 6.

This series is:
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>

> +
> +      assert(patch->header.opcode == BRW_OPCODE_HALT);
> +      /* HALT takes a distance from the pre-incremented IP. */
> +      patch->bits3.break_cont.uip = (ip - patch_ip->ip) * br;
> +   }
> +
> +   this->discard_halt_patches.make_empty();
> +}
> +
> +void
>   fs_generator::generate_fb_write(fs_inst *inst)
>   {
>      bool eot = inst->eot;
>      struct brw_reg implied_header;
>      uint32_t msg_control;
>
> +   /* Note that the jumps emitted to this point mean that the g0 ->
> +    * base_mrf setup must be inside of this function, so that we jump
> +    * to a point containing it.
> +    */
> +   patch_discard_jumps_to_fb_writes();
> +
>      /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
>       * move, here's g1.
>       */
> @@ -525,6 +567,25 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src
>   }
>
>   void
> +fs_generator::generate_discard_jump(fs_inst *inst)
> +{
> +   struct brw_reg f0 = brw_flag_reg(0, 0);
> +
> +   assert(intel->gen >= 6);
> +
> +   /* This HALT will be patched up at FB write time to point UIP at the end of
> +    * the program, and at brw_uip_jip() JIP will be set to the end of the
> +    * current block (or the program).
> +    */
> +   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
> +
> +   brw_push_insn_state(p);
> +   brw_set_mask_control(p, BRW_MASK_DISABLE);
> +   gen6_HALT(p);
> +   brw_pop_insn_state(p);
> +}
> +
> +void
>   fs_generator::generate_spill(fs_inst *inst, struct brw_reg src)
>   {
>      assert(inst->mlen != 0);
> @@ -1085,6 +1146,10 @@ fs_generator::generate_code(exec_list *instructions)
>            generate_mov_dispatch_to_flags(inst);
>            break;
>
> +      case FS_OPCODE_DISCARD_JUMP:
> +         generate_discard_jump(inst);
> +         break;
> +
>         case SHADER_OPCODE_SHADER_TIME_ADD:
>            brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
>            break;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 98cd064..56d4330 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -1446,6 +1446,20 @@ fs_visitor::visit(ir_discard *ir)
>                              BRW_CONDITIONAL_NZ));
>      cmp->predicate = BRW_PREDICATE_NORMAL;
>      cmp->flag_subreg = 1;
> +
> +   if (intel->gen >= 6) {
> +      /* For performance, after a discard, jump to the end of the shader.
> +       * However, many people will do foliage by discarding based on a
> +       * texture's alpha mask, and then continue on to texture with the
> +       * remaining pixels.  To avoid trashing the derivatives for those
> +       * texture samples, we'll only jump if all of the pixels in the subspan
> +       * have been discarded.
> +       */
> +      fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
> +      discard_jump->flag_subreg = 1;
> +      discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
> +      discard_jump->predicate_inverse = true;
> +   }
>   }
>
>   void
>