[Mesa-dev] [PATCH 7/9] i965/gs: Add opcodes needed for EndPrimitive().

Mon Sep 9 23:39:48 PDT 2013

On 09/09/2013 08:20 AM, Paul Berry wrote:
> ---
>  src/mesa/drivers/dri/i965/brw_defines.h     | 21 ++++++++
>  src/mesa/drivers/dri/i965/brw_shader.cpp    |  4 ++
>  src/mesa/drivers/dri/i965/brw_vec4.h        |  2 +
>  src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 81 +++++++++++++++++++++++++++++
>  4 files changed, 108 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 4742103..7b53c68 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -847,6 +847,27 @@ enum opcode {
>      * scratch reads and writes to operate correctly.
>      */
>     GS_OPCODE_SET_DWORD_2_IMMED,
> +
> +   /**
> +    * Prepare the dst register for storage in the "Channel Mask" fields of a
> +    * URB_WRITE message header.
> +    *
> +    * DWORD 4 of dst is shifted left by 4 bits, so that later,
> +    * GS_OPCODE_SET_CHANNEL_MASKS can OR DWORDs 0 and 4 together to form the
> +    * final channel mask.
> +    */
> +   GS_OPCODE_PREPARE_CHANNEL_MASKS,
> +
> +   /**
> +    * Set the "Channel Mask" fields of a URB_WRITE message header.
> +    *
> +    * - dst is the MRF containing the message header.
> +    *
> +    * - src.x is the channel mask, as prepared by
> +    *   GS_OPCODE_PREPARE_CHANNEL_MASKS.  DWORDs 0 and 4 are OR'ed together to
> +    *   form the final channel mask.
> +    */
> +   GS_OPCODE_SET_CHANNEL_MASKS,
>  };
>  
>  #define BRW_PREDICATE_NONE             0
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index e7dbdbe..53364a5 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -507,6 +507,10 @@ brw_instruction_name(enum opcode op)
>        return "set_vertex_count";
>     case GS_OPCODE_SET_DWORD_2_IMMED:
>        return "set_dword_2_immed";
> +   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
> +      return "prepare_channel_masks";
> +   case GS_OPCODE_SET_CHANNEL_MASKS:
> +      return "set_channel_masks";
>  
>     default:
>        /* Yes, this leaks.  It's in debug code, it should never occur, and if
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
> index c5101d3..cba5cd4 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.h
> @@ -610,6 +610,8 @@ private:
>     void generate_gs_set_vertex_count(struct brw_reg dst,
>                                       struct brw_reg src);
>     void generate_gs_set_dword_2_immed(struct brw_reg dst, struct brw_reg src);
> +   void generate_gs_prepare_channel_masks(struct brw_reg dst);
> +   void generate_gs_set_channel_masks(struct brw_reg dst, struct brw_reg src);
>     void generate_oword_dual_block_offsets(struct brw_reg m1,
>  					  struct brw_reg index);
>     void generate_scratch_write(vec4_instruction *inst,
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
> index bf04bd9..12e1b50 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
> @@ -516,6 +516,79 @@ vec4_generator::generate_gs_set_dword_2_immed(struct brw_reg dst,
>  }
>  
>  void
> +vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst)
> +{
> +   /* We want to left shift just DWORD 4 (the x component belonging to the
> +    * second geometry shader invocation) by 4 bits.  So generate the
> +    * instruction:
> +    *
> +    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
> +    */
> +   dst = suboffset(vec1(dst), 4);
> +   brw_push_insn_state(p);
> +   brw_set_access_mode(p, BRW_ALIGN_1);
> +   brw_set_mask_control(p, BRW_MASK_DISABLE);
> +   brw_SHL(p, dst, dst, brw_imm_ud(4));
> +   brw_pop_insn_state(p);
> +}
> +
> +void
> +vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst,
> +                                              struct brw_reg src)
> +{
> +   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
> +    * Header: M0.5):
> +    *
> +    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
> +    *
> +    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
> +    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
> +    *        Vertex 0 DATA[7].  This bit is ANDed with the corresponding
> +    *        channel enable to determine the final channel enable.  For the
> +    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
> +    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
> +    *        in the writeback message.  For the URB_WRITE_OWORD &
> +    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
> +    *        indicates that Vertex 1 DATA [3] will be written to the surface.
> +    *
> +    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
> +    *        1: Vertex DATA [3] / Vertex 0 DATA[7] channel included
> +    *
> +    *     14 Vertex 1 DATA [2] Channel Mask
> +    *     13 Vertex 1 DATA [1] Channel Mask
> +    *     12 Vertex 1 DATA [0] Channel Mask
> +    *     11 Vertex 0 DATA [3] Channel Mask
> +    *     10 Vertex 0 DATA [2] Channel Mask
> +    *      9 Vertex 0 DATA [1] Channel Mask
> +    *      8 Vertex 0 DATA [0] Channel Mask
> +    *
> +    * (This is from a section of the PRM that is agnostic to the particular
> +    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
> +    * geometry shader invocations 0 and 1, respectively).  Since we have the
> +    * enable flags for geometry shader invocation 0 in bits 0-3 of DWORD 0,
> +    * and the enable flags for geometry shader invocation 1 in bits 4-7 of
> +    * DWORD 4, we just need to OR them together and store the result in bits
> +    * 15-8 of DWORD 5.

One thing isn't entirely clear to me here: are bits 7:4 of DWord 0 and
bits 3:0 of DWord 4 both guaranteed to be zero?  If not, OR'ing the two
bytes together will mix the bits you want with whatever stray values
happen to be in those positions.

I assume it works out, but maybe expand the comment to explain this?

One other nitpick: you list bits as 0-3, 4-7, and 15-8 (low-high for the
first two, high-low for the last).  Might be nice to pick a consistent
order.  Usually the hardware docs use "High:Low", i.e. 3:0, 7:4, and 15:8.

Otherwise, I verified your math.  Nice use of UB types :)

> +    *
> +    * It's easier to get the EU to do this if we think of the src and dst
> +    * registers as composed of 32 bytes each; then, we want to pick up the
> +    * contents of bytes 0 and 16 from src, OR them together, and store them in
> +    * byte 21.
> +    *
> +    * We can do that by the following EU instruction:
> +    *
> +    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
> +    */
> +   dst = retype(dst, BRW_REGISTER_TYPE_UB);
> +   src = retype(src, BRW_REGISTER_TYPE_UB);
> +   brw_push_insn_state(p);
> +   brw_set_access_mode(p, BRW_ALIGN_1);
> +   brw_set_mask_control(p, BRW_MASK_DISABLE);
> +   brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
> +   brw_pop_insn_state(p);
> +}
> +
> +void
>  vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
>                                                    struct brw_reg index)
>  {
> @@ -1003,6 +1076,14 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
>        generate_gs_set_dword_2_immed(dst, src[0]);
>        break;
>  
> +   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
> +      generate_gs_prepare_channel_masks(dst);
> +      break;
> +
> +   case GS_OPCODE_SET_CHANNEL_MASKS:
> +      generate_gs_set_channel_masks(dst, src[0]);
> +      break;
> +
>     case SHADER_OPCODE_SHADER_TIME_ADD:
>        brw_shader_time_add(p, src[0], SURF_INDEX_VEC4_SHADER_TIME);
>        mark_surface_used(SURF_INDEX_VEC4_SHADER_TIME);
>