[Mesa-dev] [PATCH 7/9] i965/gs: Add opcodes needed for EndPrimitive().

Mon Sep 9 08:20:44 PDT 2013

---
 src/mesa/drivers/dri/i965/brw_defines.h     | 21 ++++++++
 src/mesa/drivers/dri/i965/brw_shader.cpp    |  4 ++
 src/mesa/drivers/dri/i965/brw_vec4.h        |  2 +
 src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 81 +++++++++++++++++++++++++++++
 4 files changed, 108 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 4742103..7b53c68 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -847,6 +847,27 @@ enum opcode {
     * scratch reads and writes to operate correctly.
     */
    GS_OPCODE_SET_DWORD_2_IMMED,
+
+   /**
+    * Prepare the dst register for storage in the "Channel Mask" fields of a
+    * URB_WRITE message header.
+    *
+    * DWORD 4 of dst is shifted left by 4 bits, so that later,
+    * GS_OPCODE_SET_CHANNEL_MASKS can OR DWORDs 0 and 4 together to form the
+    * final channel mask.
+    */
+   GS_OPCODE_PREPARE_CHANNEL_MASKS,
+
+   /**
+    * Set the "Channel Mask" fields of a URB_WRITE message header.
+    *
+    * - dst is the MRF containing the message header.
+    *
+    * - src.x is the channel mask, as prepared by
+    *   GS_OPCODE_PREPARE_CHANNEL_MASKS.  DWORDs 0 and 4 are OR'ed together to
+    *   form the final channel mask.
+    */
+   GS_OPCODE_SET_CHANNEL_MASKS,
 };
 
 #define BRW_PREDICATE_NONE             0
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index e7dbdbe..53364a5 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -507,6 +507,10 @@ brw_instruction_name(enum opcode op)
       return "set_vertex_count";
    case GS_OPCODE_SET_DWORD_2_IMMED:
       return "set_dword_2_immed";
+   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
+      return "prepare_channel_masks";
+   case GS_OPCODE_SET_CHANNEL_MASKS:
+      return "set_channel_masks";
 
    default:
       /* Yes, this leaks.  It's in debug code, it should never occur, and if
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index c5101d3..cba5cd4 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -610,6 +610,8 @@ private:
    void generate_gs_set_vertex_count(struct brw_reg dst,
                                      struct brw_reg src);
    void generate_gs_set_dword_2_immed(struct brw_reg dst, struct brw_reg src);
+   void generate_gs_prepare_channel_masks(struct brw_reg dst);
+   void generate_gs_set_channel_masks(struct brw_reg dst, struct brw_reg src);
    void generate_oword_dual_block_offsets(struct brw_reg m1,
 					  struct brw_reg index);
    void generate_scratch_write(vec4_instruction *inst,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
index bf04bd9..12e1b50 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -516,6 +516,79 @@ vec4_generator::generate_gs_set_dword_2_immed(struct brw_reg dst,
 }
 
 void
+vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst)
+{
+   /* We want to left shift just DWORD 4 (the x component belonging to the
+    * second geometry shader invocation) by 4 bits.  So generate the
+    * instruction:
+    *
+    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
+    */
+   dst = suboffset(vec1(dst), 4);
+   brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_SHL(p, dst, dst, brw_imm_ud(4));
+   brw_pop_insn_state(p);
+}
+
+void
+vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst,
+                                              struct brw_reg src)
+{
+   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
+    * Header: M0.5):
+    *
+    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
+    *
+    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
+    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
+    *        Vertex 0 DATA[7].  This bit is ANDed with the corresponding
+    *        channel enable to determine the final channel enable.  For the
+    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
+    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
+    *        in the writeback message.  For the URB_WRITE_OWORD &
+    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
+    *        indicates that Vertex 1 DATA [3] will be written to the surface.
+    *
+    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
+    *        1: Vertex DATA [3] / Vertex 0 DATA[7] channel included
+    *
+    *     14 Vertex 1 DATA [2] Channel Mask
+    *     13 Vertex 1 DATA [1] Channel Mask
+    *     12 Vertex 1 DATA [0] Channel Mask
+    *     11 Vertex 0 DATA [3] Channel Mask
+    *     10 Vertex 0 DATA [2] Channel Mask
+    *      9 Vertex 0 DATA [1] Channel Mask
+    *      8 Vertex 0 DATA [0] Channel Mask
+    *
+    * (This is from a section of the PRM that is agnostic to the particular
+    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
+    * geometry shader invocations 0 and 1, respectively).  Since we have the
+    * enable flags for geometry shader invocation 0 in bits 0-3 of DWORD 0,
+    * and the enable flags for geometry shader invocation 1 in bits 4-7 of
+    * DWORD 4, we just need to OR them together and store the result in bits
+    * 15-8 of DWORD 5.
+    *
+    * It's easier to get the EU to do this if we think of the src and dst
+    * registers as composed of 32 bytes each; then, we want to pick up the
+    * contents of bytes 0 and 16 from src, OR them together, and store them in
+    * byte 21.
+    *
+    * We can do that by the following EU instruction:
+    *
+    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
+    */
+   dst = retype(dst, BRW_REGISTER_TYPE_UB);
+   src = retype(src, BRW_REGISTER_TYPE_UB);
+   brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
+   brw_pop_insn_state(p);
+}
+
+void
 vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
                                                   struct brw_reg index)
 {
@@ -1003,6 +1076,14 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
       generate_gs_set_dword_2_immed(dst, src[0]);
       break;
 
+   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
+      generate_gs_prepare_channel_masks(dst);
+      break;
+
+   case GS_OPCODE_SET_CHANNEL_MASKS:
+      generate_gs_set_channel_masks(dst, src[0]);
+      break;
+
    case SHADER_OPCODE_SHADER_TIME_ADD:
       brw_shader_time_add(p, src[0], SURF_INDEX_VEC4_SHADER_TIME);
       mark_surface_used(SURF_INDEX_VEC4_SHADER_TIME);
-- 
1.8.4