[Mesa-dev] [RFC] i965/vec4: Add support for SHADER_OPCODE_MOV_INDIRECT

Wed Dec 9 21:36:09 PST 2015

This is an initial implementation of the MOV_INDIRECT opcode in the vec4
backend.  Unfortunately, I haven't had a chance to test it in the wild yet,
but I think review would still be good.  In particular, the approach I took
to handling swizzles.

Unfortunately, the only indirect MOV instructions you can use in align16
mode have a uniform indirect.  This means that, in order to do an indirect
MOV, we need to either do two movs or use align1 mode.  The problem with
two MOVs is that, in order to force first/second half, you have to disable
writemasking so you might as well be in align1.  In align1 mode, we have
two options for indirects:  We could use height of 2 and make the hardware
grab two sets of 4 consecutive dwords for us or we could use a UV
immediate to add 0, 4, 8, and 12 to the four channels.  The second method
only works easily on SNB+ because we don't have UV immediates on ILK and
previous.  However, the first method (use a height of 2) may have
interesting hardware implications if any of those sets of 4 dwords ever
crosses a register boundary.  I didn't want to count software always giving
us vec4-aligned offsets, so I decided on using the UV immediate.

Ok, so swizzles.  One of the other advantages of adding a UV immediate is
that we can make the immediate be whatever we want.  It doesn't have to be
(0, 4, 8, 12); it could be anything.  So, I used it to implement swizzling.
Do we want swizzling?  Does it make sense to load a swizzled value starting
at an arbitrary offset?  Does it make sense to support swizzling but not
writemasking?  I don't know.  Thoughts?

---
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 45 ++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index c3426dd..71a7f63 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1052,6 +1052,48 @@ generate_set_simd4x2_header_gen9(struct brw_codegen *p,
 }
 
 static void
+generate_mov_indirect(struct brw_codegen *p,
+                      vec4_instruction *inst,
+                      struct brw_reg dst, struct brw_reg reg,
+                      struct brw_reg indirect, struct brw_reg length)
+{
+   assert(indirect.type == BRW_REGISTER_TYPE_UD);
+
+   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);
+
+   /* This instruction acts in align1 mode */
+   assert(inst->force_writemask_all || reg.writemask == 0xf);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   struct brw_reg addr = vec2(brw_address_reg(0));
+
+   /* We need to move the indirect value into the address register.  In order
+    * to make things make some sense, we want to respect at least the X
+    * component of the swizzle.  In order to do that, we need to convert the
+    * subnr (probably 0) to an align1 subnr and add in the swizzle.  We then
+    * use a region of <8,4,0>:uw to pick off the first 2 bytes of the indirect
+    * and splat it out to all four channels of the given half of a0.
+    */
+   assert(brw_is_single_value_swizzle(indirect.swizzle));
+   indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0)) * 2;
+   indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
+
+   brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));
+
+   /* Use a <4,1> region Vx1 region*/
+   struct brw_reg src = brw_VxH_indirect(0, 0);
+   src.width = BRW_WIDTH_4;
+   src.hstride = BRW_HORIZONTAL_STRIDE_1;
+
+   brw_MOV(p, dst, retype(src, reg.type));
+
+   brw_pop_insn_state(p);
+}
+
+static void
 generate_code(struct brw_codegen *p,
               const struct brw_compiler *compiler,
               void *log_data,
@@ -1538,6 +1580,9 @@ generate_code(struct brw_codegen *p,
          break;
       }
 
+      case SHADER_OPCODE_MOV_INDIRECT:
+         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
+
       default:
          unreachable("Unsupported opcode");
       }
-- 
2.5.0.400.gff86faf