[Mesa-dev] [WIP 13/25] i965/fs: Add pack_double_2x32 virtual opcode

Thu Oct 16 05:24:25 PDT 2014

Signed-off-by: Topi Pohjolainen <topi.pohjolainen at intel.com>
---
 src/mesa/drivers/dri/i965/brw_defines.h        |  1 +
 src/mesa/drivers/dri/i965/brw_fs.h             |  5 ++
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 96 ++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 4a173db..88097b7 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -919,6 +919,7 @@ enum opcode {
    FS_OPCODE_SET_SAMPLE_ID,
    FS_OPCODE_SET_SIMD4X2_OFFSET,
    FS_OPCODE_PACK_HALF_2x16_SPLIT,
+   FS_OPCODE_PACK_DOUBLE_2x32,
    FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
    FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
    FS_OPCODE_PLACEHOLDER_HALT,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 8c11c32..66173fe 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -759,6 +759,11 @@ private:
                                         struct brw_reg dst,
                                         struct brw_reg src);
 
+   void generate_pack_double_2x32(fs_inst *inst,
+                                  struct brw_reg dst,
+                                  struct brw_reg hi,
+                                  struct brw_reg lo);
+
    void generate_shader_time_add(fs_inst *inst,
                                  struct brw_reg payload,
                                  struct brw_reg offset,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 21c9660..2b20f7c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1413,6 +1413,98 @@ fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
 }
 
 void
+fs_generator::generate_pack_double_2x32(fs_inst *inst,
+                                        struct brw_reg dst,
+                                        struct brw_reg hi,
+                                        struct brw_reg lo)
+{
+   assert(brw->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_DF);
+   assert(hi.type == BRW_REGISTER_TYPE_UD);
+   assert(lo.type == BRW_REGISTER_TYPE_UD);
+
+   /**
+    * Doubles take 64 bits per channel, so two registers are needed to hold 8
+    * elements. The values are built in two steps: first the high 32 bits are
+    * copied and then the low 32. The destination is treated as unsigned with
+    * a horizontal stride of two, i.e., consecutive channels are 64 bits
+    * apart. Each half needs two moves - hardware allows sources to span
+    * multiple physical registers but not the destination - so four in total.
+    * NOTE(review): "hi" is written to the first dword of each 64-bit slot and
+    * "lo" to the second (dst.subnr + 4); confirm this is the intended order.
+    *
+    * TODO: If "hi" and "lo" are both uniforms and in consecutive slots then
+    *       on HSW and newer one could simply omit the copy. The pair of
+    *       32-bit slots could be treated as double precision scalar instead.
+    *       On IVB the copy is still needed but could be done with two
+    *       instructions each moving hi-lo-pairs.
+    */
+   dst.type = BRW_REGISTER_TYPE_UD;
+   dst.width = BRW_WIDTH_4;
+   dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+   dst.vstride = BRW_VERTICAL_STRIDE_8;
+
+   if (!brw_is_scalar(hi)) {
+      assert(hi.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(hi.vstride == BRW_VERTICAL_STRIDE_8);
+      hi.width = BRW_WIDTH_4;
+      hi.vstride = BRW_VERTICAL_STRIDE_4;
+   }
+   if (!brw_is_scalar(lo)) {
+      assert(lo.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(lo.vstride == BRW_VERTICAL_STRIDE_8);
+      lo.width = BRW_WIDTH_4;
+      lo.vstride = BRW_VERTICAL_STRIDE_4;
+   }
+
+   struct brw_reg dst_2nd_half = dst;
+   ++dst_2nd_half.nr;
+
+   /* In terms of SIMD8:
+    *            +--+--+--+--+--+--+--+--+          +--+--+--+--+--+--+--+--+
+    * dst.reg    |H0|  |H1|  |H2|  |H3|  |   hi.reg |H0|H1|H2|H3|H4|H5|H6|H7|
+    *            +--+--+--+--+--+--+--+--+          +--+--+--+--+--+--+--+--+
+    * dst.reg+1  |  |  |  |  |  |  |  |  |   lo.reg |L0|L1|L2|L3|L4|L5|L6|L7|
+    *            +--+--+--+--+--+--+--+--+          +--+--+--+--+--+--+--+--+
+    */
+   brw_MOV(p, dst, hi);
+
+   /*            +--+--+--+--+--+--+--+--+
+    * dst.reg    |H0|  |H1|  |H2|  |H3|  |
+    *            +--+--+--+--+--+--+--+--+
+    * dst.reg+1  |H4|  |H5|  |H6|  |H7|  |
+    *            +--+--+--+--+--+--+--+--+
+    */
+   if (!brw_is_scalar(hi)) {
+      assert(hi.subnr == 0);
+      hi.subnr = 4 * 4;
+   }
+   brw_MOV(p, dst_2nd_half, hi);
+
+   /*            +--+--+--+--+--+--+--+--+
+    * dst.reg    |H0|L0|H1|L1|H2|L2|H3|L3|
+    *            +--+--+--+--+--+--+--+--+
+    * dst.reg+1  |H4|  |H5|  |H6|  |H7|  |
+    *            +--+--+--+--+--+--+--+--+
+    */
+   dst.subnr += 4;
+   brw_MOV(p, dst, lo);
+
+   /*            +--+--+--+--+--+--+--+--+
+    * dst.reg    |H0|L0|H1|L1|H2|L2|H3|L3|
+    *            +--+--+--+--+--+--+--+--+
+    * dst.reg+1  |H4|L4|H5|L5|H6|L6|H7|L7|
+    *            +--+--+--+--+--+--+--+--+
+    */
+   if (!brw_is_scalar(lo)) {
+      assert(lo.subnr == 0);
+      lo.subnr = 4 * 4;
+   }
+   dst_2nd_half.subnr += 4;
+   brw_MOV(p, dst_2nd_half, lo);
+}
+
+void
 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
                                               struct brw_reg dst,
                                               struct brw_reg src)
@@ -1932,6 +2024,10 @@ fs_generator::generate_code(const cfg_t *cfg)
           generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
           break;
 
+      case FS_OPCODE_PACK_DOUBLE_2x32:
+          generate_pack_double_2x32(inst, dst, src[0], src[1]);
+          break;
+
       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
       case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
          generate_unpack_half_2x16_split(inst, dst, src[0]);
-- 
1.8.3.1