[Mesa-dev] [PATCH 17/18] i965: Validate "Region Alignment Rules"

Tue Nov 22 19:59:51 UTC 2016

---
 src/mesa/drivers/dri/i965/brw_eu_validate.c    | 410 ++++++++++++++++++++++++-
 src/mesa/drivers/dri/i965/test_eu_validate.cpp | 288 +++++++++++++++++
 2 files changed, 697 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_validate.c b/src/mesa/drivers/dri/i965/brw_eu_validate.c
index e01067f..e9b55ed 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_validate.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_validate.c
@@ -44,7 +44,8 @@ cat(struct string *dest, const struct string src)
 }
 #define CAT(dest, src) cat(&dest, (struct string){src, strlen(src)})
 
-#define error(str) "\tERROR: " str "\n"
+#define error(str)   "\tERROR: " str "\n"
+#define ERROR_INDENT "\t       "
 
 #define ERROR(msg) ERROR_IF(true, msg)
 #define ERROR_IF(cond, msg)          \
@@ -104,6 +105,22 @@ src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
    return brw_inst_src0_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE;
 }
 
+static bool
+src0_has_scalar_region(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   return brw_inst_src0_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 &&
+          brw_inst_src0_width(devinfo, inst) == BRW_WIDTH_1 &&
+          brw_inst_src0_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
+}
+
+static bool
+src1_has_scalar_region(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   return brw_inst_src1_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 &&
+          brw_inst_src1_width(devinfo, inst) == BRW_WIDTH_1 &&
+          brw_inst_src1_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
+}
+
 static unsigned
 num_sources_from_inst(const struct gen_device_info *devinfo,
                       const brw_inst *inst)
@@ -328,6 +345,26 @@ execution_type(const struct gen_device_info *devinfo, const brw_inst *inst)
 }
 
 /**
+ * Returns whether a region is packed
+ *
+ * A region is packed if its elements are adjacent in memory, with no
+ * intervening space, no overlap, and no replicated values.
+ */
+static bool
+is_packed(unsigned vstride, unsigned width, unsigned hstride)
+{
+   if (vstride == width) {
+      if (vstride == 1) {
+         return hstride == 0;
+      } else {
+         return hstride == 1;
+      }
+   }
+
+   return false;
+}
+
+/**
  * Checks restrictions listed in "General Restrictions Based on Operand Types"
  * in the "Register Region Restrictions" section.
  */
@@ -539,6 +576,376 @@ general_restrictions_on_region_parameters(const struct gen_device_info *devinfo,
    return error_msg;
 }
 
+/**
+ * Creates an \p access_mask for an \p exec_size, \p element_size, and a region
+ *
+ * An \p access_mask is a 32-element array of uint64_t, where each uint64_t is
+ * a bitmask of bytes accessed by the region.
+ *
+ * For instance the access mask of the source gX.1<4,2,2>F in an exec_size = 4
+ * instruction would be
+ *
+ *    access_mask[0] = 0x00000000000000F0
+ *    access_mask[1] = 0x000000000000F000
+ *    access_mask[2] = 0x0000000000F00000
+ *    access_mask[3] = 0x00000000F0000000
+ *    access_mask[4-31] = 0
+ *
+ * because the first execution channel accesses bytes 7-4 and the second
+ * execution channel accesses bytes 15-12, etc.
+ */
+static void
+align1_access_mask(uint64_t access_mask[static 32],
+                   unsigned exec_size, unsigned element_size, unsigned subreg,
+                   unsigned vstride, unsigned width, unsigned hstride)
+{
+   const uint64_t mask = (1 << element_size) - 1;
+   unsigned rowbase = subreg;
+   unsigned element = 0;
+
+   for (int y = 0; y < exec_size / width; y++) {
+      unsigned offset = rowbase;
+
+      for (int x = 0; x < width; x++) {
+         access_mask[element++] = mask << offset;
+         offset += hstride * element_size;
+      }
+
+      rowbase += vstride * element_size;
+   }
+
+   assert(element == 0 || element == exec_size);
+}
+
+/**
+ * Returns the number of registers accessed according to the \p access_mask
+ */
+static int
+registers_read(const uint64_t access_mask[static 32])
+{
+   int regs_read = 0;
+
+   for (unsigned i = 0; i < 32; i++) {
+      if (access_mask[i] > 0xFFFFFFFF) {
+         return 2;
+      } else if (access_mask[i]) {
+         regs_read = 1;
+      }
+   }
+
+   return regs_read;
+}
+
+/**
+ * Checks restrictions listed in "Region Alignment Rules" in the "Register
+ * Region Restrictions" section.
+ */
+static struct string
+region_alignment_rules(const struct gen_device_info *devinfo,
+                       const brw_inst *inst)
+{
+   const struct opcode_desc *desc =
+      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   uint64_t dst_access_mask[32], src0_access_mask[32], src1_access_mask[32];
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   if (num_sources == 3)
+      return (struct string){};
+
+   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16)
+      return (struct string){};
+
+   if (inst_is_send(devinfo, inst))
+      return (struct string){};
+
+   memset(dst_access_mask, 0, sizeof(dst_access_mask));
+   memset(src0_access_mask, 0, sizeof(src0_access_mask));
+   memset(src1_access_mask, 0, sizeof(src1_access_mask));
+
+   for (unsigned i = 0; i < num_sources; i++) {
+      unsigned vstride, width, hstride, element_size, subreg;
+
+      /* In Direct Addressing mode, a source cannot span more than 2 adjacent
+       * GRF registers.
+       */
+
+#define DO_SRC(n)                                                              \
+      if (brw_inst_src ## n ## _address_mode(devinfo, inst) !=                 \
+          BRW_ADDRESS_DIRECT)                                                  \
+         continue;                                                             \
+                                                                               \
+      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==                     \
+          BRW_IMMEDIATE_VALUE)                                                 \
+         continue;                                                             \
+                                                                               \
+      vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0; \
+      width = 1 << brw_inst_src ## n ## _width(devinfo, inst);                 \
+      hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0; \
+      element_size = brw_element_size(devinfo, inst, src ## n);                \
+      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);             \
+      align1_access_mask(src ## n ## _access_mask,                             \
+                         exec_size, element_size, subreg,                      \
+                         vstride, width, hstride)
+
+      if (i == 0) {
+         DO_SRC(0);
+      } else if (i == 1) {
+         DO_SRC(1);
+      }
+#undef DO_SRC
+
+      unsigned num_vstride = exec_size / width;
+      unsigned num_hstride = width;
+      unsigned vstride_elements = (num_vstride - 1) * vstride;
+      unsigned hstride_elements = (num_hstride - 1) * hstride;
+      unsigned offset = (vstride_elements + hstride_elements) * element_size +
+                        subreg;
+      ERROR_IF(offset >= 64,
+               "A source cannot span more than 2 adjacent GRF registers");
+   }
+
+   if (desc->ndst == 0 || dst_is_null(devinfo, inst))
+      return error_msg;
+
+   unsigned stride = 1 << (brw_inst_dst_hstride(devinfo, inst) - 1);
+   unsigned element_size = brw_element_size(devinfo, inst, dst);
+   unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
+   unsigned offset = ((exec_size - 1) * stride * element_size) + subreg;
+   ERROR_IF(offset >= 64,
+            "A destination cannot span more than 2 adjacent GRF registers");
+
+   if (error_msg.str)
+      return error_msg;
+
+   align1_access_mask(dst_access_mask, exec_size, element_size, subreg,
+                      exec_size == 1 ? 0 : exec_size * stride,
+                      exec_size == 1 ? 1 : exec_size,
+                      exec_size == 1 ? 0 : stride);
+
+   unsigned dst_regs = registers_read(dst_access_mask);
+   unsigned src0_regs = registers_read(src0_access_mask);
+   unsigned src1_regs = registers_read(src1_access_mask);
+
+   /* The SNB, IVB, HSW, BDW, and CHV PRMs say:
+    *
+    *    When an instruction has a source region spanning two registers and a
+    *    destination region contained in one register, the number of elements
+    *    must be the same between two sources and one of the following must be
+    *    true:
+    *
+    *       1. The destination region is entirely contained in the lower OWord
+    *          of a register.
+    *       2. The destination region is entirely contained in the upper OWord
+    *          of a register.
+    *       3. The destination elements are evenly split between the two OWords
+    *          of a register.
+    */
+   if (devinfo->gen <= 8) {
+      if (dst_regs == 1 && (src0_regs == 2 || src1_regs == 2)) {
+         unsigned upper_oword_writes = 0, lower_oword_writes = 0;
+
+         for (unsigned i = 0; i < exec_size; i++) {
+            if (dst_access_mask[i] > 0x0000FFFF) {
+               upper_oword_writes++;
+            } else {
+               assert(dst_access_mask[i] != 0);
+               lower_oword_writes++;
+            }
+         }
+
+         ERROR_IF(lower_oword_writes != 0 &&
+                  upper_oword_writes != 0 &&
+                  upper_oword_writes != lower_oword_writes,
+                  "Writes must be to only one OWord or "
+                  "evenly split between OWords");
+      }
+   }
+
+   /* The IVB and HSW PRMs say:
+    *
+    *    When an instruction has a source region that spans two registers and
+    *    the destination spans two registers, the destination elements must be
+    *    evenly split between the two registers [...]
+    *
+    * The SNB PRM contains similar wording (but written in a much more
+    * confusing manner).
+    *
+    * The BDW PRM says:
+    *
+    *    When destination spans two registers, the source may be one or two
+    *    registers. The destination elements must be evenly split between the
+    *    two registers.
+    *
+    * The SKL PRM says:
+    *
+    *    When destination of MATH instruction spans two registers, the
+    *    destination elements must be evenly split between the two registers.
+    *
+    * It is not known whether this restriction applies to KBL other Gens after
+    * SKL.
+    */
+   if (devinfo->gen <= 8 ||
+       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MATH) {
+
+      /* Nothing explicitly states that on Gen < 8 elements must be evenly
+       * split between two destination registers in the two exceptional
+       * source-region-spans-one-register cases, but since Broadwell requires
+       * evenly split writes regardless of source region, we assume that it was
+       * an oversight and require it.
+       */
+      if (dst_regs == 2) {
+         unsigned upper_reg_writes = 0, lower_reg_writes = 0;
+
+         for (unsigned i = 0; i < exec_size; i++) {
+            if (dst_access_mask[i] > 0xFFFFFFFF) {
+               upper_reg_writes++;
+            } else {
+               assert(dst_access_mask[i] != 0);
+               lower_reg_writes++;
+            }
+         }
+
+         ERROR_IF(upper_reg_writes != lower_reg_writes,
+                  "Writes must be evenly split between the two "
+                  "destination registers");
+      }
+   }
+
+   /* The IVB and HSW PRMs say:
+    *
+    *    When an instruction has a source region that spans two registers and
+    *    the destination spans two registers, the destination elements must be
+    *    evenly split between the two registers and each destination register
+    *    must be entirely derived from one source register.
+    *
+    *    Note: In such cases, the regioning parameters must ensure that the
+    *    offset from the two source registers is the same.
+    *
+    * The SNB PRM contains similar wording (but written in a much more
+    * confusing manner).
+    *
+    * There are effectively three rules stated here:
+    *
+    *    For an instruction with a source and a destination spanning two
+    *    registers,
+    *
+    *       (1) destination elements must be evenly split between the two
+    *           registers
+    *       (2) all destination elements in a register must be derived
+    *           from one source register
+    *       (3) the offset (i.e. the starting location in each of the two
+    *           registers spanned by a region) must be the same in the two
+    *           registers spanned by a region
+    *
+    * It is impossible to violate rule (1) without violating (2) or (3), so we
+    * do not attempt to validate it.
+    */
+   if (devinfo->gen <= 7 && dst_regs == 2) {
+      for (unsigned i = 0; i < num_sources; i++) {
+#define DO_SRC(n)                                                             \
+         if (src ## n ## _regs <= 1)                                          \
+            continue;                                                         \
+                                                                              \
+         for (unsigned i = 0; i < exec_size; i++) {                           \
+            if ((dst_access_mask[i] > 0xFFFFFFFF) !=                          \
+                (src ## n ## _access_mask[i] > 0xFFFFFFFF)) {                 \
+               ERROR("Each destination register must be entirely derived "    \
+                     "from one source register");                             \
+               break;                                                         \
+            }                                                                 \
+         }                                                                    \
+                                                                              \
+         unsigned offset_0 =                                                  \
+            brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);               \
+         unsigned offset_1 = offset_0;                                        \
+                                                                              \
+         for (unsigned i = 0; i < exec_size; i++) {                           \
+            if (src ## n ## _access_mask[i] > 0xFFFFFFFF) {                   \
+               offset_1 = __builtin_ctzll(src ## n ## _access_mask[i]) - 32;  \
+               break;                                                         \
+            }                                                                 \
+         }                                                                    \
+                                                                              \
+         ERROR_IF(offset_0 != offset_1,                                       \
+                  "The offset from the two source registers "                 \
+                  "must be the same")
+
+         if (i == 0) {
+            DO_SRC(0);
+         } else if (i == 1) {
+            DO_SRC(1);
+         }
+#undef DO_SRC
+      }
+   }
+
+   /* The IVB and HSW PRMs say:
+    *
+    *    When destination spans two registers, the source MUST span two
+    *    registers. The exception to the above rule:
+    *        1. When source is scalar, the source registers are not
+    *           incremented.
+    *        2. When source is packed integer Word and destination is packed
+    *           integer DWord, the source register is not incremented by the
+    *           source sub register is incremented.
+    *
+    * The SNB PRM does not contain this rule, but the internal documentation
+    * indicates that it applies to SNB as well. We assume that the rule applies
+    * to Gen <= 5 although their PRMs do not state it.
+    *
+    * While the documentation explicitly says in exception (2) that the
+    * destination must be an integer DWord, the hardware allows at least a
+    * float destination type as well. We emit such instructions from
+    *
+    *    fs_visitor::emit_interpolation_setup_gen6
+    *    fs_visitor::emit_fragcoord_interpolation
+    *
+    * and have for years with no ill effects.
+    *
+    * Additionally the simulator source code indicates that the real condition
+    * is that the size of the destination type is 4 bytes.
+    */
+   if (devinfo->gen <= 7 && dst_regs == 2) {
+      bool dst_is_packed_dword =
+         is_packed(exec_size * stride, exec_size, stride) &&
+         brw_element_size(devinfo, inst, dst) == 4;
+
+      for (unsigned i = 0; i < num_sources; i++) {
+#define DO_SRC(n)                                                                  \
+         unsigned vstride, width, hstride;                                         \
+         vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ?                  \
+                   (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0;  \
+         width = 1 << brw_inst_src ## n ## _width(devinfo, inst);                  \
+         hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ?                  \
+                   (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0;  \
+         bool src ## n ## _is_packed_word =                                        \
+            is_packed(vstride, width, hstride) &&                                  \
+            (brw_inst_src ## n ## _reg_type(devinfo, inst) == BRW_HW_REG_TYPE_W || \
+             brw_inst_src ## n ## _reg_type(devinfo, inst) == BRW_HW_REG_TYPE_UW); \
+                                                                                   \
+         ERROR_IF(src ## n ## _regs == 1 &&                                        \
+                  !src ## n ## _has_scalar_region(devinfo, inst) &&                \
+                  !(dst_is_packed_dword && src ## n ## _is_packed_word),           \
+                  "When the destination spans two registers, the source must "     \
+                  "span two registers\n" ERROR_INDENT "(exceptions for scalar "    \
+                  "source and packed-word to packed-dword expansion)")
+
+         if (i == 0) {
+            DO_SRC(0);
+         } else if (i == 1) {
+            DO_SRC(1);
+         }
+#undef DO_SRC
+      }
+   }
+
+   return error_msg;
+}
+
 bool
 brw_validate_instructions(const struct brw_codegen *p, int start_offset,
                           struct annotation_info *annotation)
@@ -559,6 +966,7 @@ brw_validate_instructions(const struct brw_codegen *p, int start_offset,
          CHECK(send_restrictions);
          CHECK(general_restrictions_based_on_operand_types);
          CHECK(general_restrictions_on_region_parameters);
+         CHECK(region_alignment_rules);
       }
 
       if (error_msg.str && annotation) {
diff --git a/src/mesa/drivers/dri/i965/test_eu_validate.cpp b/src/mesa/drivers/dri/i965/test_eu_validate.cpp
index ae1197a..f4d0e94 100644
--- a/src/mesa/drivers/dri/i965/test_eu_validate.cpp
+++ b/src/mesa/drivers/dri/i965/test_eu_validate.cpp
@@ -468,3 +468,291 @@ TEST_P(validation_test, vstride_on_align16_must_be_0_or_4)
       clear_instructions(p);
    }
 }
+
+/* In Direct Addressing mode, a source cannot span more than 2 adjacent GRF
+ * registers.
+ */
+TEST_P(validation_test, source_cannot_span_more_than_2_registers)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_32);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_FALSE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 2);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+
+   EXPECT_TRUE(validate(p));
+}
+
+/* A destination cannot span more than 2 adjacent GRF registers. */
+TEST_P(validation_test, destination_cannot_span_more_than_2_registers)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_32);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+
+   EXPECT_FALSE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_8);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 6);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_TRUE(validate(p));
+}
+
+TEST_P(validation_test, src_region_spans_two_regs_dst_region_spans_one)
+{
+   /* Writes to dest are to the lower OWord */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   /* Writes to dest are to the upper OWord */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   /* Writes to dest are evenly split between OWords */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   /* Writes to dest are uneven between OWords */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 10);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   if (devinfo.gen >= 9) {
+      EXPECT_TRUE(validate(p));
+   } else {
+      EXPECT_FALSE(validate(p));
+   }
+}
+
+TEST_P(validation_test, dst_elements_must_be_evenly_split_between_registers)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 4);
+
+   if (devinfo.gen >= 9) {
+      EXPECT_TRUE(validate(p));
+   } else {
+      EXPECT_FALSE(validate(p));
+   }
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   if (devinfo.gen >= 6) {
+      gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, g0, null);
+
+      EXPECT_TRUE(validate(p));
+
+      clear_instructions(p);
+
+      gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, g0, null);
+      brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 4);
+
+      EXPECT_FALSE(validate(p));
+   }
+}
+
+TEST_P(validation_test, two_src_two_dst_source_offsets_must_be_same)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+   brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 16);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_2);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   if (devinfo.gen <= 7) {
+      EXPECT_FALSE(validate(p));
+   } else {
+      EXPECT_TRUE(validate(p));
+   }
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_8);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_TRUE(validate(p));
+}
+
+#if 0
+TEST_P(validation_test, two_src_two_dst_each_dst_must_be_derived_from_one_src)
+{
+   // mov (16) r10.0<2>:w r12.4<4;4,1>:w
+
+   brw_MOV(p, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 8);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+
+   EXPECT_FALSE(validate(p));
+
+   clear_instructions(p);
+
+#if 0
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 16);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_FALSE(validate(p));
+   #endif
+}
+#endif
+
+TEST_P(validation_test, one_src_two_dst)
+{
+   struct brw_reg g0_0 = brw_vec1_grf(0, 0);
+
+   brw_ADD(p, g0, g0_0, g0_0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+
+   EXPECT_TRUE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+   if (devinfo.gen >= 8) {
+      EXPECT_TRUE(validate(p));
+   } else {
+      EXPECT_FALSE(validate(p));
+   }
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+
+   if (devinfo.gen >= 8) {
+      EXPECT_TRUE(validate(p));
+   } else {
+      EXPECT_FALSE(validate(p));
+   }
+}
-- 
2.7.3