[Mesa-dev] [PATCH v6 32/35] intel/compiler: validate region restrictions for mixed float mode

Thu Apr 11 12:10:51 UTC 2019

On Wed, 2019-04-10 at 17:13 -0700, Francisco Jerez wrote:
> "Juan A. Suarez Romero" <jasuarez at igalia.com> writes:
> 
> > From: Iago Toral Quiroga <itoral at igalia.com>
> > 
> > v2:
> >  - Adapted unit tests to make them consistent with the changes done
> >    to the validation of half-float conversions.
> > 
> > v3 (Curro):
> > - Check all the accumulators
> > - Constify declarations
> > - Do not check src1 type in single-source instructions.
> > - Check for all instructions that read accumulator (either implicitly or
> >   explicitly)
> > - Check restrictions in src1 too.
> > - Merge conditional block
> > - Add invalid test case.
> > ---
> >  src/intel/compiler/brw_eu_validate.c    | 290 +++++++++++
> >  src/intel/compiler/test_eu_validate.cpp | 631 ++++++++++++++++++++++++
> >  2 files changed, 921 insertions(+)
> > 
> > diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c
> > index cfaf126e2f5..4a735641c86 100644
> > --- a/src/intel/compiler/brw_eu_validate.c
> > +++ b/src/intel/compiler/brw_eu_validate.c
> > @@ -170,6 +170,20 @@ src1_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
> >            brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
> >  }
> >  
> > +static bool
> > +src0_is_acc(const struct gen_device_info *devinfo, const brw_inst *inst)
> > +{
> > +   return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
> > +          (brw_inst_src0_da_reg_nr(devinfo, inst) & 0xF0) == BRW_ARF_ACCUMULATOR;
> > +}
> > +
> > +static bool
> > +src1_is_acc(const struct gen_device_info *devinfo, const brw_inst *inst)
> > +{
> > +   return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
> > +          (brw_inst_src1_da_reg_nr(devinfo, inst) & 0xF0) == BRW_ARF_ACCUMULATOR;
> > +}
> > +
> >  static bool
> >  src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
> >  {
> > @@ -275,6 +289,27 @@ sources_not_null(const struct gen_device_info *devinfo,
> >     return error_msg;
> >  }
> >  
> > +static bool
> > +inst_uses_src_acc(const struct gen_device_info *devinfo, const brw_inst *inst)
> > +{
> > +   /* Check instructions that use implicit accumulator sources */
> > +   switch (brw_inst_opcode(devinfo, inst)) {
> > +   case BRW_OPCODE_MAC:
> > +   case BRW_OPCODE_MACH:
> > +   case BRW_OPCODE_SADA2:
> > +      return true;
> > +   }
> > +
> > +   /* Instructions with three source operands cannot use explicit accumulator
> > +    * operands.
> > +    */
> 
> They can on Gen10+.  Yeah, I know, it's quite a pain to have to
> special-case 3src instructions everywhere in the validator code...

Having checked other parts of the code, I'll assume that srcN_is_acc() should
return false in align16 mode; at least, those parts contain assertions that in
this mode the sources can only be GRFs.

OTOH, is it worth handling here the case of 3src instructions that allow an
explicit accumulator? If other parts of the driver assume this is not possible,
I think it would be better to fix this across all the code in a separate
patchset (not related to float16).

> 
> > +   const unsigned num_sources = num_sources_from_inst(devinfo, inst);
> > +   if (num_sources > 2)
> > +      return false;
> > +
> > +   return src0_is_acc(devinfo, inst) || (num_sources > 1 && src1_is_acc(devinfo, inst));
> > +}
> > +
> >  static struct string
> >  send_restrictions(const struct gen_device_info *devinfo,
> >                    const brw_inst *inst)
> > @@ -938,6 +973,260 @@ general_restrictions_on_region_parameters(const struct gen_device_info *devinfo,
> >     return error_msg;
> >  }
> >  
> > +/**
> > + * Validates the "Special Restrictions for Handling Mixed Mode Float
> > + * Operations" quoted from the SKL PRM in the comments below.
> > + *
> > + * Instructions with three or more sources, and instructions for which
> > + * is_mixed_float() is false, are not checked at all. Align16 and Align1
> > + * instructions are validated by separate branches.
> > + *
> > + * Returns error_msg, which starts out empty (NULL string, zero length) and
> > + * is presumably filled in by ERROR_IF when a restriction is violated.
> > + */
> > +static struct string
> > +special_restrictions_for_mixed_float_mode(const struct gen_device_info *devinfo,
> > +                                          const brw_inst *inst)
> > +{
> > +   struct string error_msg = { .str = NULL, .len = 0 };
> > +
> > +   const unsigned opcode = brw_inst_opcode(devinfo, inst);
> > +   const unsigned num_sources = num_sources_from_inst(devinfo, inst);
> > +   if (num_sources >= 3)
> > +      return error_msg;
> > +
> > +   if (!is_mixed_float(devinfo, inst))
> > +      return error_msg;
> > +
> > +   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
> > +   bool is_align16 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16;
> > +
> > +   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
> > +   enum brw_reg_type src1_type = num_sources > 1 ?
> > +                                 brw_inst_src1_type(devinfo, inst) : 0;
> > +   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);
> > +
> > +   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
> > +   bool dst_is_packed = is_packed(exec_size * dst_stride, exec_size, dst_stride);
> > +
> > +   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +    * Float Operations:
> > +    *
> > +    *    "Indirect addressing on source is not supported when source and
> > +    *     destination data types are mixed float."
> > +    */
> > +   ERROR_IF((types_are_mixed_float(dst_type, src0_type) &&
> > +             brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT) ||
> > +            (num_sources > 1 &&
> > +             types_are_mixed_float(dst_type, src1_type) &&
> > +             brw_inst_src1_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT),
> > +            "Indirect addressing on source is not supported when source and "
> > +            "destination data types are mixed float");
> > +
> > +   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +    * Float Operations:
> > +    *
> > +    *    "No SIMD16 in mixed mode when destination is f32. Instruction
> > +    *     execution size must be no more than 8."
> > +    */
> > +   ERROR_IF(exec_size > 8 && dst_type == BRW_REGISTER_TYPE_F,
> > +            "Mixed float mode with 32-bit float destination is limited "
> > +            "to SIMD8");
> > +
> > +   if (is_align16) {
> > +      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +       * Float Operations:
> > +       *
> > +       *   "In Align16 mode, when half float and float data types are mixed
> > +       *    between source operands OR between source and destination operands,
> > +       *    the register content are assumed to be packed."
> > +       *
> > +       * Since Align16 doesn't have a concept of horizontal stride (or width),
> > +       * it means that vertical stride must always be 4, since 0 and 2 would
> > +       * lead to replicated data, and any other value is disallowed in Align16.
> > +       * However, the PRM also says:
> > +       *
> > +       *   "In Align16, vertical stride can never be zero for f16"
> > +       *
> > +       * Which is oddly redundant and specific considering the more general
> > +       * assumption that all operands are assumed to be packed, so we
> > +       * understand that this might be hinting that there may be an exception
> > +       * for f32 operands with a vstride of 0, so we don't validate this for
> > +       * them while we don't have empirical evidence that it is forbidden.
> > +       */
> > +      ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
> > +               (src0_type != BRW_REGISTER_TYPE_F ||
> > +                brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
> > +               "Align16 mixed float mode assumes packed data (vstride must "
> > +               "be 4 -or 0 for f32 operands-)");
> > +
> > +      ERROR_IF(num_sources >= 2 &&
> > +               brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
> > +               (src1_type != BRW_REGISTER_TYPE_F ||
> > +                brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
> > +               "Align16 mixed float mode assumes packed data (vstride must "
> > +               "be 4 -or 0 for f32 operands-)");
> > +
> > +      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +       * Float Operations:
> > +       *
> > +       *   "For Align16 mixed mode, both input and output packed f16 data
> > +       *    must be oword aligned, no oword crossing in packed f16."
> > +       *
> > +       * The previous rule requires that Align16 operands are always packed,
> > +       * and since there is only one bit for Align16 subnr, which represents
> > +       * offsets 0B and 16B, this rule is always enforced and we don't need to
> > +       * validate it.
> > +       */
> > +
> > +      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +       * Float Operations:
> > +       *
> > +       *    "No SIMD16 in mixed mode when destination is packed f16 for both
> > +       *     Align1 and Align16."
> > +       *
> > +       * And:
> > +       *
> > +       *   "In Align16 mode, when half float and float data types are mixed
> > +       *    between source operands OR between source and destination operands,
> > +       *    the register content are assumed to be packed."
> > +       *
> > +       * Which implies that SIMD16 is not available in Align16. This is further
> > +       * confirmed by:
> > +       *
> > +       *    "For Align16 mixed mode, both input and output packed f16 data
> > +       *     must be oword aligned, no oword crossing in packed f16"
> > +       *
> > +       * Since oword-aligned packed f16 data would cross oword boundaries when
> > +       * the execution size is larger than 8.
> > +       */
> > +      ERROR_IF(exec_size > 8, "Align16 mixed float mode is limited to SIMD8");
> > +
> > +      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +       * Float Operations:
> > +       *
> > +       *    "Math operations for mixed mode:
> > +       *     - In Align16, only packed format is supported"
> > +       *
> > +       * It is not clear what this is restricting since as stated in previous
> > +       * spec quotes, Align16 always assumes packed data. However, since
> > +       * we are allowing vstride of 0 on f32, we check again here without that
> > +       * exception.
> > +       */
> > +      if (opcode == BRW_OPCODE_MATH) {
> > +         ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
> > +                  "Align16 mixed float mode math only supports packed format");
> > +
> > +         ERROR_IF(num_sources >= 2 &&
> > +                  brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
> > +                  "Align16 mixed float mode math only supports packed format");
> > +      }
> > +
> > +      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +       * Float Operations:
> > +       *
> > +       *    "No accumulator read access for Align16 mixed float."
> > +       */
> > +      ERROR_IF((src0_is_acc(devinfo, inst) ||
> > +                (num_sources > 1 && src1_is_acc(devinfo, inst))),
> > +               "No accumulator read access for Align16 mixed float");
> > +   } else {
> > +      assert(!is_align16);
> > +
> > +      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +       * Float Operations:
> > +       *
> > +       *    "No SIMD16 in mixed mode when destination is packed f16 for both
> > +       *     Align1 and Align16."
> > +       */
> > +      ERROR_IF(exec_size > 8 && dst_is_packed &&
> > +               dst_type == BRW_REGISTER_TYPE_HF,
> > +               "Align1 mixed float mode is limited to SIMD8 when destination "
> > +               "is packed half-float");
> > +
> > +      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +       * Float Operations:
> > +       *
> > +       *    "Math operations for mixed mode:
> > +       *     - In Align1, f16 inputs need to be strided"
> > +       */
> > +      if (opcode == BRW_OPCODE_MATH) {
> > +         if (src0_type == BRW_REGISTER_TYPE_HF) {
> > +            ERROR_IF(STRIDE(brw_inst_src0_hstride(devinfo, inst)) <= 1,
> > +                     "Align1 mixed mode math needs strided half-float inputs");
> > +         }
> > +
> > +         if (num_sources >= 2 && src1_type == BRW_REGISTER_TYPE_HF) {
> > +            ERROR_IF(STRIDE(brw_inst_src1_hstride(devinfo, inst)) <= 1,
> > +                     "Align1 mixed mode math needs strided half-float inputs");
> > +         }
> > +      }
> > +
> > +      if (dst_type == BRW_REGISTER_TYPE_HF && dst_stride == 1) {
> > +         /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +          * Float Operations:
> > +          *
> > +          *    "In Align1, destination stride can be smaller than execution
> > +          *     type. When destination is stride of 1, 16 bit packed data is
> > +          *     updated on the destination. However, output packed f16 data
> > +          *     must be oword aligned, no oword crossing in packed f16."
> > +          *
> > +          * The requirement of not crossing oword boundaries for 16-bit oword
> > +          * aligned data means that execution size is limited to 8.
> > +          */
> > +         unsigned subreg;
> > +         if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT)
> > +            subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
> > +         else
> > +            subreg = brw_inst_dst_ia_subreg_nr(devinfo, inst);
> > +         ERROR_IF(subreg % 16 != 0,
> > +                  "Align1 mixed mode packed half-float output must be "
> > +                  "oword aligned");
> > +         ERROR_IF(exec_size > 8,
> > +                  "Align1 mixed mode packed half-float output must not "
> > +                  "cross oword boundaries (max exec size is 8)");
> > +
> > +         /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +          * Float Operations:
> > +          *
> > +          *    "When source is float or half float from accumulator register and
> > +          *     destination is half float with a stride of 1, the source must
> > +          *     register aligned. i.e., source must have offset zero."
> > +          *
> > +          * Align16 mixed float mode doesn't allow accumulator access on sources,
> > +          * so we only need to check this for Align1.
> > +          */
> > +         if (src0_is_acc(devinfo, inst) &&
> > +             (src0_type == BRW_REGISTER_TYPE_F ||
> > +              src0_type == BRW_REGISTER_TYPE_HF)) {
> > +            ERROR_IF(brw_inst_src0_da1_subreg_nr(devinfo, inst) != 0,
> > +                     "Mixed float mode requires register-aligned accumulator "
> > +                     "source reads when destination is packed half-float");
> > +         }
> > +
> > +         /* src1 is validated independently of src0: the restriction applies
> > +          * to whichever source reads the accumulator, so this check must not
> > +          * be nested inside the src0 block.
> > +          */
> > +         if (num_sources > 1 &&
> > +             src1_is_acc(devinfo, inst) &&
> > +             (src1_type == BRW_REGISTER_TYPE_F ||
> > +              src1_type == BRW_REGISTER_TYPE_HF)) {
> > +            ERROR_IF(brw_inst_src1_da1_subreg_nr(devinfo, inst) != 0,
> > +                     "Mixed float mode requires register-aligned accumulator "
> > +                     "source reads when destination is packed half-float");
> > +         }
> > +      }
> > +
> > +      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > +       * Float Operations:
> > +       *
> > +       *    "No swizzle is allowed when an accumulator is used as an implicit
> > +       *     source or an explicit source in an instruction. i.e. when
> > +       *     destination is half float with an implicit accumulator source,
> > +       *     destination stride needs to be 2."
> > +       *
> > +       * FIXME: it is not quite clear what the first sentence actually means
> > +       *        or its link to the implication described after it, so we only
> > +       *        validate the explicit implication, which is clearly described.
> > +       */
> > +      if (dst_type == BRW_REGISTER_TYPE_HF &&
> > +          inst_uses_src_acc(devinfo, inst)) {
> > +         ERROR_IF(dst_stride != 2,
> > +                  "Mixed float mode with implicit/explicit accumulator "
> > +                  "source and half-float destination requires a stride "
> > +                  "of 2 on the destination");
> > +      }
> > +   }
> > +
> > +   return error_msg;
> > +}
> > +
> >  /**
> >   * Creates an \p access_mask for an \p exec_size, \p element_size, and a region
> >   *
> > @@ -1576,6 +1865,7 @@ brw_validate_instructions(const struct gen_device_info *devinfo,
> >           CHECK(send_restrictions);
> >           CHECK(general_restrictions_based_on_operand_types);
> >           CHECK(general_restrictions_on_region_parameters);
> > +         CHECK(special_restrictions_for_mixed_float_mode);
> >           CHECK(region_alignment_rules);
> >           CHECK(vector_immediate_restrictions);
> >           CHECK(special_requirements_for_handling_double_precision_data_types);
> > diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp
> > index 2e06da2f5b4..f2f0750719f 100644
> > --- a/src/intel/compiler/test_eu_validate.cpp
> > +++ b/src/intel/compiler/test_eu_validate.cpp
> > @@ -1019,6 +1019,637 @@ TEST_P(validation_test, half_float_conversion)
> >     }
> >  }
> >  
> > +TEST_P(validation_test, mixed_float_source_indirect_addressing)
> > +{
> > +   static const struct {
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      unsigned dst_stride;
> > +      bool dst_indirect;
> > +      bool src0_indirect;
> > +      bool expected_result;
> > +   } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type,                              \
> > +             dst_stride, dst_indirect, src0_indirect, expected_result)    \
> > +      {                                                                   \
> > +         BRW_REGISTER_TYPE_##dst_type,                                    \
> > +         BRW_REGISTER_TYPE_##src0_type,                                   \
> > +         BRW_REGISTER_TYPE_##src1_type,                                   \
> > +         BRW_HORIZONTAL_STRIDE_##dst_stride,                              \
> > +         dst_indirect,                                                    \
> > +         src0_indirect,                                                   \
> > +         expected_result,                                                 \
> > +      }
> > +
> > +      /* Source and dest are mixed float: indirect src addressing not allowed */
> > +      INST(HF,  F,  F, 2, false, false, true),
> > +      INST(HF,  F,  F, 2, true,  false, true),
> > +      INST(HF,  F,  F, 2, false, true,  false),
> > +      INST(HF,  F,  F, 2, true,  true,  false),
> > +      INST( F, HF,  F, 1, false, false, true),
> > +      INST( F, HF,  F, 1, true,  false, true),
> > +      INST( F, HF,  F, 1, false, true,  false),
> > +      INST( F, HF,  F, 1, true,  true,  false),
> > +
> > +      /* Source and dest are not mixed float: indirect src addressing allowed */
> > +      INST(HF, HF,  F, 2, false, false, true),
> > +      INST(HF, HF,  F, 2, true,  false, true),
> > +      INST(HF, HF,  F, 2, false, true,  true),
> > +      INST(HF, HF,  F, 2, true,  true,  true),
> > +      INST( F,  F, HF, 1, false, false, true),
> > +      INST( F,  F, HF, 1, true,  false, true),
> > +      INST( F,  F, HF, 1, false, true,  true),
> > +      INST( F,  F, HF, 1, true,  true,  true),
> > +
> > +#undef INST
> > +   };
> > +
> > +   if (devinfo.gen < 8)
> > +      return;
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      brw_ADD(p, retype(g0, inst[i].dst_type),
> > +                 retype(g0, inst[i].src0_type),
> > +                 retype(g0, inst[i].src1_type));
> > +
> > +      brw_inst_set_dst_address_mode(&devinfo, last_inst, inst[i].dst_indirect);
> > +      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +      brw_inst_set_src0_address_mode(&devinfo, last_inst, inst[i].src0_indirect);
> > +
> > +      EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align1_simd16)
> > +{
> > +   static const struct {
> > +      unsigned exec_size;
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      unsigned dst_stride;
> > +      bool expected_result;
> > +   } inst[] = {
> > +#define INST(exec_size, dst_type, src0_type, src1_type,                   \
> > +             dst_stride, expected_result)                                 \
> > +      {                                                                   \
> > +         BRW_EXECUTE_##exec_size,                                         \
> > +         BRW_REGISTER_TYPE_##dst_type,                                    \
> > +         BRW_REGISTER_TYPE_##src0_type,                                   \
> > +         BRW_REGISTER_TYPE_##src1_type,                                   \
> > +         BRW_HORIZONTAL_STRIDE_##dst_stride,                              \
> > +         expected_result,                                                 \
> > +      }
> > +
> > +      /* No SIMD16 in mixed mode when destination is packed f16 */
> > +      INST( 8, HF,  F, HF, 2, true),
> > +      INST(16, HF, HF,  F, 2, true),
> > +      INST(16, HF, HF,  F, 1, false),
> > +      INST(16, HF,  F, HF, 1, false),
> > +
> > +      /* No SIMD16 in mixed mode when destination is f32 */
> > +      INST( 8,  F, HF,  F, 1, true),
> > +      INST( 8,  F,  F, HF, 1, true),
> > +      INST(16,  F, HF,  F, 1, false),
> > +      INST(16,  F,  F, HF, 1, false),
> > +
> > +#undef INST
> > +   };
> > +
> > +   if (devinfo.gen < 8)
> > +      return;
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      brw_ADD(p, retype(g0, inst[i].dst_type),
> > +                 retype(g0, inst[i].src0_type),
> > +                 retype(g0, inst[i].src1_type));
> > +
> > +      brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
> > +
> > +      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +
> > +      EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align1_packed_fp16_dst_acc_read_offset_0)
> > +{
> > +   static const struct {
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      unsigned dst_stride;
> > +      bool read_acc;
> > +      unsigned subnr;
> > +      bool expected_result_bdw;
> > +      bool expected_result_chv_skl;
> > +   } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type, dst_stride, read_acc, subnr,   \
> > +             expected_result_bdw, expected_result_chv_skl)                  \
> > +      {                                                                     \
> > +         BRW_REGISTER_TYPE_##dst_type,                                      \
> > +         BRW_REGISTER_TYPE_##src0_type,                                     \
> > +         BRW_REGISTER_TYPE_##src1_type,                                     \
> > +         BRW_HORIZONTAL_STRIDE_##dst_stride,                                \
> > +         read_acc,                                                          \
> > +         subnr,                                                             \
> > +         expected_result_bdw,                                               \
> > +         expected_result_chv_skl,                                           \
> > +      }
> > +
> > +      /* Destination is not packed */
> > +      INST(HF, HF,  F, 2, true,  0, true, true),
> > +      INST(HF, HF,  F, 2, true,  2, true, true),
> > +      INST(HF, HF,  F, 2, true,  4, true, true),
> > +      INST(HF, HF,  F, 2, true,  8, true, true),
> > +      INST(HF, HF,  F, 2, true, 16, true, true),
> > +
> > +      /* Destination is packed, we don't read acc */
> > +      INST(HF, HF,  F, 1, false,  0, false, true),
> > +      INST(HF, HF,  F, 1, false,  2, false, true),
> > +      INST(HF, HF,  F, 1, false,  4, false, true),
> > +      INST(HF, HF,  F, 1, false,  8, false, true),
> > +      INST(HF, HF,  F, 1, false, 16, false, true),
> > +
> > +      /* Destination is packed, we read acc */
> > +      INST(HF, HF,  F, 1, true,  0, false, false),
> > +      INST(HF, HF,  F, 1, true,  2, false, false),
> > +      INST(HF, HF,  F, 1, true,  4, false, false),
> > +      INST(HF, HF,  F, 1, true,  8, false, false),
> > +      INST(HF, HF,  F, 1, true, 16, false, false),
> > +
> > +#undef INST
> > +   };
> > +
> > +   if (devinfo.gen < 8)
> > +      return;
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      brw_ADD(p, retype(g0, inst[i].dst_type),
> > +                 retype(inst[i].read_acc ? acc0 : g0, inst[i].src0_type),
> > +                 retype(g0, inst[i].src1_type));
> > +
> > +      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +
> > +      brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, inst[i].subnr);
> > +
> > +      if (devinfo.is_cherryview || devinfo.gen >= 9)
> > +         EXPECT_EQ(inst[i].expected_result_chv_skl, validate(p));
> > +      else
> > +         EXPECT_EQ(inst[i].expected_result_bdw, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_fp16_dest_with_acc)
> > +{
> > +   static const struct {
> > +      unsigned exec_size;
> > +      unsigned opcode;
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      unsigned dst_stride;
> > +      bool read_acc;
> > +      bool expected_result_bdw;
> > +      bool expected_result_chv_skl;
> > +   } inst[] = {
> > +#define INST(exec_size, opcode, dst_type, src0_type, src1_type,           \
> > +             dst_stride, read_acc,expected_result_bdw,                    \
> > +             expected_result_chv_skl)                                     \
> > +      {                                                                   \
> > +         BRW_EXECUTE_##exec_size,                                         \
> > +         BRW_OPCODE_##opcode,                                             \
> > +         BRW_REGISTER_TYPE_##dst_type,                                    \
> > +         BRW_REGISTER_TYPE_##src0_type,                                   \
> > +         BRW_REGISTER_TYPE_##src1_type,                                   \
> > +         BRW_HORIZONTAL_STRIDE_##dst_stride,                              \
> > +         read_acc,                                                        \
> > +         expected_result_bdw,                                             \
> > +         expected_result_chv_skl,                                         \
> > +      }
> > +
> > +      /* Packed fp16 dest with implicit acc needs hstride=2 */
> > +      INST(8, MAC, HF, HF,  F, 1, false, false, false),
> > +      INST(8, MAC, HF, HF,  F, 2, false, true,  true),
> > +      INST(8, MAC, HF,  F, HF, 1, false, false, false),
> > +      INST(8, MAC, HF,  F, HF, 2, false, true,  true),
> > +
> > +      /* Packed fp16 dest with explicit acc needs hstride=2 */
> > +      INST(8, ADD, HF, HF,  F, 1, true,  false, false),
> > +      INST(8, ADD, HF, HF,  F, 2, true,  true,  true),
> > +      INST(8, ADD, HF,  F, HF, 1, true,  false, false),
> > +      INST(8, ADD, HF,  F, HF, 2, true,  true,  true),
> > +
> > +      /* If destination is not fp16, restriction doesn't apply */
> > +      INST(8, MAC,  F, HF,  F, 1, false, true, true),
> > +      INST(8, MAC,  F, HF,  F, 2, false, true, true),
> > +
> > +      /* If there is no implicit/explicit acc, restriction doesn't apply */
> > +      INST(8, ADD, HF, HF,  F, 1, false, false, true),
> > +      INST(8, ADD, HF, HF,  F, 2, false, true,  true),
> > +      INST(8, ADD, HF,  F, HF, 1, false, false, true),
> > +      INST(8, ADD, HF,  F, HF, 2, false, true,  true),
> > +      INST(8, ADD,  F, HF,  F, 1, false, true,  true),
> > +      INST(8, ADD,  F, HF,  F, 2, false, true,  true),
> > +
> > +#undef INST
> > +   };
> > +
> > +   if (devinfo.gen < 8)
> > +      return;
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      if (inst[i].opcode == BRW_OPCODE_MAC) {
> > +         brw_MAC(p, retype(g0, inst[i].dst_type),
> > +                    retype(g0, inst[i].src0_type),
> > +                    retype(g0, inst[i].src1_type));
> > +      } else {
> > +         assert(inst[i].opcode == BRW_OPCODE_ADD);
> > +         brw_ADD(p, retype(g0, inst[i].dst_type),
> > +                    retype(inst[i].read_acc ? acc0: g0, inst[i].src0_type),
> > +                    retype(g0, inst[i].src1_type));
> > +      }
> > +
> > +      brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
> > +
> > +      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +
> > +      if (devinfo.is_cherryview || devinfo.gen >= 9)
> > +         EXPECT_EQ(inst[i].expected_result_chv_skl, validate(p));
> > +      else
> > +         EXPECT_EQ(inst[i].expected_result_bdw, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align1_math_strided_fp16_inputs)
> > +{
> > +   static const struct {
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      unsigned dst_stride;
> > +      unsigned src0_stride;
> > +      unsigned src1_stride;
> > +      bool expected_result;
> > +   } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type,                              \
> > +             dst_stride, src0_stride, src1_stride, expected_result)       \
> > +      {                                                                   \
> > +         BRW_REGISTER_TYPE_##dst_type,                                    \
> > +         BRW_REGISTER_TYPE_##src0_type,                                   \
> > +         BRW_REGISTER_TYPE_##src1_type,                                   \
> > +         BRW_HORIZONTAL_STRIDE_##dst_stride,                              \
> > +         BRW_HORIZONTAL_STRIDE_##src0_stride,                             \
> > +         BRW_HORIZONTAL_STRIDE_##src1_stride,                             \
> > +         expected_result,                                                 \
> > +      }
> > +
> > +      INST(HF, HF,  F, 2, 2, 1, true),
> > +      INST(HF,  F, HF, 2, 1, 2, true),
> > +      INST(HF,  F, HF, 1, 1, 2, true),
> > +      INST(HF,  F, HF, 2, 1, 1, false),
> > +      INST(HF, HF,  F, 2, 1, 1, false),
> > +      INST(HF, HF,  F, 1, 1, 1, false),
> > +      INST(HF, HF,  F, 2, 1, 1, false),
> > +      INST( F, HF,  F, 1, 1, 1, false),
> > +      INST( F,  F, HF, 1, 1, 2, true),
> > +      INST( F, HF, HF, 1, 2, 1, false),
> > +      INST( F, HF, HF, 1, 2, 2, true),
> > +
> > +#undef INST
> > +   };
> > +
> > +   /* No half-float math in gen8 */
> > +   if (devinfo.gen < 9)
> > +      return;
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      gen6_math(p, retype(g0, inst[i].dst_type),
> > +                   BRW_MATH_FUNCTION_POW,
> > +                   retype(g0, inst[i].src0_type),
> > +                   retype(g0, inst[i].src1_type));
> > +
> > +      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +
> > +      brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +      brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
> > +      brw_inst_set_src0_hstride(&devinfo, last_inst, inst[i].src0_stride);
> > +
> > +      brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +      brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
> > +      brw_inst_set_src1_hstride(&devinfo, last_inst, inst[i].src1_stride);
> > +
> > +      EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align1_packed_fp16_dst)
> > +{ /* Table-driven check of validate() on ADD instructions that mix F and
> > +   * HF operands, varying the execution size, the destination type, the
> > +   * destination horizontal stride and the destination subregister number.
> > +   * Each row carries two expectations: expected_result_chv_skl is used
> > +   * when devinfo.is_cherryview is set or devinfo.gen >= 9, and
> > +   * expected_result_bdw is used otherwise. Nothing is tested when
> > +   * devinfo.gen < 8.
> > +   *
> > +   * NOTE(review): dst_subnr is presumably in bytes, since the rows
> > +   * described as oword-aligned use 0 and 16 -- confirm against
> > +   * brw_inst_set_dst_da1_subreg_nr().
> > +   */
> > +   static const struct {
> > +      unsigned exec_size;
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      unsigned dst_stride;
> > +      unsigned dst_subnr;
> > +      bool expected_result_bdw;
> > +      bool expected_result_chv_skl;
> > +   } inst[] = {
> > +#define INST(exec_size, dst_type, src0_type, src1_type, dst_stride, dst_subnr, \
> > +             expected_result_bdw, expected_result_chv_skl)                     \
> > +      {                                                                        \
> > +         BRW_EXECUTE_##exec_size,                                              \
> > +         BRW_REGISTER_TYPE_##dst_type,                                         \
> > +         BRW_REGISTER_TYPE_##src0_type,                                        \
> > +         BRW_REGISTER_TYPE_##src1_type,                                        \
> > +         BRW_HORIZONTAL_STRIDE_##dst_stride,                                   \
> > +         dst_subnr,                                                            \
> > +         expected_result_bdw,                                                  \
> > +         expected_result_chv_skl                                               \
> > +      }
> > +
> > +      /* SIMD8 packed fp16 dst won't cross oword boundaries if region is
> > +       * oword-aligned
> > +       */
> > +      INST( 8, HF, HF,  F, 1,  0, false, true),
> > +      INST( 8, HF, HF,  F, 1,  2, false, false),
> > +      INST( 8, HF, HF,  F, 1,  4, false, false),
> > +      INST( 8, HF, HF,  F, 1,  8, false, false),
> > +      INST( 8, HF, HF,  F, 1, 16, false, true),
> > +
> > +      /* SIMD16 packed fp16 always crosses oword boundaries */
> > +      INST(16, HF, HF,  F, 1,  0, false, false),
> > +      INST(16, HF, HF,  F, 1,  2, false, false),
> > +      INST(16, HF, HF,  F, 1,  4, false, false),
> > +      INST(16, HF, HF,  F, 1,  8, false, false),
> > +      INST(16, HF, HF,  F, 1, 16, false, false),
> > +
> > +      /* If destination is not packed (or not fp16) we can cross oword
> > +       * boundaries
> > +       */
> > +      INST( 8, HF, HF,  F, 2,  0, true, true),
> > +      INST( 8,  F, HF,  F, 1,  0, true, true),
> > +
> > +#undef INST
> > +   };
> > +
> > +   if (devinfo.gen < 8)
> > +      return;
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      brw_ADD(p, retype(g0, inst[i].dst_type),
> > +                 retype(g0, inst[i].src0_type),
> > +                 retype(g0, inst[i].src1_type));
> > +
> > +      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +      brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, inst[i].dst_subnr);
> > +
> > +      brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +      brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
> > +      brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
> > +
> > +      brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +      brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
> > +      brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
> > +
> > +      brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
> > +
> > +      if (devinfo.is_cherryview || devinfo.gen >= 9)
> > +         EXPECT_EQ(inst[i].expected_result_chv_skl, validate(p));
> > +      else
> > +         EXPECT_EQ(inst[i].expected_result_bdw, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align16_packed_data)
> > +{ /* Table-driven check of validate() on Align16 ADD instructions with an
> > +   * F destination and one F plus one HF source, varying the vertical
> > +   * stride of each source. In the table below, only vertical stride 4 is
> > +   * expected to pass for the HF source, while the F source is expected to
> > +   * pass with a vertical stride of 4 or 0; a vertical stride of 2 is
> > +   * expected to fail on either source. Nothing is tested when
> > +   * devinfo.gen < 8 or devinfo.gen >= 11.
> > +   */
> > +   static const struct {
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      unsigned src0_vstride;
> > +      unsigned src1_vstride;
> > +      bool expected_result;
> > +   } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type,                              \
> > +             src0_vstride, src1_vstride, expected_result)                 \
> > +      {                                                                   \
> > +         BRW_REGISTER_TYPE_##dst_type,                                    \
> > +         BRW_REGISTER_TYPE_##src0_type,                                   \
> > +         BRW_REGISTER_TYPE_##src1_type,                                   \
> > +         BRW_VERTICAL_STRIDE_##src0_vstride,                              \
> > +         BRW_VERTICAL_STRIDE_##src1_vstride,                              \
> > +         expected_result,                                                 \
> > +      }
> > +
> > +      /* We only test with F destination because there is a restriction
> > +       * by which F->HF conversions need to be DWord aligned but Align16 also
> > +       * requires that destination horizontal stride is 1.
> > +       */
> > +      INST(F,  F, HF, 4, 4, true),
> > +      INST(F,  F, HF, 2, 4, false),
> > +      INST(F,  F, HF, 4, 2, false),
> > +      INST(F,  F, HF, 0, 4, true),
> > +      INST(F,  F, HF, 4, 0, false),
> > +      INST(F, HF,  F, 4, 4, true),
> > +      INST(F, HF,  F, 4, 2, false),
> > +      INST(F, HF,  F, 2, 4, false),
> > +      INST(F, HF,  F, 0, 4, false),
> > +      INST(F, HF,  F, 4, 0, true),
> > +
> > +#undef INST
> > +   };
> > +
> > +   if (devinfo.gen < 8 || devinfo.gen >= 11)
> > +      return;
> > +
> > +   brw_set_default_access_mode(p, BRW_ALIGN_16);
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      brw_ADD(p, retype(g0, inst[i].dst_type),
> > +                 retype(g0, inst[i].src0_type),
> > +                 retype(g0, inst[i].src1_type));
> > +
> > +      brw_inst_set_src0_vstride(&devinfo, last_inst, inst[i].src0_vstride);
> > +      brw_inst_set_src1_vstride(&devinfo, last_inst, inst[i].src1_vstride);
> > +
> > +      EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align16_no_simd16)
> > +{ /* Table-driven check of validate() on Align16 ADD instructions with an
> > +   * F destination and one F plus one HF source, varying only the
> > +   * execution size: every BRW_EXECUTE_8 row expects success and every
> > +   * BRW_EXECUTE_16 row expects failure. Nothing is tested when
> > +   * devinfo.gen < 8 or devinfo.gen >= 11.
> > +   *
> > +   * NOTE(review): for each execution size the first and third rows are
> > +   * identical (F, F, HF); possibly a different type combination was
> > +   * intended for one of them -- confirm.
> > +   */
> > +   static const struct {
> > +      unsigned exec_size;
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      bool expected_result;
> > +   } inst[] = {
> > +#define INST(exec_size, dst_type, src0_type, src1_type, expected_result)  \
> > +      {                                                                   \
> > +         BRW_EXECUTE_##exec_size,                                         \
> > +         BRW_REGISTER_TYPE_##dst_type,                                    \
> > +         BRW_REGISTER_TYPE_##src0_type,                                   \
> > +         BRW_REGISTER_TYPE_##src1_type,                                   \
> > +         expected_result,                                                 \
> > +      }
> > +
> > +      /* We only test with F destination because there is a restriction
> > +       * by which F->HF conversions need to be DWord aligned but Align16 also
> > +       * requires that destination horizontal stride is 1.
> > +       */
> > +      INST( 8,  F,  F, HF, true),
> > +      INST( 8,  F, HF,  F, true),
> > +      INST( 8,  F,  F, HF, true),
> > +      INST(16,  F,  F, HF, false),
> > +      INST(16,  F, HF,  F, false),
> > +      INST(16,  F,  F, HF, false),
> > +
> > +#undef INST
> > +   };
> > +
> > +   if (devinfo.gen < 8 || devinfo.gen >= 11)
> > +      return;
> > +
> > +   brw_set_default_access_mode(p, BRW_ALIGN_16);
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      brw_ADD(p, retype(g0, inst[i].dst_type),
> > +                 retype(g0, inst[i].src0_type),
> > +                 retype(g0, inst[i].src1_type));
> > +
> > +      brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
> > +
> > +      brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +      brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +
> > +      EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align16_no_acc_read)
> > +{ /* Table-driven check of validate() on Align16 ADD instructions with an
> > +   * F destination and one F plus one HF source. When read_acc is set,
> > +   * src0 is acc0 instead of g0, and those rows expect failure; src1 is
> > +   * always g0. Nothing is tested when devinfo.gen < 8 or
> > +   * devinfo.gen >= 11.
> > +   */
> > +   static const struct {
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      bool read_acc;
> > +      bool expected_result;
> > +   } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type, read_acc, expected_result)   \
> > +      {                                                                   \
> > +         BRW_REGISTER_TYPE_##dst_type,                                    \
> > +         BRW_REGISTER_TYPE_##src0_type,                                   \
> > +         BRW_REGISTER_TYPE_##src1_type,                                   \
> > +         read_acc,                                                        \
> > +         expected_result,                                                 \
> > +      }
> > +
> > +      /* We only test with F destination because there is a restriction
> > +       * by which F->HF conversions need to be DWord aligned but Align16 also
> > +       * requires that destination horizontal stride is 1.
> > +       */
> > +      INST( F,  F, HF, false, true),
> > +      INST( F,  F, HF, true,  false),
> > +      INST( F, HF,  F, false, true),
> > +      INST( F, HF,  F, true,  false),
> > +
> > +#undef INST
> > +   };
> > +
> > +   if (devinfo.gen < 8 || devinfo.gen >= 11)
> > +      return;
> > +
> > +   brw_set_default_access_mode(p, BRW_ALIGN_16);
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      brw_ADD(p, retype(g0, inst[i].dst_type),
> > +                 retype(inst[i].read_acc ? acc0 : g0, inst[i].src0_type),
> > +                 retype(g0, inst[i].src1_type));
> > +
> > +      brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +      brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +
> > +      EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align16_math_packed_format)
> > +{ /* Table-driven check of validate() on Align16 POW math instructions
> > +   * (emitted with gen6_math) with an F destination, varying the source
> > +   * types and their vertical strides. The only row expected to pass has
> > +   * two HF sources, both with vertical stride 4.
> > +   */
> > +   static const struct {
> > +      enum brw_reg_type dst_type;
> > +      enum brw_reg_type src0_type;
> > +      enum brw_reg_type src1_type;
> > +      unsigned src0_vstride;
> > +      unsigned src1_vstride;
> > +      bool expected_result;
> > +   } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type,                              \
> > +             src0_vstride, src1_vstride, expected_result)                 \
> > +      {                                                                   \
> > +         BRW_REGISTER_TYPE_##dst_type,                                    \
> > +         BRW_REGISTER_TYPE_##src0_type,                                   \
> > +         BRW_REGISTER_TYPE_##src1_type,                                   \
> > +         BRW_VERTICAL_STRIDE_##src0_vstride,                              \
> > +         BRW_VERTICAL_STRIDE_##src1_vstride,                              \
> > +         expected_result,                                                 \
> > +      }
> > +
> > +      /* We only test with F destination because there is a restriction
> > +       * by which F->HF conversions need to be DWord aligned but Align16 also
> > +       * requires that destination horizontal stride is 1.
> > +       */
> > +      INST( F, HF,  F, 4, 0, false),
> > +      INST( F, HF, HF, 4, 4, true),
> > +      INST( F,  F, HF, 4, 0, false),
> > +      INST( F,  F, HF, 2, 4, false),
> > +      INST( F,  F, HF, 4, 2, false),
> > +      INST( F, HF, HF, 0, 4, false),
> > +
> > +#undef INST
> > +   };
> > +
> > +   /* Align16 Math for mixed float mode is not supported in gen8; the
> > +    * guard below also skips the test when devinfo.gen >= 11.
> > +    */
> > +   if (devinfo.gen < 9 || devinfo.gen >= 11)
> > +      return;
> > +
> > +   brw_set_default_access_mode(p, BRW_ALIGN_16);
> > +
> > +   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > +      gen6_math(p, retype(g0, inst[i].dst_type),
> > +                   BRW_MATH_FUNCTION_POW,
> > +                   retype(g0, inst[i].src0_type),
> > +                   retype(g0, inst[i].src1_type));
> > +
> > +      brw_inst_set_src0_vstride(&devinfo, last_inst, inst[i].src0_vstride);
> > +      brw_inst_set_src1_vstride(&devinfo, last_inst, inst[i].src1_vstride);
> > +
> > +      EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > +      clear_instructions(p);
> > +   }
> > +}
> > +
> >  TEST_P(validation_test, vector_immediate_destination_alignment)
> >  {
> >     static const struct {
> > -- 
> > 2.20.1