[Mesa-dev] [PATCH v5 35/38] intel/compiler: validate region restrictions for mixed float mode
Juan A. Suarez Romero
jasuarez at igalia.com
Mon Apr 1 14:50:29 UTC 2019
On Wed, 2019-03-27 at 19:37 -0700, Francisco Jerez wrote:
> "Juan A. Suarez Romero" <jasuarez at igalia.com> writes:
>
> > From: Iago Toral Quiroga <itoral at igalia.com>
> >
> > v2:
> > - Adapted unit tests to make them consistent with the changes done
> > to the validation of half-float conversions.
> > ---
> > src/intel/compiler/brw_eu_validate.c | 256 ++++++++++
> > src/intel/compiler/test_eu_validate.cpp | 620 ++++++++++++++++++++++++
> > 2 files changed, 876 insertions(+)
> >
> > diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c
> > index 18c95efb05b..5eea02f5c94 100644
> > --- a/src/intel/compiler/brw_eu_validate.c
> > +++ b/src/intel/compiler/brw_eu_validate.c
> > @@ -170,6 +170,13 @@ src1_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
> > brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
> > }
> >
> > +static bool
> > +src0_is_acc(const struct gen_device_info *devinfo, const brw_inst *inst)
> > +{
> > + return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
> > + brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_ACCUMULATOR;
> > +}
> > +
>
> There are multiple accumulator registers. The above only checks for
> acc0.
>
static bool
> > src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
> > {
> > @@ -937,6 +944,254 @@ general_restrictions_on_region_parameters(const struct gen_device_info *devinfo,
> > return error_msg;
> > }
> >
> > +static struct string
> > +special_restrictions_for_mixed_float_mode(const struct gen_device_info *devinfo,
> > + const brw_inst *inst)
> > +{
> > + struct string error_msg = { .str = NULL, .len = 0 };
> > +
> > + unsigned opcode = brw_inst_opcode(devinfo, inst);
>
> Constify this and the declarations below.
>
> > + unsigned num_sources = num_sources_from_inst(devinfo, inst);
> > + if (num_sources >= 3)
> > + return error_msg;
> > +
> > + if (!is_mixed_float(devinfo, inst))
> > + return error_msg;
> > +
> > + unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
> > + bool is_align16 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16;
> > +
> > + enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
> > + enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);
>
> Same comment as in the previous patch, this can possibly blow up for
> instructions with less than two sources.
>
> > + enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);
> > +
> > + unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
> > + bool dst_is_packed = is_packed(exec_size * dst_stride, exec_size, dst_stride);
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "Indirect addressing on source is not supported when source and
> > + * destination data types are mixed float."
> > + *
> > + * Indirect addressing is only supported on the first source, so we only
> > + * check that.
>
> I don't think that's true. The hardware spec has the following example
> of a valid but kind of funky instruction with indirect regioning:
>
> > add (16) r[a0.0]:f r[a0.2]:f r[a0.4]:f
Interesting, because looking at the driver implementation of
brw_set_src1, for anything that is not an immediate we do:
/* This is a hardware restriction, which may or may not be lifted
* in the future:
*/
assert (reg.address_mode == BRW_ADDRESS_DIRECT);
I guess this assertion was written for older platforms then?
> > + */
> > + ERROR_IF(types_are_mixed_float(dst_type, src0_type) &&
>
> I doubt that it makes a difference whether there is a mismatch between
> the type of src0 and the type of the destination for indirect addressing
> to be disallowed. Things are likely to blow up with indirect addressing
> in src0 even if the instruction is mixed-mode due to the effect of the
> type of src1 alone. But it's hard to tell for sure, spec wording seems
> fairly ambiguous...
>
> > + brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT,
> > + "Indirect addressing on source is not supported when source and "
> > + "destination data types are mixed float");
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "No SIMD16 in mixed mode when destination is f32. Instruction
> > + * execution size must be no more than 8."
> > + */
> > + ERROR_IF(exec_size > 8 && dst_type == BRW_REGISTER_TYPE_F,
> > + "Mixed float mode with 32-bit float destination is limited "
> > + "to SIMD8");
> > +
> > + if (is_align16) {
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "In Align16 mode, when half float and float data types are mixed
> > + * between source operands OR between source and destination operands,
> > + * the register content are assumed to be packed."
> > + *
> > + * Since Align16 doesn't have a concept of horizontal stride (or width),
> > + * it means that vertical stride must always be 4, since 0 and 2 would
> > + * lead to replicated data, and any other value is disallowed in Align16.
> > + * However, the PRM also says:
> > + *
> > + * "In Align16, vertical stride can never be zero for f16"
> > + *
> > + * Which is oddly redundant and specific considering the more general
> > + * assumption that all operands are assumed to be packed, so we
> > + * understand that this might be hinting that there may be an exception
> > + * for f32 operands with a vstride of 0, so we don't validate this for
> > + * them while we don't have empirical evidence that it is forbidden.
> > + */
> > + ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
> > + (src0_type != BRW_REGISTER_TYPE_F ||
> > + brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
> > + "Align16 mixed float mode assumes packed data (vstride must "
> > + "be 4 -or 0 for f32 operands-)");
> > +
> > + ERROR_IF(num_sources >= 2 &&
> > + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
> > + (src1_type != BRW_REGISTER_TYPE_F ||
> > + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
> > + "Align16 mixed float mode assumes packed data (vstride must "
> > + "be 4 -or 0 for f32 operands-)");
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "For Align16 mixed mode, both input and output packed f16 data
> > + * must be oword aligned, no oword crossing in packed f16."
> > + *
> > + * The previous rule requires that Align16 operands are always packed,
> > + * and since there is only one bit for Align16 subnr, which represents
> > + * offsets 0B and 16B, this rule is always enforced and we don't need to
> > + * validate it.
> > + */
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "No SIMD16 in mixed mode when destination is packed f16 for both
> > + * Align1 and Align16."
> > + *
> > + * And:
> > + *
> > + * "In Align16 mode, when half float and float data types are mixed
> > + * between source operands OR between source and destination operands,
> > + * the register content are assumed to be packed."
> > + *
> > + * Which implies that SIMD16 is not available in Align16. This is further
> > + * confirmed by:
> > + *
> > + * "For Align16 mixed mode, both input and output packed f16 data
> > + * must be oword aligned, no oword crossing in packed f16"
> > + *
> > + * Since oword-aligned packed f16 data would cross oword boundaries when
> > + * the execution size is larger than 8.
> > + */
> > + ERROR_IF(exec_size > 8, "Align16 mixed float mode is limited to SIMD8");
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "Math operations for mixed mode:
> > + * - In Align16, only packed format is supported"
> > + *
> > + * It is not clear what this is restricting since as stated in previous
> > + * spec quotes, Align16 always assumes packed data. However, since
> > + * we are allowing vstride of 0 on f32, we check again here without that
> > + * exception.
> > + */
> > + if (opcode == BRW_OPCODE_MATH) {
> > + ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
> > + "Align16 mixed float mode math only supports packed format");
> > +
> > + ERROR_IF(num_sources >= 2 &&
> > + brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
> > + "Align16 mixed float mode math only supports packed format");
> > + }
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "No accumulator read access for Align16 mixed float."
> > + *
> > + * Explicit accumulator reads are only supported on the first source,
> > + * so we only check that.
>
> The restriction on explicit accumulator access being allowed for the
> first source only is not really universal. Wouldn't hurt to check src1
> in addition.
>
Ok. This came from the emission code in brw_set_src1 again, where we
have this:
/* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
*
* "Accumulator registers may be accessed explicitly as src0
* operands only."
*/
assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
reg.nr != BRW_ARF_ACCUMULATOR);
which we are actually applying to all platforms again... :-/
I guess this assert there is not quite correct then.
In fact, I see that restriction in SKL PRM too, but not in KBL. So I guess it is
valid only for gens <= SKL
> > + */
> > + ERROR_IF(src0_is_acc(devinfo, inst),
> > + "No accumulator read access for Align16 mixed float");
> > + } else {
> > + assert(!is_align16);
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "No SIMD16 in mixed mode when destination is packed f16 for both
> > + * Align1 and Align16."
> > + */
> > + ERROR_IF(exec_size > 8 && dst_is_packed &&
> > + dst_type == BRW_REGISTER_TYPE_HF,
> > + "Align1 mixed float mode is limited to SIMD8 when destination "
> > + "is packed half-float");
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "Math operations for mixed mode:
> > + * - In Align1, f16 inputs need to be strided"
> > + */
> > + if (opcode == BRW_OPCODE_MATH) {
> > + if (src0_type == BRW_REGISTER_TYPE_HF) {
> > + ERROR_IF(STRIDE(brw_inst_src0_hstride(devinfo, inst)) <= 1,
> > + "Align1 mixed mode math needs strided half-float inputs");
> > + }
> > +
> > + if (num_sources >= 2 && src1_type == BRW_REGISTER_TYPE_HF) {
> > + ERROR_IF(STRIDE(brw_inst_src1_hstride(devinfo, inst)) <= 1,
> > + "Align1 mixed mode math needs strided half-float inputs");
> > + }
> > + }
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "In Align1, destination stride can be smaller than execution
> > + * type. When destination is stride of 1, 16 bit packed data is
> > + * updated on the destination. However, output packed f16 data
> > + * must be oword aligned, no oword crossing in packed f16."
> > + *
> > + * The requirement of not crossing oword boundaries for 16-bit oword
> > + * aligned data means that execution size is limited to 8.
> > + */
> > + if (dst_type == BRW_REGISTER_TYPE_HF && dst_stride == 1) {
> > + unsigned subreg;
> > + if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT)
> > + subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
> > + else
> > + subreg = brw_inst_dst_ia_subreg_nr(devinfo, inst);
> > + ERROR_IF(subreg % 16 != 0,
> > + "Align1 mixed mode packed half-float output must be "
> > + "oword aligned");
> > + ERROR_IF(exec_size > 8,
> > + "Align1 mixed mode packed half-float output must not "
> > + "cross oword boundaries (max exec size is 8)");
> > + }
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "When source is float or half float from accumulator register and
> > + * destination is half float with a stride of 1, the source must
> > + * register aligned. i.e., source must have offset zero."
> > + *
> > + * Align16 mixed float mode doesn't allow accumulator access on sources,
> > + * so we only need to check this for Align1. Also, explicit accumulator
> > + * reads are only supported on the first source, so we only check that.
>
> Same comment as above regarding checking accumulator access for src1.
>
> > + */
> > + if (dst_type == BRW_REGISTER_TYPE_HF && dst_stride == 1) {
>
> You could eliminate one level of nesting here, or combine the outer
> conditional block with the previous conditional.
>
> > + if (src0_is_acc(devinfo, inst) &&
> > + (src0_type == BRW_REGISTER_TYPE_F ||
> > + src0_type == BRW_REGISTER_TYPE_HF)) {
> > + ERROR_IF(brw_inst_src0_da1_subreg_nr(devinfo, inst) != 0,
> > + "Mixed float mode requires register-aligned accumulator "
> > + "source reads when destination is packed half-float");
> > + }
> > + }
> > +
> > + /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
> > + * Float Operations:
> > + *
> > + * "No swizzle is allowed when an accumulator is used as an implicit
> > + * source or an explicit source in an instruction. i.e. when
> > + * destination is half float with an implicit accumulator source,
> > + * destination stride needs to be 2."
> > + *
> > + * FIXME: it is not quite clear what the first sentence actually means
> > + * or its link to the implication described after it, so we only
> > + * validate the explicit implication, which is clearly described.
>
> I suspect that with "No swizzle is allowed" they mean that the sources
> and destination need to be aligned to the size of the execution size.
> But I'm okay with only checking the destination since the spec wording
> is pretty vague...
>
> > + */
> > + if (dst_type == BRW_REGISTER_TYPE_HF && opcode == BRW_OPCODE_MAC) {
>
> There are other instructions that read the accumulator (implicitly or
> explicitly) beyond the MAC instruction.
>
> > + ERROR_IF(dst_stride != 2,
> > + "Mixed float mode with implicit accumulator source and "
> > + "half-float destination requires a stride of 2 on the "
> > + "destination");
> > + }
> > + }
> > +
> > + return error_msg;
> > +}
> > +
> > /**
> > * Creates an \p access_mask for an \p exec_size, \p element_size, and a region
> > *
> > @@ -1575,6 +1830,7 @@ brw_validate_instructions(const struct gen_device_info *devinfo,
> > CHECK(send_restrictions);
> > CHECK(general_restrictions_based_on_operand_types);
> > CHECK(general_restrictions_on_region_parameters);
> > + CHECK(special_restrictions_for_mixed_float_mode);
> > CHECK(region_alignment_rules);
> > CHECK(vector_immediate_restrictions);
> > CHECK(special_requirements_for_handling_double_precision_data_types);
> > diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp
> > index bf8a2cfcf12..f7cbb9bb903 100644
> > --- a/src/intel/compiler/test_eu_validate.cpp
> > +++ b/src/intel/compiler/test_eu_validate.cpp
> > @@ -1020,6 +1020,626 @@ TEST_P(validation_test, half_float_conversion)
> > }
> > }
> >
> > +TEST_P(validation_test, mixed_float_source_indirect_addressing)
> > +{
> > + static const struct {
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + unsigned dst_stride;
> > + bool dst_indirect;
> > + bool src0_indirect;
> > + bool expected_result;
> > + } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type, \
> > + dst_stride, dst_indirect, src0_indirect, expected_result) \
> > + { \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + BRW_HORIZONTAL_STRIDE_##dst_stride, \
> > + dst_indirect, \
> > + src0_indirect, \
> > + expected_result, \
> > + }
> > +
> > + /* Source and dest are mixed float: indirect src addressing not allowed */
> > + INST(HF, F, F, 2, false, false, true),
> > + INST(HF, F, F, 2, true, false, true),
> > + INST(HF, F, F, 2, false, true, false),
> > + INST(HF, F, F, 2, true, true, false),
> > + INST( F, HF, F, 1, false, false, true),
> > + INST( F, HF, F, 1, true, false, true),
> > + INST( F, HF, F, 1, false, true, false),
> > + INST( F, HF, F, 1, true, true, false),
> > +
> > + /* Source and dest are not mixed float: indirect src addressing allowed */
> > + INST(HF, HF, F, 2, false, false, true),
> > + INST(HF, HF, F, 2, true, false, true),
> > + INST(HF, HF, F, 2, false, true, true),
> > + INST(HF, HF, F, 2, true, true, true),
> > + INST( F, F, HF, 1, false, false, true),
> > + INST( F, F, HF, 1, true, false, true),
> > + INST( F, F, HF, 1, false, true, true),
> > + INST( F, F, HF, 1, true, true, true),
> > +
> > +#undef INST
> > + };
> > +
> > + if (devinfo.gen < 8)
> > + return;
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + brw_ADD(p, retype(g0, inst[i].dst_type),
> > + retype(g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > +
> > + brw_inst_set_dst_address_mode(&devinfo, last_inst, inst[i].dst_indirect);
> > + brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > + brw_inst_set_src0_address_mode(&devinfo, last_inst, inst[i].src0_indirect);
> > +
> > + EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align1_simd16)
> > +{
> > + static const struct {
> > + unsigned exec_size;
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + unsigned dst_stride;
> > + bool expected_result;
> > + } inst[] = {
> > +#define INST(exec_size, dst_type, src0_type, src1_type, \
> > + dst_stride, expected_result) \
> > + { \
> > + BRW_EXECUTE_##exec_size, \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + BRW_HORIZONTAL_STRIDE_##dst_stride, \
> > + expected_result, \
> > + }
> > +
> > + /* No SIMD16 in mixed mode when destination is packed f16 */
> > + INST( 8, HF, F, HF, 2, true),
> > + INST(16, HF, HF, F, 2, true),
>
> Maybe add a known invalid test-case here?
>
> > +
> > + /* No SIMD16 in mixed mode when destination is f32 */
> > + INST( 8, F, HF, F, 1, true),
> > + INST( 8, F, F, HF, 1, true),
> > + INST(16, F, HF, F, 1, false),
> > + INST(16, F, F, HF, 1, false),
> > +
> > +#undef INST
> > + };
> > +
> > + if (devinfo.gen < 8)
> > + return;
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + brw_ADD(p, retype(g0, inst[i].dst_type),
> > + retype(g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > +
> > + brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
> > +
> > + brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +
> > + EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align1_packed_fp16_dst_acc_read_offset_0)
> > +{
> > + static const struct {
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + unsigned dst_stride;
> > + bool read_acc;
> > + unsigned subnr;
> > + bool expected_result_bdw;
> > + bool expected_result_chv_skl;
> > + } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type, dst_stride, read_acc, subnr, \
> > + expected_result_bdw, expected_result_chv_skl) \
> > + { \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + BRW_HORIZONTAL_STRIDE_##dst_stride, \
> > + read_acc, \
> > + subnr, \
> > + expected_result_bdw, \
> > + expected_result_chv_skl, \
> > + }
> > +
> > + /* Destination is not packed */
> > + INST(HF, HF, F, 2, true, 0, true, true),
> > + INST(HF, HF, F, 2, true, 2, true, true),
> > + INST(HF, HF, F, 2, true, 4, true, true),
> > + INST(HF, HF, F, 2, true, 8, true, true),
> > + INST(HF, HF, F, 2, true, 16, true, true),
> > +
> > + /* Destination is packed, we don't read acc */
> > + INST(HF, HF, F, 1, false, 0, false, true),
> > + INST(HF, HF, F, 1, false, 2, false, true),
> > + INST(HF, HF, F, 1, false, 4, false, true),
> > + INST(HF, HF, F, 1, false, 8, false, true),
> > + INST(HF, HF, F, 1, false, 16, false, true),
> > +
> > + /* Destination is packed, we read acc */
> > + INST(HF, HF, F, 1, true, 0, false, true),
> > + INST(HF, HF, F, 1, true, 2, false, false),
> > + INST(HF, HF, F, 1, true, 4, false, false),
> > + INST(HF, HF, F, 1, true, 8, false, false),
> > + INST(HF, HF, F, 1, true, 16, false, false),
> > +
> > +#undef INST
> > + };
> > +
> > + if (devinfo.gen < 8)
> > + return;
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + brw_ADD(p, retype(g0, inst[i].dst_type),
> > + retype(inst[i].read_acc ? acc0 : g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > +
> > + brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +
> > + brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, inst[i].subnr);
> > +
> > + if (devinfo.is_cherryview || devinfo.gen >= 9)
> > + EXPECT_EQ(inst[i].expected_result_chv_skl, validate(p));
> > + else
> > + EXPECT_EQ(inst[i].expected_result_bdw, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_fp16_dest_with_implicit_acc)
> > +{
> > + static const struct {
> > + unsigned exec_size;
> > + unsigned opcode;
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + unsigned dst_stride;
> > + bool expected_result_bdw;
> > + bool expected_result_chv_skl;
> > + } inst[] = {
> > +#define INST(exec_size, opcode, dst_type, src0_type, src1_type, \
> > + dst_stride, expected_result_bdw, expected_result_chv_skl) \
> > + { \
> > + BRW_EXECUTE_##exec_size, \
> > + BRW_OPCODE_##opcode, \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + BRW_HORIZONTAL_STRIDE_##dst_stride, \
> > + expected_result_bdw, \
> > + expected_result_chv_skl, \
> > + }
> > +
> > + /* Packed fp16 dest with implicit acc needs hstride=2 */
> > + INST(8, MAC, HF, HF, F, 1, false, false),
> > + INST(8, MAC, HF, HF, F, 2, true, true),
> > + INST(8, MAC, HF, F, HF, 1, false, false),
> > + INST(8, MAC, HF, F, HF, 2, true, true),
> > +
> > + /* If destination is not fp16, restriction doesn't apply */
> > + INST(8, MAC, F, HF, F, 1, true, true),
> > + INST(8, MAC, F, HF, F, 2, true, true),
> > +
> > + /* If there is no implicit acc, restriction doesn't apply */
> > + INST(8, ADD, HF, HF, F, 1, false, true),
> > + INST(8, ADD, HF, HF, F, 2, true, true),
> > + INST(8, ADD, HF, F, HF, 1, false, true),
> > + INST(8, ADD, HF, F, HF, 2, true, true),
> > + INST(8, ADD, F, HF, F, 1, true, true),
> > + INST(8, ADD, F, HF, F, 2, true, true),
> > +
> > +#undef INST
> > + };
> > +
> > + if (devinfo.gen < 8)
> > + return;
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + if (inst[i].opcode == BRW_OPCODE_MAC) {
> > + brw_MAC(p, retype(g0, inst[i].dst_type),
> > + retype(g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > + } else {
> > + assert(inst[i].opcode == BRW_OPCODE_ADD);
> > + brw_ADD(p, retype(g0, inst[i].dst_type),
> > + retype(g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > + }
> > +
> > + brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
> > +
> > + brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +
> > + if (devinfo.is_cherryview || devinfo.gen >= 9)
> > + EXPECT_EQ(inst[i].expected_result_chv_skl, validate(p));
> > + else
> > + EXPECT_EQ(inst[i].expected_result_bdw, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align1_math_strided_fp16_inputs)
> > +{
> > + static const struct {
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + unsigned dst_stride;
> > + unsigned src0_stride;
> > + unsigned src1_stride;
> > + bool expected_result;
> > + } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type, \
> > + dst_stride, src0_stride, src1_stride, expected_result) \
> > + { \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + BRW_HORIZONTAL_STRIDE_##dst_stride, \
> > + BRW_HORIZONTAL_STRIDE_##src0_stride, \
> > + BRW_HORIZONTAL_STRIDE_##src1_stride, \
> > + expected_result, \
> > + }
> > +
> > + INST(HF, HF, F, 2, 2, 1, true),
> > + INST(HF, F, HF, 2, 1, 2, true),
> > + INST(HF, F, HF, 1, 1, 2, true),
> > + INST(HF, F, HF, 2, 1, 1, false),
> > + INST(HF, HF, F, 2, 1, 1, false),
> > + INST(HF, HF, F, 1, 1, 1, false),
> > + INST(HF, HF, F, 2, 1, 1, false),
> > + INST( F, HF, F, 1, 1, 1, false),
> > + INST( F, F, HF, 1, 1, 2, true),
> > + INST( F, HF, HF, 1, 2, 1, false),
> > + INST( F, HF, HF, 1, 2, 2, true),
> > +
> > +#undef INST
> > + };
> > +
> > + /* No half-float math in gen8 */
> > + if (devinfo.gen < 9)
> > + return;
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + gen6_math(p, retype(g0, inst[i].dst_type),
> > + BRW_MATH_FUNCTION_POW,
> > + retype(g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > +
> > + brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > +
> > + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
> > + brw_inst_set_src0_hstride(&devinfo, last_inst, inst[i].src0_stride);
> > +
> > + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
> > + brw_inst_set_src1_hstride(&devinfo, last_inst, inst[i].src1_stride);
> > +
> > + EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align1_packed_fp16_dst)
> > +{
> > + static const struct {
> > + unsigned exec_size;
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + unsigned dst_stride;
> > + unsigned dst_subnr;
> > + bool expected_result_bdw;
> > + bool expected_result_chv_skl;
> > + } inst[] = {
> > +#define INST(exec_size, dst_type, src0_type, src1_type, dst_stride, dst_subnr, \
> > + expected_result_bdw, expected_result_chv_skl) \
> > + { \
> > + BRW_EXECUTE_##exec_size, \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + BRW_HORIZONTAL_STRIDE_##dst_stride, \
> > + dst_subnr, \
> > + expected_result_bdw, \
> > + expected_result_chv_skl \
> > + }
> > +
> > + /* SIMD8 packed fp16 dst won't cross oword boundaries if region is
> > + * oword-aligned
> > + */
> > + INST( 8, HF, HF, F, 1, 0, false, true),
> > + INST( 8, HF, HF, F, 1, 2, false, false),
> > + INST( 8, HF, HF, F, 1, 4, false, false),
> > + INST( 8, HF, HF, F, 1, 8, false, false),
> > + INST( 8, HF, HF, F, 1, 16, false, true),
> > +
> > + /* SIMD16 packed fp16 always crosses oword boundaries */
> > + INST(16, HF, HF, F, 1, 0, false, false),
> > + INST(16, HF, HF, F, 1, 2, false, false),
> > + INST(16, HF, HF, F, 1, 4, false, false),
> > + INST(16, HF, HF, F, 1, 8, false, false),
> > + INST(16, HF, HF, F, 1, 16, false, false),
> > +
> > + /* If destination is not packed (or not fp16) we can cross oword
> > + * boundaries
> > + */
> > + INST( 8, HF, HF, F, 2, 0, true, true),
> > + INST( 8, F, HF, F, 1, 0, true, true),
> > +
> > +#undef INST
> > + };
> > +
> > + if (devinfo.gen < 8)
> > + return;
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + brw_ADD(p, retype(g0, inst[i].dst_type),
> > + retype(g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > +
> > + brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
> > + brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, inst[i].dst_subnr);
> > +
> > + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > + brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
> > + brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
> > +
> > + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > + brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
> > + brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
> > +
> > + brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
> > +
> > + if (devinfo.is_cherryview || devinfo.gen >= 9)
> > + EXPECT_EQ(inst[i].expected_result_chv_skl, validate(p));
> > + else
> > + EXPECT_EQ(inst[i].expected_result_bdw, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align16_packed_data)
> > +{
> > + static const struct {
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + unsigned src0_vstride;
> > + unsigned src1_vstride;
> > + bool expected_result;
> > + } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type, \
> > + src0_vstride, src1_vstride, expected_result) \
> > + { \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + BRW_VERTICAL_STRIDE_##src0_vstride, \
> > + BRW_VERTICAL_STRIDE_##src1_vstride, \
> > + expected_result, \
> > + }
> > +
> > + /* We only test with F destination because there is a restriction
> > + * by which F->HF conversions need to be DWord aligned but Align16 also
> > + * requires that destination horizontal stride is 1.
> > + */
> > + INST(F, F, HF, 4, 4, true),
> > + INST(F, F, HF, 2, 4, false),
> > + INST(F, F, HF, 4, 2, false),
> > + INST(F, F, HF, 0, 4, true),
> > + INST(F, F, HF, 4, 0, false),
> > + INST(F, HF, F, 4, 4, true),
> > + INST(F, HF, F, 4, 2, false),
> > + INST(F, HF, F, 2, 4, false),
> > + INST(F, HF, F, 0, 4, false),
> > + INST(F, HF, F, 4, 0, true),
> > +
> > +#undef INST
> > + };
> > +
> > + if (devinfo.gen < 8 || devinfo.gen >= 11)
> > + return;
> > +
> > + brw_set_default_access_mode(p, BRW_ALIGN_16);
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + brw_ADD(p, retype(g0, inst[i].dst_type),
> > + retype(g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > +
> > + brw_inst_set_src0_vstride(&devinfo, last_inst, inst[i].src0_vstride);
> > + brw_inst_set_src1_vstride(&devinfo, last_inst, inst[i].src1_vstride);
> > +
> > + EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align16_no_simd16)
> > +{
> > + static const struct {
> > + unsigned exec_size;
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + bool expected_result;
> > + } inst[] = {
> > +#define INST(exec_size, dst_type, src0_type, src1_type, expected_result) \
> > + { \
> > + BRW_EXECUTE_##exec_size, \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + expected_result, \
> > + }
> > +
> > + /* We only test with F destination because there is a restriction
> > + * by which F->HF conversions need to be DWord aligned but Align16 also
> > + * requires that destination horizontal stride is 1.
> > + */
> > + INST( 8, F, F, HF, true),
> > + INST( 8, F, HF, F, true),
> > + INST( 8, F, F, HF, true),
> > + INST(16, F, F, HF, false),
> > + INST(16, F, HF, F, false),
> > + INST(16, F, F, HF, false),
> > +
> > +#undef INST
> > + };
> > +
> > + if (devinfo.gen < 8 || devinfo.gen >= 11)
> > + return;
> > +
> > + brw_set_default_access_mode(p, BRW_ALIGN_16);
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + brw_ADD(p, retype(g0, inst[i].dst_type),
> > + retype(g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > +
> > + brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
> > +
> > + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +
> > + EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align16_no_acc_read)
> > +{
> > + static const struct {
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + bool read_acc;
> > + bool expected_result;
> > + } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type, read_acc, expected_result) \
> > + { \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + read_acc, \
> > + expected_result, \
> > + }
> > +
> > + /* We only test with F destination because there is a restriction
> > + * by which F->HF conversions need to be DWord aligned but Align16 also
> > + * requires that destination horizontal stride is 1.
> > + */
> > + INST( F, F, HF, false, true),
> > + INST( F, F, HF, true, false),
> > + INST( F, HF, F, false, true),
> > + INST( F, HF, F, true, false),
> > +
> > +#undef INST
> > + };
> > +
> > + if (devinfo.gen < 8 || devinfo.gen >= 11)
> > + return;
> > +
> > + brw_set_default_access_mode(p, BRW_ALIGN_16);
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + brw_ADD(p, retype(g0, inst[i].dst_type),
> > + retype(inst[i].read_acc ? acc0 : g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > +
> > + brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > + brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
> > +
> > + EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > +TEST_P(validation_test, mixed_float_align16_math_packed_format)
> > +{
> > + static const struct {
> > + enum brw_reg_type dst_type;
> > + enum brw_reg_type src0_type;
> > + enum brw_reg_type src1_type;
> > + unsigned src0_vstride;
> > + unsigned src1_vstride;
> > + bool expected_result;
> > + } inst[] = {
> > +#define INST(dst_type, src0_type, src1_type, \
> > + src0_vstride, src1_vstride, expected_result) \
> > + { \
> > + BRW_REGISTER_TYPE_##dst_type, \
> > + BRW_REGISTER_TYPE_##src0_type, \
> > + BRW_REGISTER_TYPE_##src1_type, \
> > + BRW_VERTICAL_STRIDE_##src0_vstride, \
> > + BRW_VERTICAL_STRIDE_##src1_vstride, \
> > + expected_result, \
> > + }
> > +
> > + /* We only test with F destination because there is a restriction
> > + * by which F->HF conversions need to be DWord aligned but Align16 also
> > + * requires that destination horizontal stride is 1.
> > + */
> > + INST( F, HF, F, 4, 0, false),
> > + INST( F, HF, HF, 4, 4, true),
> > + INST( F, F, HF, 4, 0, false),
> > + INST( F, F, HF, 2, 4, false),
> > + INST( F, F, HF, 4, 2, false),
> > + INST( F, HF, HF, 0, 4, false),
> > +
> > +#undef INST
> > + };
> > +
> > + /* Align16 Math for mixed float mode is not supported in gen8 */
> > + if (devinfo.gen < 9 || devinfo.gen >= 11)
> > + return;
> > +
> > + brw_set_default_access_mode(p, BRW_ALIGN_16);
> > +
> > + for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
> > + gen6_math(p, retype(g0, inst[i].dst_type),
> > + BRW_MATH_FUNCTION_POW,
> > + retype(g0, inst[i].src0_type),
> > + retype(g0, inst[i].src1_type));
> > +
> > + brw_inst_set_src0_vstride(&devinfo, last_inst, inst[i].src0_vstride);
> > + brw_inst_set_src1_vstride(&devinfo, last_inst, inst[i].src1_vstride);
> > +
> > + EXPECT_EQ(inst[i].expected_result, validate(p));
> > +
> > + clear_instructions(p);
> > + }
> > +}
> > +
> > TEST_P(validation_test, vector_immediate_destination_alignment)
> > {
> > static const struct {
> > --
> > 2.20.1
More information about the mesa-dev
mailing list