[Mesa-dev] [PATCH 21/21] i965/fs: Expose arbitrary channel execution groups to the IR.
Francisco Jerez
currojerez at riseup.net
Wed May 25 00:18:34 UTC 2016
Jason Ekstrand <jason at jlekstrand.net> writes:
> On Tue, May 24, 2016 at 12:18 AM, Francisco Jerez <currojerez at riseup.net>
> wrote:
>
>> This generalizes the current fs_inst::force_sechalf flag to allow
>> specifying channel enable groups other than 0 or 8. At some point it
>> will likely make sense to fix the vec4 generator to support arbitrary
>> execution groups and then move the definition of fs_inst::group into
>> backend_instruction (e.g. so we can do FP64 in the VEC4 back-end).
>> ---
>> src/mesa/drivers/dri/i965/brw_fs.cpp | 20
>> ++++++++------------
>> src/mesa/drivers/dri/i965/brw_fs_builder.h | 14 +++++++++++---
>> src/mesa/drivers/dri/i965/brw_fs_cse.cpp | 4 ++--
>> src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 7 ++++---
>> src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp | 2 +-
>> src/mesa/drivers/dri/i965/brw_ir_fs.h | 20
>> +++++++++-----------
>> 6 files changed, 35 insertions(+), 32 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> index a59cd3c..92caeaa 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> @@ -3638,7 +3638,7 @@ fs_visitor::lower_integer_multiplication()
>> mul->src[1].stride *= 2;
>>
>> } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
>> - inst->force_sechalf) {
>> + inst->group > 0) {
>> /* Among other things the quarter control bits influence which
>> * accumulator register is used by the hardware for
>> instructions
>> * that access the accumulator implicitly (e.g. MACH). A
>> @@ -3655,7 +3655,7 @@ fs_visitor::lower_integer_multiplication()
>> * to get the result masked correctly according to the current
>> * channel enables.
>> */
>> - mach->force_sechalf = false;
>> + mach->group = 0;
>> mach->force_writemask_all = true;
>> mach->dst = ibld.vgrf(inst->dst.type);
>> ibld.MOV(inst->dst, mach->dst);
>> @@ -3791,8 +3791,8 @@ lower_fb_write_logical_send(const fs_builder &bld,
>> fs_inst *inst,
>> sample_mask.stride *= 2;
>>
>> bld.exec_all().annotate("FB write oMask")
>> - .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
>> - inst->force_sechalf),
>> + .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
>> + inst->group),
>> sample_mask);
>> length++;
>> }
>> @@ -5017,10 +5017,10 @@ fs_visitor::lower_simd_width()
>> * execution size of the builder to the highest of both for now
>> so
>> * we're sure that both cases can be handled.
>> */
>> + const unsigned max_width = MAX2(inst->exec_size, lower_width);
>> const fs_builder ibld = bld.at(block, inst)
>> .exec_all(inst->force_writemask_all)
>> - .group(MAX2(inst->exec_size,
>> lower_width),
>> - inst->force_sechalf);
>> + .group(max_width, inst->group /
>> max_width);
>>
>> /* Split the copies in chunks of the execution width of either
>> the
>> * original or the lowered instruction, whichever is lower.
>> @@ -5352,12 +5352,8 @@ fs_visitor::dump_instruction(backend_instruction
>> *be_inst, FILE *file)
>> if (inst->force_writemask_all)
>> fprintf(file, "NoMask ");
>>
>> - if (dispatch_width == 16 && inst->exec_size == 8) {
>> - if (inst->force_sechalf)
>> - fprintf(file, "2ndhalf ");
>> - else
>> - fprintf(file, "1sthalf ");
>> - }
>> + if (inst->exec_size != dispatch_width)
>> + fprintf(file, "group%d ", inst->group);
>>
>> fprintf(file, "\n");
>> }
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h
>> b/src/mesa/drivers/dri/i965/brw_fs_builder.h
>> index b50dda4..c1d13a2 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
>> @@ -72,7 +72,7 @@ namespace brw {
>> fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
>> shader(shader), block(block), cursor(inst),
>> _dispatch_width(inst->exec_size),
>> - _group(inst->force_sechalf ? 8 : 0),
>> + _group(inst->group),
>> force_writemask_all(inst->force_writemask_all)
>> {
>> annotation.str = inst->annotation;
>> @@ -168,6 +168,15 @@ namespace brw {
>> }
>>
>> /**
>> + * Get the channel group in use.
>> + */
>> + unsigned
>> + group() const
>> + {
>> + return _group;
>> + }
>> +
>> + /**
>> * Allocate a virtual register of natural vector size (one for this
>> IR)
>> * and SIMD width. \p n gives the amount of space to allocate in
>> * dispatch_width units (which is just enough space for one logical
>> @@ -353,9 +362,8 @@ namespace brw {
>> assert(inst->exec_size <= 32);
>> assert(inst->exec_size == dispatch_width() ||
>> force_writemask_all);
>> - assert(_group == 0 || _group == 8);
>>
>> - inst->force_sechalf = (_group == 8);
>> + inst->group = _group;
>> inst->force_writemask_all = force_writemask_all;
>> inst->annotation = annotation.str;
>> inst->ir = annotation.ir;
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
>> b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
>> index 9c39106..159bf5d 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
>> @@ -162,7 +162,7 @@ instructions_match(fs_inst *a, fs_inst *b, bool
>> *negate)
>> return a->opcode == b->opcode &&
>> a->force_writemask_all == b->force_writemask_all &&
>> a->exec_size == b->exec_size &&
>> - a->force_sechalf == b->force_sechalf &&
>> + a->group == b->group &&
>> a->saturate == b->saturate &&
>> a->predicate == b->predicate &&
>> a->predicate_inverse == b->predicate_inverse &&
>> @@ -215,7 +215,7 @@ create_copy_instr(const fs_builder &bld, fs_inst
>> *inst, fs_reg src, bool negate)
>> copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
>> } else {
>> copy = bld.MOV(inst->dst, src);
>> - copy->force_sechalf = inst->force_sechalf;
>> + copy->group = inst->group;
>> copy->force_writemask_all = inst->force_writemask_all;
>> copy->src[0].negate = negate;
>> }
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> index 804c639..914ec9b 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> @@ -212,7 +212,7 @@ fs_generator::fire_fb_write(fs_inst *inst,
>> if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
>> msg_control =
>> BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
>> else if (prog_data->dual_src_blend) {
>> - if (!inst->force_sechalf)
>> + if (!inst->group)
>> msg_control =
>> BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
>> else
>> msg_control =
>> BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
>>
>
> This doesn't look right. I suppose it's correct for SIMD16-only and that
> you have a follow-on patch for SIMD32 but it seems wrong. If it's an easy
> one-liner, maybe just squash it in?
>
Yeah, definitely, but it will be more than a few lines to fix FB writes
to work on SIMD32. Because it's quite some churn and it's unused by
compute shaders I didn't include any of the FB write patches in this
series, this patch is only intended to extend the representation of
channel groups in the IR and keep the surrounding logic functionally
equivalent. FB writes are currently broken in SIMD32 mode beyond the
message type field set above anyway.
>
>> @@ -1071,7 +1071,7 @@ fs_generator::generate_scratch_write(fs_inst *inst,
>> struct brw_reg src)
>> brw_set_default_compression(p, lower_size > 8);
>>
>> for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
>> - brw_set_default_group(p, (inst->force_sechalf ? 8 : 0) + lower_size
>> * i);
>> + brw_set_default_group(p, inst->group + lower_size * i);
>>
>> brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
>> retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
>> @@ -1615,7 +1615,7 @@ fs_generator::generate_code(const cfg_t *cfg, int
>> dispatch_width)
>> const bool compressed =
>> inst->dst.component_size(inst->exec_size) > REG_SIZE;
>> brw_set_default_compression(p, compressed);
>> - brw_set_default_group(p, inst->force_sechalf ? 8 : 0);
>> + brw_set_default_group(p, inst->group);
>>
>> for (unsigned int i = 0; i < inst->sources; i++) {
>> src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen,
>> @@ -1643,6 +1643,7 @@ fs_generator::generate_code(const cfg_t *cfg, int
>> dispatch_width)
>> brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
>>
>> assert(inst->force_writemask_all || inst->exec_size >= 8);
>> + assert(inst->force_writemask_all || inst->group % inst->exec_size
>> == 0);
>> assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
>> assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
>> b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
>> index 8613725..8cd897f 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
>> @@ -163,7 +163,7 @@ fs_visitor::opt_peephole_sel()
>> /* Check that the MOVs are the right form. */
>> if (!then_mov[i]->dst.equals(else_mov[i]->dst) ||
>> then_mov[i]->exec_size != else_mov[i]->exec_size ||
>> - then_mov[i]->force_sechalf != else_mov[i]->force_sechalf ||
>> + then_mov[i]->group != else_mov[i]->group ||
>> then_mov[i]->force_writemask_all !=
>> else_mov[i]->force_writemask_all ||
>> then_mov[i]->is_partial_write() ||
>> else_mov[i]->is_partial_write() ||
>> diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h
>> b/src/mesa/drivers/dri/i965/brw_ir_fs.h
>> index cdb4464..6a93e81 100644
>> --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
>> +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
>> @@ -289,22 +289,20 @@ public:
>> */
>> uint8_t exec_size;
>>
>> + /**
>> + * Channel group from the hardware execution and predication mask that
>> + * should be applied to the instruction. The subset of channel enable
>> + * signals (calculated from the EU control flow and predication state)
>> + * given by [group, group + exec_size[ will be used to mask GRF writes
>> and
>>
>
> If you're intending to indicate a half-open interval, I believe the
> notation more people will be familiar with is "[group, group + exec_size)"
>
Sure, I don't mind one notation or the other, changed here and in
another comment in PATCH 1.
>
>> + * any other side effects of the instruction.
>> + */
>> + uint8_t group;
>> +
>> bool eot:1;
>> - bool force_sechalf:1;
>> bool pi_noperspective:1; /**< Pixel interpolator noperspective flag
>> */
>> };
>>
>> /**
>> - * Set second-half quarter control on \p inst.
>> - */
>> -static inline fs_inst *
>> -set_sechalf(fs_inst *inst)
>> -{
>> - inst->force_sechalf = true;
>> - return inst;
>> -}
>> -
>> -/**
>> * Make the execution of \p inst dependent on the evaluation of a possibly
>> * inverted predicate.
>> */
>> --
>> 2.7.3
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 212 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20160524/e5efbb31/attachment.sig>
More information about the mesa-dev
mailing list