<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Fri, Dec 8, 2017 at 10:04 AM, Pohjolainen, Topi <<a href="mailto:topi.pohjolainen@gmail.com" target="_blank">topi.pohjolainen@gmail.com</a>> wrote: <blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5">On Fri, Dec 08, 2017 at 08:55:56AM -0800, Jason Ekstrand wrote: > On Fri, Dec 8, 2017 at 12:40 AM, Pohjolainen, Topi < > <a href="mailto:topi.pohjolainen@gmail.com">topi.pohjolainen@gmail.com</a>> wrote: > > > On Wed, Dec 06, 2017 at 08:34:19PM -0800, Jason Ekstrand wrote: > > > This rewires the logic for assigning uniform locations to work in terms > > > of "complex alignments". The basic idea is that, as we walk the list of > > > instructions, we keep track of the alignment and continuity requirements > > > of each slot and assert that the alignments all match up. We then use > > > those alignments in the compaction stage to ensure that everything gets > > > placed at a properly aligned register. The old mechanism handled > > > alignments by special-casing each of the bit sizes and placing 64-bit > > > values first followed by 32-bit values. > > > > > > The old scheme had the advantage of never leaving a hole since all the > > > 64-bit values could be tightly packed and so could the 32-bit values. > > > However, the new scheme has no type size special cases so it handles not > > > only 32 and 64-bit types but should gracefully extend to 16 and 8-bit > > > types as the need arises. > > > > > > Tested-by: Jose Maria Casanova Crespo <<a href="mailto:jmcasanova@igalia.com">jmcasanova@igalia.com</a>> > > > --- > > > src/intel/compiler/brw_fs.cpp | 307 ++++++++++++++++++++++++------ > > ------------ > > > 1 file changed, 174 insertions(+), 133 deletions(-) > > > > > > diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs. > > cpp > > > index 93bb6b4..41260b4 100644 > > > --- a/src/intel/compiler/brw_fs.cpp > > > +++ b/src/intel/compiler/brw_fs.cpp > > > @@ -1906,62 +1906,6 @@ fs_visitor::compact_virtual_grfs() > > > return progress; > > > } > > > > > > -static void > > > -set_push_pull_constant_loc(unsigned uniform, int *chunk_start, > > > - unsigned *max_chunk_bitsize, > > > - bool contiguous, unsigned bitsize, > > > - const unsigned target_bitsize, > > > - int *push_constant_loc, int > > *pull_constant_loc, > > > - unsigned *num_push_constants, > > > - unsigned *num_pull_constants, > > > - const unsigned max_push_components, > > > - const unsigned max_chunk_size, > > > - bool allow_pull_constants, > > > - struct brw_stage_prog_data *stage_prog_data) > > > -{ > > > - /* This is the first live uniform in the chunk */ > > > - if (*chunk_start < 0) > > > - *chunk_start = uniform; > > > - > > > - /* Keep track of the maximum bit size access in contiguous uniforms > > */ > > > - *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize); > > > - > > > - /* If this element does not need to be contiguous with the next, we > > > - * split at this point and everything between chunk_start and u > > forms a > > > - * single chunk. > > > - */ > > > - if (!contiguous) { > > > - /* If bitsize doesn't match the target one, skip it */ > > > - if (*max_chunk_bitsize != target_bitsize) { > > > - /* FIXME: right now we only support 32 and 64-bit accesses */ > > > - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); > > > - *max_chunk_bitsize = 0; > > > - *chunk_start = -1; > > > - return; > > > - } > > > - > > > - unsigned chunk_size = uniform - *chunk_start + 1; > > > - > > > - /* Decide whether we should push or pull this parameter. In the > > > - * Vulkan driver, push constants are explicitly exposed via the > > API > > > - * so we push everything. In GL, we only push small arrays. > > > - */ > > > - if (!allow_pull_constants || > > > - (*num_push_constants + chunk_size <= max_push_components && > > > - chunk_size <= max_chunk_size)) { > > > - assert(*num_push_constants + chunk_size <= > > max_push_components); > > > - for (unsigned j = *chunk_start; j <= uniform; j++) > > > - push_constant_loc[j] = (*num_push_constants)++; > > > - } else { > > > - for (unsigned j = *chunk_start; j <= uniform; j++) > > > - pull_constant_loc[j] = (*num_pull_constants)++; > > > - } > > > - > > > - *max_chunk_bitsize = 0; > > > - *chunk_start = -1; > > > - } > > > -} > > > - > > > static int > > > get_subgroup_id_param_index(const brw_stage_prog_data *prog_data) > > > { > > > @@ -1977,6 +1921,98 @@ get_subgroup_id_param_index(const > > brw_stage_prog_data *prog_data) > > > } > > > > > > /** > > > + * Struct for handling complex alignments. > > > + * > > > + * A complex alignment is stored as multiplier and an offset. A value > > is > > > + * considered to be aligned if it is congruent to the offset modulo the > > > + * multiplier. > > > + */ > > > +struct cplx_align { > > > + unsigned mul:4; > > > + unsigned offset:4; > > > +}; > > > + > > > +#define CPLX_ALIGN_MAX_MUL 8 > > > + > > > +static void > > > +cplx_align_assert_sane(struct cplx_align a) > > > +{ > > > + assert(a.mul > 0 && util_is_power_of_two(a.mul)); > > > + assert(a.offset < a.mul); > > > +} > > > + > > > +/** > > > + * Combines two alignments to produce a least multiple of sorts. > > > + * > > > + * The returned alignment is the smallest (in terms of multiplier) such > > that > > > + * anything aligned to both a and b will be aligned to the new > > alignment. > > > + * This function will assert-fail if a and b are not compatible, i.e. > > if the > > > + * offset parameters are such that no common alignment is possible. > > > + */ > > > +static struct cplx_align > > > +cplx_align_combine(struct cplx_align a, struct cplx_align b) > > > +{ > > > + cplx_align_assert_sane(a); > > > + cplx_align_assert_sane(b); > > > + > > > + /* Assert that the alignments agree. */ > > > + assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1))); > > > + > > > + return a.mul > b.mul ? a : b; > > > +} > > > + > > > +/** > > > + * Apply a complex alignment > > > + * > > > + * This function will return the smallest number greater than or equal > > to > > > + * offset that is aligned to align. > > > + */ > > > +static unsigned > > > +cplx_align_apply(struct cplx_align align, unsigned offset) > > > +{ > > > + return ALIGN(offset - align.offset, align.mul) + align.offset; > > > +} > > > + > > > +#define UNIFORM_SLOT_SIZE 4 > > > + > > > +struct uniform_slot_info { > > > + /** True if the given uniform slot is live */ > > > + unsigned is_live:1; > > > + > > > + /** True if this slot and the slot must remain contiguous */ > > > > Missing a word? Maybe "...this slot and the slot after must..."? > > > > Yes, "slot after" and "next slot" both work. I'll fix that up. > > > > > + unsigned contiguous:1; > > > + > > > + struct cplx_align align; > > > +}; > > > + > > > +static void > > > +mark_uniform_slots_read(struct uniform_slot_info *slots, > > > + unsigned num_slots, unsigned alignment) > > > +{ > > > + assert(alignment > 0 && util_is_power_of_two(alignment)); > > > + assert(alignment <= CPLX_ALIGN_MAX_MUL); > > > + > > > + /* We can't align a slot to anything less than the slot size */ > > > + alignment = MAX2(alignment, UNIFORM_SLOT_SIZE); > > > + > > > + struct cplx_align align = {alignment, 0}; > > > + cplx_align_assert_sane(align); > > > + > > > + for (unsigned i = 0; i < num_slots; i++) { > > > + slots[i].is_live = true; > > > + if (i < num_slots - 1) > > > + slots[i].contiguous = true; > > > + > > > + align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1); > > > + if (slots[i].align.mul == 0) { > > > + slots[i].align = align; > > > + } else { > > > + slots[i].align = cplx_align_combine(slots[i].align, align); > > > + } > > > + } > > > +} > > > + > > > +/** > > > * Assign UNIFORM file registers to either push constants or pull > > constants. > > > * > > > * We allow a fragment shader to have more than the specified minimum > > > @@ -1994,60 +2030,44 @@ fs_visitor::assign_constant_locations() > > > return; > > > } > > > > > > - bool is_live[uniforms]; > > > - memset(is_live, 0, sizeof(is_live)); > > > - unsigned bitsize_access[uniforms]; > > > - memset(bitsize_access, 0, sizeof(bitsize_access)); > > > - > > > - /* For each uniform slot, a value of true indicates that the given > > slot and > > > - * the next slot must remain contiguous. This is used to keep us > > from > > > - * splitting arrays and 64-bit values apart. > > > - */ > > > - bool contiguous[uniforms]; > > > - memset(contiguous, 0, sizeof(contiguous)); > > > + struct uniform_slot_info slots[uniforms]; > > > + memset(slots, 0, sizeof(slots)); > > > > > > - /* First, we walk through the instructions and do two things: > > > - * > > > - * 1) Figure out which uniforms are live. > > > - * > > > - * 2) Mark any indirectly used ranges of registers as contiguous. > > > - * > > > - * Note that we don't move constant-indexed accesses to arrays. No > > > - * testing has been done of the performance impact of this choice. > > > - */ > > > foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { > > > for (int i = 0 ; i < inst->sources; i++) { > > > if (inst->src[i].file != UNIFORM) > > > continue; > > > > > > - int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; > > > + /* NIR tightly packs things so the uniform number might not be > > > + * aligned (if we have a double right after a float, for > > instance). > > > + * This is fine because the process of re-arranging them will > > ensure > > > + * that things are properly aligned. The offset into that > > uniform, > > > + * however, must be aligned. > > > > What about in case of types having size less than 32-bits (i.e., smaller > > than > > uniform slot size? Take, for example, invididual components of 16-bits > > vectors. Their offsets are not necessarily aligned on 32-bit slot > > boundaries. > > > > I did think about that. :-) My plan was that they would have their uniform > nr set to floor(offset/4) with an offset of offset % 4. Then their slot > would get aligned to MAX(2, UNIFORM_SLOT_SIZE) = 4 (see > mark_uniform_slots_read above). Assuming things are reasonably aligned > (i.e., no one puts a float at an offset of 2 bytes), this should work out > ok. </div></div>Right, I did notice the MAX2() in mark_uniform_slots_read(), and like said below, things here seem to work also. Mostly I was thinking the division inst->src[i].offset / UNIFORM_SLOT_SIZE. A little note saying that the offset may not align to slots but that it doesn't need to. One is only interested if the slot itself is used and that it is insignificant which part of the slot the instruction sources. Took me a while to crasp things but this is actually clearer than what we had before: Reviewed-by: Topi Pohjolainen <<a href="mailto:topi.pohjolainen@intel.com">topi.pohjolainen@intel.com</a>> > > Unfortunately, I see that we still have asserts that byte_offset % 4 == 0 > in our handling of nir_intrinsic_load_uniform and we're passing the tests. > They must be pretty bad. Yeah, I've been wondering how many things get exercised. I seemed to hit number of things in liveness analysis and optimization passes with SIMD8 and 16-bit regs with glsl. Then again, I don't have clear picture how much the 16-bit storage support requires from the 16-bit compiler infrastructure. <div class="HOEnZb"><div class="h5"></div></div></blockquote><div> </div><div>I just realized why this is Ok. Vulkan has a single variable for all push constants and it's at offset 0. That said, we also have an assert that the offset is a multiple of 4 and that definitely doesn't hold.</div><div> </div><div>--Jason </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5"> > > --Jason > > > > See also related question a few lines further. > > > > > + * > > > + * In Vulkan, we have explicit offsets but everything is > > crammed > > > + * into a single "variable" so inst->src[i].nr will always be > > 0. > > > + * Everything will be properly aligned relative to that one > > base. > > > + */ > > > + assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); > > > + > > > + unsigned u = inst->src[i].nr + > > > + inst->src[i].offset / UNIFORM_SLOT_SIZE; > > > > This effectively gives the same uniform slot no matter which individual > > byte(s) within are addressed. The logic here is about slots so this seems > > fine. Just wanted to check that this is your intention. And if so, should > > we > > comment here a little? > > > > > > > > + if (u >= uniforms) > > > + continue; > > > + > > > + unsigned slots_read; > > > if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { > > > - assert(inst->src[2].ud % 4 == 0); > > > - unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; > > > - assert(last < uniforms); > > > - > > > - for (unsigned j = constant_nr; j < last; j++) { > > > - is_live[j] = true; > > > - contiguous[j] = true; > > > - bitsize_access[j] = MAX2(bitsize_access[j], > > type_sz(inst->src[i].type)); > > > - } > > > - is_live[last] = true; > > > - bitsize_access[last] = MAX2(bitsize_access[last], > > type_sz(inst->src[i].type)); > > > + slots_read = DIV_ROUND_UP(inst->src[2].ud, > > UNIFORM_SLOT_SIZE); > > > } else { > > > - if (constant_nr >= 0 && constant_nr < (int) uniforms) { > > > - int regs_read = inst->components_read(i) * > > > - type_sz(inst->src[i].type) / 4; > > > - assert(regs_read <= 2); > > > - if (regs_read == 2) > > > - contiguous[constant_nr] = true; > > > - for (int j = 0; j < regs_read; j++) { > > > - is_live[constant_nr + j] = true; > > > - bitsize_access[constant_nr + j] = > > > - MAX2(bitsize_access[constant_nr + j], > > type_sz(inst->src[i].type)); > > > - } > > > - } > > > + unsigned bytes_read = inst->components_read(i) * > > > + type_sz(inst->src[i].type); > > > + slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); > > > } > > > + > > > + assert(u + slots_read <= uniforms); > > > + mark_uniform_slots_read(&slots[u], slots_read, > > > + type_sz(inst->src[i].type)); > > > } > > > } > > > > > > @@ -2082,43 +2102,64 @@ fs_visitor::assign_constant_locations() > > > memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); > > > > > > int chunk_start = -1; > > > - unsigned max_chunk_bitsize = 0; > > > - > > > - /* First push 64-bit uniforms to ensure they are properly aligned */ > > > - const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF); > > > + struct cplx_align align; > > > for (unsigned u = 0; u < uniforms; u++) { > > > - if (!is_live[u]) > > > + if (!slots[u].is_live) { > > > + assert(chunk_start == -1); > > > continue; > > > + } > > > > > > - set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, > > > - contiguous[u], bitsize_access[u], > > > - uniform_64_bit_size, > > > - push_constant_loc, pull_constant_loc, > > > - &num_push_constants, > > &num_pull_constants, > > > - max_push_components, max_chunk_size, > > > - compiler->supports_pull_constants, > > > - stage_prog_data); > > > + /* Skip subgroup_id_index to put it in the last push register. */ > > > + if (subgroup_id_index == (int)u) > > > + continue; > > > > > > - } > > > + if (chunk_start == -1) { > > > + chunk_start = u; > > > + align = slots[u].align; > > > + } else { > > > + /* Offset into the chunk */ > > > + unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; > > > > > > - /* Then push the rest of uniforms */ > > > - const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F); > > > - for (unsigned u = 0; u < uniforms; u++) { > > > - if (!is_live[u]) > > > - continue; > > > + /* Shift the slot alignment down by the chunk offset so it is > > > + * comparable with the base chunk alignment. > > > + */ > > > + struct cplx_align slot_align = slots[u].align; > > > + slot_align.offset = > > > + (slot_align.offset - chunk_offset) & (align.mul - 1); > > > > > > - /* Skip subgroup_id_index to put it in the last push register. */ > > > - if (subgroup_id_index == (int)u) > > > + align = cplx_align_combine(align, slot_align); > > > + } > > > + > > > + /* Sanity check the alignment */ > > > + cplx_align_assert_sane(align); > > > + > > > + if (slots[u].contiguous) > > > continue; > > > > > > - set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, > > > - contiguous[u], bitsize_access[u], > > > - uniform_32_bit_size, > > > - push_constant_loc, pull_constant_loc, > > > - &num_push_constants, > > &num_pull_constants, > > > - max_push_components, max_chunk_size, > > > - compiler->supports_pull_constants, > > > - stage_prog_data); > > > + /* Adjust the alignment to be in terms of slots, not bytes */ > > > + assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); > > > + assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); > > > + align.mul /= UNIFORM_SLOT_SIZE; > > > + align.offset /= UNIFORM_SLOT_SIZE; > > > + > > > + unsigned push_start_align = cplx_align_apply(align, > > num_push_constants); > > > + unsigned chunk_size = u - chunk_start + 1; > > > + if (!compiler->supports_pull_constants || > > > + (chunk_size < max_chunk_size && > > > + push_start_align + chunk_size <= max_push_components)) { > > > + /* Align up the number of push constants */ > > > + num_push_constants = push_start_align; > > > + for (unsigned i = 0; i < chunk_size; i++) > > > + push_constant_loc[chunk_start + i] = num_push_constants++; > > > + } else { > > > + /* We need to pull this one */ > > > + num_pull_constants = cplx_align_apply(align, > > num_pull_constants); > > > + for (unsigned i = 0; i < chunk_size; i++) > > > + pull_constant_loc[chunk_start + i] = num_pull_constants++; > > > + } > > > + > > > + /* Reset the chunk and start again */ > > > + chunk_start = -1; > > > } > > > > > > /* Add the CS local thread ID uniform at the end of the push > > constants */ > > > @@ -2131,8 +2172,8 @@ fs_visitor::assign_constant_locations() > > > uint32_t *param = stage_prog_data->param; > > > stage_prog_data->nr_params = num_push_constants; > > > if (num_push_constants) { > > > - stage_prog_data->param = ralloc_array(mem_ctx, uint32_t, > > > - num_push_constants); > > > + stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, > > > + num_push_constants); > > > } else { > > > stage_prog_data->param = NULL; > > > } > > > @@ -2140,8 +2181,8 @@ fs_visitor::assign_constant_locations() > > > assert(stage_prog_data->pull_param == NULL); > > > if (num_pull_constants > 0) { > > > stage_prog_data->nr_pull_params = num_pull_constants; > > > - stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t, > > > - num_pull_constants); > > > + stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, > > > + num_pull_constants); > > > } > > > > > > /* Now that we know how many regular uniforms we'll push, reduce the > > > -- > > > 2.5.0.400.gff86faf > > > > > > _______________________________________________ > > > mesa-dev mailing list > > > <a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a> > > > <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a> > > </div></div></blockquote></div> </div></div>