<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Fri, Dec 8, 2017 at 12:40 AM, Pohjolainen, Topi <<a href="mailto:topi.pohjolainen@gmail.com" target="_blank">topi.pohjolainen@gmail.com</a>> wrote: <blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">On Wed, Dec 06, 2017 at 08:34:19PM -0800, Jason Ekstrand wrote: <div><div class="h5">> This rewires the logic for assigning uniform locations to work in terms > of "complex alignments". The basic idea is that, as we walk the list of > instructions, we keep track of the alignment and continuity requirements > of each slot and assert that the alignments all match up. We then use > those alignments in the compaction stage to ensure that everything gets > placed at a properly aligned register. The old mechanism handled > alignments by special-casing each of the bit sizes and placing 64-bit > values first followed by 32-bit values. > > The old scheme had the advantage of never leaving a hole since all the > 64-bit values could be tightly packed and so could the 32-bit values. > However, the new scheme has no type size special cases so it handles not > only 32 and 64-bit types but should gracefully extend to 16 and 8-bit > types as the need arises. > > Tested-by: Jose Maria Casanova Crespo <<a href="mailto:jmcasanova@igalia.com">jmcasanova@igalia.com</a>> > --- > src/intel/compiler/brw_fs.cpp | 307 ++++++++++++++++++++++++------------------ > 1 file changed, 174 insertions(+), 133 deletions(-) > > diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp > index 93bb6b4..41260b4 100644 > --- a/src/intel/compiler/brw_fs.cpp > +++ b/src/intel/compiler/brw_fs.cpp > @@ -1906,62 +1906,6 @@ fs_visitor::compact_virtual_grfs() > return progress; > } > > -static void > -set_push_pull_constant_loc(unsigned uniform, int *chunk_start, > - unsigned *max_chunk_bitsize, > - bool contiguous, unsigned bitsize, > - const unsigned target_bitsize, > - int *push_constant_loc, int *pull_constant_loc, > - unsigned *num_push_constants, > - unsigned *num_pull_constants, > - const unsigned max_push_components, > - const unsigned max_chunk_size, > - bool allow_pull_constants, > - struct brw_stage_prog_data *stage_prog_data) > -{ > - /* This is the first live uniform in the chunk */ > - if (*chunk_start < 0) > - *chunk_start = uniform; > - > - /* Keep track of the maximum bit size access in contiguous uniforms */ > - *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize); > - > - /* If this element does not need to be contiguous with the next, we > - * split at this point and everything between chunk_start and u forms a > - * single chunk. > - */ > - if (!contiguous) { > - /* If bitsize doesn't match the target one, skip it */ > - if (*max_chunk_bitsize != target_bitsize) { > - /* FIXME: right now we only support 32 and 64-bit accesses */ > - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); > - *max_chunk_bitsize = 0; > - *chunk_start = -1; > - return; > - } > - > - unsigned chunk_size = uniform - *chunk_start + 1; > - > - /* Decide whether we should push or pull this parameter. In the > - * Vulkan driver, push constants are explicitly exposed via the API > - * so we push everything. In GL, we only push small arrays. > - */ > - if (!allow_pull_constants || > - (*num_push_constants + chunk_size <= max_push_components && > - chunk_size <= max_chunk_size)) { > - assert(*num_push_constants + chunk_size <= max_push_components); > - for (unsigned j = *chunk_start; j <= uniform; j++) > - push_constant_loc[j] = (*num_push_constants)++; > - } else { > - for (unsigned j = *chunk_start; j <= uniform; j++) > - pull_constant_loc[j] = (*num_pull_constants)++; > - } > - > - *max_chunk_bitsize = 0; > - *chunk_start = -1; > - } > -} > - > static int > get_subgroup_id_param_index(const brw_stage_prog_data *prog_data) > { > @@ -1977,6 +1921,98 @@ get_subgroup_id_param_index(const brw_stage_prog_data *prog_data) > } > > /** > + * Struct for handling complex alignments. > + * > + * A complex alignment is stored as multiplier and an offset. A value is > + * considered to be aligned if it is congruent to the offset modulo the > + * multiplier. > + */ > +struct cplx_align { > + unsigned mul:4; > + unsigned offset:4; > +}; > + > +#define CPLX_ALIGN_MAX_MUL 8 > + > +static void > +cplx_align_assert_sane(struct cplx_align a) > +{ > + assert(a.mul > 0 && util_is_power_of_two(a.mul)); > + assert(a.offset < a.mul); > +} > + > +/** > + * Combines two alignments to produce a least multiple of sorts. > + * > + * The returned alignment is the smallest (in terms of multiplier) such that > + * anything aligned to both a and b will be aligned to the new alignment. > + * This function will assert-fail if a and b are not compatible, i.e. if the > + * offset parameters are such that no common alignment is possible. > + */ > +static struct cplx_align > +cplx_align_combine(struct cplx_align a, struct cplx_align b) > +{ > + cplx_align_assert_sane(a); > + cplx_align_assert_sane(b); > + > + /* Assert that the alignments agree. */ > + assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1))); > + > + return a.mul > b.mul ? a : b; > +} > + > +/** > + * Apply a complex alignment > + * > + * This function will return the smallest number greater than or equal to > + * offset that is aligned to align. > + */ > +static unsigned > +cplx_align_apply(struct cplx_align align, unsigned offset) > +{ > + return ALIGN(offset - align.offset, align.mul) + align.offset; > +} > + > +#define UNIFORM_SLOT_SIZE 4 > + > +struct uniform_slot_info { > + /** True if the given uniform slot is live */ > + unsigned is_live:1; > + > + /** True if this slot and the slot must remain contiguous */ </div></div>Missing a word? Maybe "...this slot and the slot after must..."? <div><div class="h5"></div></div></blockquote><div> </div><div>Yes, "slot after" and "next slot" both work. I'll fix that up. </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div><div class="h5"> > + unsigned contiguous:1; > + > + struct cplx_align align; > +}; > + > +static void > +mark_uniform_slots_read(struct uniform_slot_info *slots, > + unsigned num_slots, unsigned alignment) > +{ > + assert(alignment > 0 && util_is_power_of_two(alignment)); > + assert(alignment <= CPLX_ALIGN_MAX_MUL); > + > + /* We can't align a slot to anything less than the slot size */ > + alignment = MAX2(alignment, UNIFORM_SLOT_SIZE); > + > + struct cplx_align align = {alignment, 0}; > + cplx_align_assert_sane(align); > + > + for (unsigned i = 0; i < num_slots; i++) { > + slots[i].is_live = true; > + if (i < num_slots - 1) > + slots[i].contiguous = true; > + > + align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1); > + if (slots[i].align.mul == 0) { > + slots[i].align = align; > + } else { > + slots[i].align = cplx_align_combine(slots[i].align, align); > + } > + } > +} > + > +/** > * Assign UNIFORM file registers to either push constants or pull constants. > * > * We allow a fragment shader to have more than the specified minimum > @@ -1994,60 +2030,44 @@ fs_visitor::assign_constant_locations() > return; > } > > - bool is_live[uniforms]; > - memset(is_live, 0, sizeof(is_live)); > - unsigned bitsize_access[uniforms]; > - memset(bitsize_access, 0, sizeof(bitsize_access)); > - > - /* For each uniform slot, a value of true indicates that the given slot and > - * the next slot must remain contiguous. This is used to keep us from > - * splitting arrays and 64-bit values apart. > - */ > - bool contiguous[uniforms]; > - memset(contiguous, 0, sizeof(contiguous)); > + struct uniform_slot_info slots[uniforms]; > + memset(slots, 0, sizeof(slots)); > > - /* First, we walk through the instructions and do two things: > - * > - * 1) Figure out which uniforms are live. > - * > - * 2) Mark any indirectly used ranges of registers as contiguous. > - * > - * Note that we don't move constant-indexed accesses to arrays. No > - * testing has been done of the performance impact of this choice. > - */ > foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { > for (int i = 0 ; i < inst->sources; i++) { > if (inst->src[i].file != UNIFORM) > continue; > > - int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; > + /* NIR tightly packs things so the uniform number might not be > + * aligned (if we have a double right after a float, for instance). > + * This is fine because the process of re-arranging them will ensure > + * that things are properly aligned. The offset into that uniform, > + * however, must be aligned. </div></div>What about in case of types having size less than 32-bits (i.e., smaller than uniform slot size? Take, for example, invididual components of 16-bits vectors. Their offsets are not necessarily aligned on 32-bit slot boundaries. </blockquote><div> </div><div>I did think about that. :-) My plan was that they would have their uniform nr set to floor(offset/4) with an offset of offset % 4. Then their slot would get aligned to MAX(2, UNIFORM_SLOT_SIZE) = 4 (see mark_uniform_slots_read above). Assuming things are reasonably aligned (i.e., no one puts a float at an offset of 2 bytes), this should work out ok.</div><div> </div><div>Unfortunately, I see that we still have asserts that byte_offset % 4 == 0 in our handling of nir_intrinsic_load_uniform and we're passing the tests. They must be pretty bad.</div><div> </div><div>--Jason </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> See also related question a few lines further. > + * > + * In Vulkan, we have explicit offsets but everything is crammed > + * into a single "variable" so inst->src[i].nr will always be 0. > + * Everything will be properly aligned relative to that one base. > + */ > + assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); > + > + unsigned u = inst->src[i].nr + > + inst->src[i].offset / UNIFORM_SLOT_SIZE; This effectively gives the same uniform slot no matter which individual byte(s) within are addressed. The logic here is about slots so this seems fine. Just wanted to check that this is your intention. And if so, should we comment here a little? <div class="HOEnZb"><div class="h5"> > > + if (u >= uniforms) > + continue; > + > + unsigned slots_read; > if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { > - assert(inst->src[2].ud % 4 == 0); > - unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; > - assert(last < uniforms); > - > - for (unsigned j = constant_nr; j < last; j++) { > - is_live[j] = true; > - contiguous[j] = true; > - bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type)); > - } > - is_live[last] = true; > - bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type)); > + slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE); > } else { > - if (constant_nr >= 0 && constant_nr < (int) uniforms) { > - int regs_read = inst->components_read(i) * > - type_sz(inst->src[i].type) / 4; > - assert(regs_read <= 2); > - if (regs_read == 2) > - contiguous[constant_nr] = true; > - for (int j = 0; j < regs_read; j++) { > - is_live[constant_nr + j] = true; > - bitsize_access[constant_nr + j] = > - MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type)); > - } > - } > + unsigned bytes_read = inst->components_read(i) * > + type_sz(inst->src[i].type); > + slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); > } > + > + assert(u + slots_read <= uniforms); > + mark_uniform_slots_read(&slots[u], slots_read, > + type_sz(inst->src[i].type)); > } > } > > @@ -2082,43 +2102,64 @@ fs_visitor::assign_constant_locations() > memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); > > int chunk_start = -1; > - unsigned max_chunk_bitsize = 0; > - > - /* First push 64-bit uniforms to ensure they are properly aligned */ > - const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF); > + struct cplx_align align; > for (unsigned u = 0; u < uniforms; u++) { > - if (!is_live[u]) > + if (!slots[u].is_live) { > + assert(chunk_start == -1); > continue; > + } > > - set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, > - contiguous[u], bitsize_access[u], > - uniform_64_bit_size, > - push_constant_loc, pull_constant_loc, > - &num_push_constants, &num_pull_constants, > - max_push_components, max_chunk_size, > - compiler->supports_pull_constants, > - stage_prog_data); > + /* Skip subgroup_id_index to put it in the last push register. */ > + if (subgroup_id_index == (int)u) > + continue; > > - } > + if (chunk_start == -1) { > + chunk_start = u; > + align = slots[u].align; > + } else { > + /* Offset into the chunk */ > + unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; > > - /* Then push the rest of uniforms */ > - const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F); > - for (unsigned u = 0; u < uniforms; u++) { > - if (!is_live[u]) > - continue; > + /* Shift the slot alignment down by the chunk offset so it is > + * comparable with the base chunk alignment. > + */ > + struct cplx_align slot_align = slots[u].align; > + slot_align.offset = > + (slot_align.offset - chunk_offset) & (align.mul - 1); > > - /* Skip subgroup_id_index to put it in the last push register. */ > - if (subgroup_id_index == (int)u) > + align = cplx_align_combine(align, slot_align); > + } > + > + /* Sanity check the alignment */ > + cplx_align_assert_sane(align); > + > + if (slots[u].contiguous) > continue; > > - set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, > - contiguous[u], bitsize_access[u], > - uniform_32_bit_size, > - push_constant_loc, pull_constant_loc, > - &num_push_constants, &num_pull_constants, > - max_push_components, max_chunk_size, > - compiler->supports_pull_constants, > - stage_prog_data); > + /* Adjust the alignment to be in terms of slots, not bytes */ > + assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); > + assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); > + align.mul /= UNIFORM_SLOT_SIZE; > + align.offset /= UNIFORM_SLOT_SIZE; > + > + unsigned push_start_align = cplx_align_apply(align, num_push_constants); > + unsigned chunk_size = u - chunk_start + 1; > + if (!compiler->supports_pull_constants || > + (chunk_size < max_chunk_size && > + push_start_align + chunk_size <= max_push_components)) { > + /* Align up the number of push constants */ > + num_push_constants = push_start_align; > + for (unsigned i = 0; i < chunk_size; i++) > + push_constant_loc[chunk_start + i] = num_push_constants++; > + } else { > + /* We need to pull this one */ > + num_pull_constants = cplx_align_apply(align, num_pull_constants); > + for (unsigned i = 0; i < chunk_size; i++) > + pull_constant_loc[chunk_start + i] = num_pull_constants++; > + } > + > + /* Reset the chunk and start again */ > + chunk_start = -1; > } > > /* Add the CS local thread ID uniform at the end of the push constants */ > @@ -2131,8 +2172,8 @@ fs_visitor::assign_constant_locations() > uint32_t *param = stage_prog_data->param; > stage_prog_data->nr_params = num_push_constants; > if (num_push_constants) { > - stage_prog_data->param = ralloc_array(mem_ctx, uint32_t, > - num_push_constants); > + stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, > + num_push_constants); > } else { > stage_prog_data->param = NULL; > } > @@ -2140,8 +2181,8 @@ fs_visitor::assign_constant_locations() > assert(stage_prog_data->pull_param == NULL); > if (num_pull_constants > 0) { > stage_prog_data->nr_pull_params = num_pull_constants; > - stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t, > - num_pull_constants); > + stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, > + num_pull_constants); > } > > /* Now that we know how many regular uniforms we'll push, reduce the > -- > 2.5.0.400.gff86faf > </div></div><div class="HOEnZb"><div class="h5">> _______________________________________________ > mesa-dev mailing list > <a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a> > <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a> </div></div></blockquote></div> </div></div>