<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Thu, Dec 7, 2017 at 9:10 AM, Pohjolainen, Topi <<a href="mailto:topi.pohjolainen@gmail.com" target="_blank">topi.pohjolainen@gmail.com</a>> wrote: <blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5">On Wed, Dec 06, 2017 at 08:34:19PM -0800, Jason Ekstrand wrote: > This rewires the logic for assigning uniform locations to work in terms > of "complex alignments". The basic idea is that, as we walk the list of > instructions, we keep track of the alignment and continuity requirements > of each slot and assert that the alignments all match up. We then use > those alignments in the compaction stage to ensure that everything gets > placed at a properly aligned register. The old mechanism handled > alignments by special-casing each of the bit sizes and placing 64-bit > values first followed by 32-bit values. > > The old scheme had the advantage of never leaving a hole since all the > 64-bit values could be tightly packed and so could the 32-bit values. > However, the new scheme has no type size special cases so it handles not > only 32 and 64-bit types but should gracefully extend to 16 and 8-bit > types as the need arises. > > Tested-by: Jose Maria Casanova Crespo <<a href="mailto:jmcasanova@igalia.com">jmcasanova@igalia.com</a>> > --- > src/intel/compiler/brw_fs.cpp | 307 ++++++++++++++++++++++++------------------ > 1 file changed, 174 insertions(+), 133 deletions(-) > > diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp > index 93bb6b4..41260b4 100644 > --- a/src/intel/compiler/brw_fs.cpp > +++ b/src/intel/compiler/brw_fs.cpp > @@ -1906,62 +1906,6 @@ fs_visitor::compact_virtual_grfs() > return progress; > } > > -static void > -set_push_pull_constant_loc(unsigned uniform, int *chunk_start, > - unsigned *max_chunk_bitsize, > - bool contiguous, unsigned bitsize, > - const unsigned target_bitsize, > - int *push_constant_loc, int *pull_constant_loc, > - unsigned *num_push_constants, > - unsigned *num_pull_constants, > - const unsigned max_push_components, > - const unsigned max_chunk_size, > - bool allow_pull_constants, > - struct brw_stage_prog_data *stage_prog_data) > -{ > - /* This is the first live uniform in the chunk */ > - if (*chunk_start < 0) > - *chunk_start = uniform; > - > - /* Keep track of the maximum bit size access in contiguous uniforms */ > - *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize); > - > - /* If this element does not need to be contiguous with the next, we > - * split at this point and everything between chunk_start and u forms a > - * single chunk. > - */ > - if (!contiguous) { > - /* If bitsize doesn't match the target one, skip it */ > - if (*max_chunk_bitsize != target_bitsize) { > - /* FIXME: right now we only support 32 and 64-bit accesses */ > - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8); > - *max_chunk_bitsize = 0; > - *chunk_start = -1; > - return; > - } > - > - unsigned chunk_size = uniform - *chunk_start + 1; > - > - /* Decide whether we should push or pull this parameter. In the > - * Vulkan driver, push constants are explicitly exposed via the API > - * so we push everything. In GL, we only push small arrays. > - */ > - if (!allow_pull_constants || > - (*num_push_constants + chunk_size <= max_push_components && > - chunk_size <= max_chunk_size)) { > - assert(*num_push_constants + chunk_size <= max_push_components); > - for (unsigned j = *chunk_start; j <= uniform; j++) > - push_constant_loc[j] = (*num_push_constants)++; > - } else { > - for (unsigned j = *chunk_start; j <= uniform; j++) > - pull_constant_loc[j] = (*num_pull_constants)++; > - } > - > - *max_chunk_bitsize = 0; > - *chunk_start = -1; > - } > -} > - > static int > get_subgroup_id_param_index(const brw_stage_prog_data *prog_data) > { > @@ -1977,6 +1921,98 @@ get_subgroup_id_param_index(const brw_stage_prog_data *prog_data) > } > > /** > + * Struct for handling complex alignments. > + * > + * A complex alignment is stored as multiplier and an offset. A value is > + * considered to be aligned if it is congruent to the offset modulo the > + * multiplier. > + */ > +struct cplx_align { > + unsigned mul:4; > + unsigned offset:4; > +}; > + > +#define CPLX_ALIGN_MAX_MUL 8 > + > +static void > +cplx_align_assert_sane(struct cplx_align a) > +{ > + assert(a.mul > 0 && util_is_power_of_two(a.mul)); > + assert(a.offset < a.mul); > +} > + > +/** > + * Combines two alignments to produce a least multiple of sorts. > + * > + * The returned alignment is the smallest (in terms of multiplier) such that > + * anything aligned to both a and b will be aligned to the new alignment. > + * This function will assert-fail if a and b are not compatible, i.e. if the > + * offset parameters are such that no common alignment is possible. > + */ > +static struct cplx_align > +cplx_align_combine(struct cplx_align a, struct cplx_align b) > +{ > + cplx_align_assert_sane(a); > + cplx_align_assert_sane(b); > + > + /* Assert that the alignments agree. */ > + assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1))); > + > + return a.mul > b.mul ? a : b; > +} > + > +/** > + * Apply a complex alignment > + * > + * This function will return the smallest number greater than or equal to > + * offset that is aligned to align. > + */ > +static unsigned > +cplx_align_apply(struct cplx_align align, unsigned offset) > +{ </div></div>Should we assert that offset >= 4? </blockquote><div> </div><div>Not here, no. I intentionally want this code to be as generic as possible. We may have reason to re-use it (or something similar) elsewhere and I don't want to build in lots of push-constant assumptions. </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> > + return ALIGN(offset - align.offset, align.mul) + align.offset; Soon I'm going to continue with glsl mediump support and thought better to read these patches. I stumbled on something (probably missing something again): </blockquote><div> </div><div>:-) </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> To me it looks there can ever be only three different instances of cplx_align: </blockquote><div> </div><div>Today, yes, but there may come a day when we want to re-arrange with a finer granularity than dwords. </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> {8, 0}, {8, 4} and {4, 0} and that "offset" argument here is always multiple of 4. In case of align == {8, 0} "offset" gets aligned to 8, and in case of align == {4, 0} cplx_align_apply() is nop. But in case of {8, 4} and "offset" not multiple of 8, "offset" - 4 is multiple of 8 and returned value becomes ALIGN(offset - 4, 8) + 4 which in turn isn't aligned by 8. <div><div class="h5"></div></div></blockquote><div> </div><div>Correct, and it shouldn't be. Let's walk through the calculation with a more complex alignment: {8, 6} (no, that isn't possible today but the calculations should work with it) and some concrete numbers: 6, 8, 10, 12, 14, 16 </div><div> </div><div>6 -> ALIGN(6 - 6, 8) + 6 = 6 </div><div>8 -> ALIGN(8 - 6, 8) + 6 = 14</div><div>10 -> ALIGN(10 - 6, 8) + 6 = 14</div><div>12 -> ALIGN(12 - 6, 8) + 6 = 14</div><div>14 -> ALIGN(14 - 6, 8) + 6 = 14</div><div>16 -> ALIGN(16 - 6, 8) + 6 = 22</div><div> </div><div>As you can see, the result of each is the next number that is 6 more than a multiple of 8. There is also a question about negative numbers. Let's look at one more calculation, this time with more detail about the guts of ALIGN():</div><div> </div><div>4 -> ALIGN(4 - 6, 8) + 6 = (((4 - 6) + (8 - 1)) & ~(8-1)) + 6 = (5 & ~7) + 6 = 6</div><div> </div><div>That one's a bit more complicated but I think you see what's going on. If it makes things more clear, we could do something such as</div><div> </div><div>return (offset + (align.mul - align.offset - 1)) & ~(align.mul - 1)) + align.offset; </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div><div class="h5"> > +} > + > +#define UNIFORM_SLOT_SIZE 4 > + > +struct uniform_slot_info { > + /** True if the given uniform slot is live */ > + unsigned is_live:1; > + > + /** True if this slot and the slot must remain contiguous */ > + unsigned contiguous:1; > + > + struct cplx_align align; > +}; > + > +static void > +mark_uniform_slots_read(struct uniform_slot_info *slots, > + unsigned num_slots, unsigned alignment) > +{ > + assert(alignment > 0 && util_is_power_of_two(alignment)); > + assert(alignment <= CPLX_ALIGN_MAX_MUL); > + > + /* We can't align a slot to anything less than the slot size */ > + alignment = MAX2(alignment, UNIFORM_SLOT_SIZE); > + > + struct cplx_align align = {alignment, 0}; > + cplx_align_assert_sane(align); > + > + for (unsigned i = 0; i < num_slots; i++) { > + slots[i].is_live = true; > + if (i < num_slots - 1) > + slots[i].contiguous = true; > + > + align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1); > + if (slots[i].align.mul == 0) { > + slots[i].align = align; > + } else { > + slots[i].align = cplx_align_combine(slots[i].align, align); > + } > + } > +} > + > +/** > * Assign UNIFORM file registers to either push constants or pull constants. > * > * We allow a fragment shader to have more than the specified minimum > @@ -1994,60 +2030,44 @@ fs_visitor::assign_constant_locations() > return; > } > > - bool is_live[uniforms]; > - memset(is_live, 0, sizeof(is_live)); > - unsigned bitsize_access[uniforms]; > - memset(bitsize_access, 0, sizeof(bitsize_access)); > - > - /* For each uniform slot, a value of true indicates that the given slot and > - * the next slot must remain contiguous. This is used to keep us from > - * splitting arrays and 64-bit values apart. > - */ > - bool contiguous[uniforms]; > - memset(contiguous, 0, sizeof(contiguous)); > + struct uniform_slot_info slots[uniforms]; > + memset(slots, 0, sizeof(slots)); > > - /* First, we walk through the instructions and do two things: > - * > - * 1) Figure out which uniforms are live. > - * > - * 2) Mark any indirectly used ranges of registers as contiguous. > - * > - * Note that we don't move constant-indexed accesses to arrays. No > - * testing has been done of the performance impact of this choice. > - */ > foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { > for (int i = 0 ; i < inst->sources; i++) { > if (inst->src[i].file != UNIFORM) > continue; > > - int constant_nr = inst->src[i].nr + inst->src[i].offset / 4; > + /* NIR tightly packs things so the uniform number might not be > + * aligned (if we have a double right after a float, for instance). > + * This is fine because the process of re-arranging them will ensure > + * that things are properly aligned. The offset into that uniform, > + * however, must be aligned. > + * > + * In Vulkan, we have explicit offsets but everything is crammed > + * into a single "variable" so inst->src[i].nr will always be 0. > + * Everything will be properly aligned relative to that one base. > + */ > + assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); > + > + unsigned u = inst->src[i].nr + > + inst->src[i].offset / UNIFORM_SLOT_SIZE; > > + if (u >= uniforms) > + continue; > + > + unsigned slots_read; > if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { > - assert(inst->src[2].ud % 4 == 0); > - unsigned last = constant_nr + (inst->src[2].ud / 4) - 1; > - assert(last < uniforms); > - > - for (unsigned j = constant_nr; j < last; j++) { > - is_live[j] = true; > - contiguous[j] = true; > - bitsize_access[j] = MAX2(bitsize_access[j], type_sz(inst->src[i].type)); > - } > - is_live[last] = true; > - bitsize_access[last] = MAX2(bitsize_access[last], type_sz(inst->src[i].type)); > + slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE); > } else { > - if (constant_nr >= 0 && constant_nr < (int) uniforms) { > - int regs_read = inst->components_read(i) * > - type_sz(inst->src[i].type) / 4; > - assert(regs_read <= 2); > - if (regs_read == 2) > - contiguous[constant_nr] = true; > - for (int j = 0; j < regs_read; j++) { > - is_live[constant_nr + j] = true; > - bitsize_access[constant_nr + j] = > - MAX2(bitsize_access[constant_nr + j], type_sz(inst->src[i].type)); > - } > - } > + unsigned bytes_read = inst->components_read(i) * > + type_sz(inst->src[i].type); > + slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); > } > + > + assert(u + slots_read <= uniforms); > + mark_uniform_slots_read(&slots[u], slots_read, > + type_sz(inst->src[i].type)); > } > } > > @@ -2082,43 +2102,64 @@ fs_visitor::assign_constant_locations() > memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); > > int chunk_start = -1; > - unsigned max_chunk_bitsize = 0; > - > - /* First push 64-bit uniforms to ensure they are properly aligned */ > - const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF); > + struct cplx_align align; > for (unsigned u = 0; u < uniforms; u++) { > - if (!is_live[u]) > + if (!slots[u].is_live) { > + assert(chunk_start == -1); > continue; > + } > > - set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, > - contiguous[u], bitsize_access[u], > - uniform_64_bit_size, > - push_constant_loc, pull_constant_loc, > - &num_push_constants, &num_pull_constants, > - max_push_components, max_chunk_size, > - compiler->supports_pull_constants, > - stage_prog_data); > + /* Skip subgroup_id_index to put it in the last push register. */ > + if (subgroup_id_index == (int)u) > + continue; > > - } > + if (chunk_start == -1) { > + chunk_start = u; > + align = slots[u].align; > + } else { > + /* Offset into the chunk */ > + unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; > > - /* Then push the rest of uniforms */ > - const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F); > - for (unsigned u = 0; u < uniforms; u++) { > - if (!is_live[u]) > - continue; > + /* Shift the slot alignment down by the chunk offset so it is > + * comparable with the base chunk alignment. > + */ > + struct cplx_align slot_align = slots[u].align; > + slot_align.offset = > + (slot_align.offset - chunk_offset) & (align.mul - 1); > > - /* Skip subgroup_id_index to put it in the last push register. */ > - if (subgroup_id_index == (int)u) > + align = cplx_align_combine(align, slot_align); > + } > + > + /* Sanity check the alignment */ > + cplx_align_assert_sane(align); > + > + if (slots[u].contiguous) > continue; > > - set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize, > - contiguous[u], bitsize_access[u], > - uniform_32_bit_size, > - push_constant_loc, pull_constant_loc, > - &num_push_constants, &num_pull_constants, > - max_push_components, max_chunk_size, > - compiler->supports_pull_constants, > - stage_prog_data); > + /* Adjust the alignment to be in terms of slots, not bytes */ > + assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); > + assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); > + align.mul /= UNIFORM_SLOT_SIZE; > + align.offset /= UNIFORM_SLOT_SIZE; > + > + unsigned push_start_align = cplx_align_apply(align, num_push_constants); > + unsigned chunk_size = u - chunk_start + 1; > + if (!compiler->supports_pull_constants || > + (chunk_size < max_chunk_size && > + push_start_align + chunk_size <= max_push_components)) { > + /* Align up the number of push constants */ > + num_push_constants = push_start_align; > + for (unsigned i = 0; i < chunk_size; i++) > + push_constant_loc[chunk_start + i] = num_push_constants++; > + } else { > + /* We need to pull this one */ > + num_pull_constants = cplx_align_apply(align, num_pull_constants); > + for (unsigned i = 0; i < chunk_size; i++) > + pull_constant_loc[chunk_start + i] = num_pull_constants++; > + } > + > + /* Reset the chunk and start again */ > + chunk_start = -1; > } > > /* Add the CS local thread ID uniform at the end of the push constants */ > @@ -2131,8 +2172,8 @@ fs_visitor::assign_constant_locations() > uint32_t *param = stage_prog_data->param; > stage_prog_data->nr_params = num_push_constants; > if (num_push_constants) { > - stage_prog_data->param = ralloc_array(mem_ctx, uint32_t, > - num_push_constants); > + stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, > + num_push_constants); > } else { > stage_prog_data->param = NULL; > } > @@ -2140,8 +2181,8 @@ fs_visitor::assign_constant_locations() > assert(stage_prog_data->pull_param == NULL); > if (num_pull_constants > 0) { > stage_prog_data->nr_pull_params = num_pull_constants; > - stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t, > - num_pull_constants); > + stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, > + num_pull_constants); > } > > /* Now that we know how many regular uniforms we'll push, reduce the > -- > 2.5.0.400.gff86faf > </div></div>> _______________________________________________ > mesa-dev mailing list > <a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a> > <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a> </blockquote></div> </div></div>