<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Fri, Feb 23, 2018 at 1:26 AM, Jose Maria Casanova Crespo <span dir="ltr"><<a href="mailto:jmcasanova@igalia.com" target="_blank">jmcasanova@igalia.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">16-bit load_ubo/ssbo operations that call do_untyped_read_vector doesn't<br>
guarantee that offsets are multiple of 4-bytes as required by untyped_read<br>
message. This happens for example on 16-bit scalar arrays and in the case<br>
of f16vec3 when then VK_KHR_relaxed_block_layoud is enabled.<br>
<br>
Vectors reads when we have non-constant offsets are implemented with<br>
multiple byte_scattered_read messages that not require 32-bit aligned offsets.<br>
The same applies for constant offsets not aligned to 32-bits.<br>
<br>
Untyped_read_surface is used message when there is a constant offset<br>
32-bit aligned and we have more than 1 component to read.<br>
---<br>
src/intel/compiler/brw_fs_nir.<wbr>cpp | 60 ++++++++++++++++++++++++++++--<wbr>---------<br>
1 file changed, 44 insertions(+), 16 deletions(-)<br>
<br>
diff --git a/src/intel/compiler/brw_fs_<wbr>nir.cpp b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
index 8efec34cc9..45b8e8b637 100644<br>
--- a/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
+++ b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
@@ -2305,27 +2305,55 @@ do_untyped_vector_read(const fs_builder &bld,<br>
if (type_sz(dest.type) <= 2) {<br>
assert(dest.stride == 1);<br>
<br>
- if (num_components > 1) {<br>
- /* Pairs of 16-bit components can be read with untyped read, for 16-bit<br>
- * vec3 4th component is ignored.<br>
+ unsigned pending_components = num_components;<br>
+ unsigned first_component = 0;<br>
+ boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;<br>
+ fs_reg read_offset;<br>
+ if (is_const_offset)<br>
+ read_offset = offset_reg;<br>
+ else {<br>
+ read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD)<wbr>;<br>
+ bld.MOV(read_offset, offset_reg);<br>
+ }<br>
+ while (pending_components > 0 &&<br>
+ (pending_components == 1 ||<br>
+ !is_const_offset ||<br>
+ (offset_reg.ud + first_component * 2) % 4)) {<br></blockquote><div><br></div><div>I think this would be easier to read as<br><br></div><div>while (pending_components > 0) {<br></div><div> if (!is_const_offset || (offset_reg.ud + first_component * type_sz(dst.type)) % 4) {<br></div><div> /* Do a single-component load with a byte message */<br></div><div> first_compoent++;<br></div><div> pending_components--;<br></div><div> } else {<br></div><div> /* Do a load with an untyped read */<br></div><div> pending_components = 0;<br></div><div> break;<br></div><div> }<br>}<br><br></div><div>Or, alternatively,<br><br></div><div>if (is_const_offset) {<br></div><div> uint32_t start = offset_reg.ud & 3;<br></div><div> uint32_t end = offset_reg + num_components * type_sz(dst.type);<br></div><div> end = ALIGN(end, 4);<br></div><div> assert(end - start <= 16);<br></div><div> /* Do a load with an untyped read and then throw away unneeded data at the beginning and the end */<br></div><div>} else {<br></div><div> /* Do the load using a loop and byte scattered reads */<br></div><div>}<br><br></div><div>This second one would require a small extension to shuffle_32bit_load_result_to_16bit_data to handle throwing away data at the start.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ /* Non constant offsets, 16-bit scalars and constant offsets not<br>
+ * aligned 32-bits are read using one byte_scattered_read message<br>
+ * for eache component untyped_read requires 32-bit aligned offsets.<br>
*/<br>
fs_reg read_result =<br>
- emit_untyped_read(bld, surf_index, offset_reg,<br>
- 1 /* dims */, DIV_ROUND_UP(num_components, 2),<br>
- BRW_PREDICATE_NONE);<br>
- shuffle_32bit_load_result_to_<wbr>16bit_data(bld,<br>
- retype(dest, BRW_REGISTER_TYPE_W),<br>
- retype(read_result, BRW_REGISTER_TYPE_D),<br>
- num_components);<br>
- } else {<br>
- assert(num_components == 1);<br>
- /* scalar 16-bit are read using one byte_scattered_read message */<br>
- fs_reg read_result =<br>
- emit_byte_scattered_read(bld, surf_index, offset_reg,<br>
+ emit_byte_scattered_read(bld, surf_index, read_offset,<br>
1 /* dims */, 1,<br>
type_sz(dest.type) * 8 /* bit_size */,<br>
BRW_PREDICATE_NONE);<br>
- bld.MOV(dest, subscript(read_result, dest.type, 0));<br>
+ shuffle_32bit_load_result_to_<wbr>16bit_data(bld,<br>
+ retype(offset(dest, bld, first_component), BRW_REGISTER_TYPE_W),<br>
+ retype(read_result, BRW_REGISTER_TYPE_D),<br>
+ 1);<br>
+ pending_components--;<br>
+ first_component ++;<br>
+ if (is_const_offset)<br>
+ read_offset.ud += 2;<br>
+ else<br>
+ bld.ADD(read_offset, offset_reg, brw_imm_ud(2 * first_component));<br>
+ }<br>
+ assert(pending_components != 1);<br>
+ if (pending_components > 1) {<br>
+ assert (is_const_offset &&<br>
+ (offset_reg.ud + first_component * 2) % 4 == 0);<br>
+ /* At this point we have multiple 16-bit components that have constant<br>
+ * offset multiple of 4-bytes that can be read with untyped_reads.<br>
+ */<br>
+ fs_reg read_result =<br>
+ emit_untyped_read(bld, surf_index, read_offset,<br>
+ 1 /* dims */, DIV_ROUND_UP(pending_<wbr>components, 2),<br>
+ BRW_PREDICATE_NONE);<br>
+ shuffle_32bit_load_result_to_<wbr>16bit_data(bld,<br>
+ retype(offset(dest,bld,first_<wbr>component), BRW_REGISTER_TYPE_W),<br>
+ retype(read_result, BRW_REGISTER_TYPE_D),<br>
+ pending_components);<br>
}<br>
} else if (type_sz(dest.type) == 4) {<br>
fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,<br>
<span class="HOEnZb"><font color="#888888">--<br>
2.14.3<br>
<br>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div></div>