Mesa (master): freedreno/ir3: Enable the i/o vectorizer on UBOs.

Wed Sep 30 20:10:27 UTC 2020

Module: Mesa
Branch: master
Commit: 49ec863e8303170fd2a871689f9d9366215dca7e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=49ec863e8303170fd2a871689f9d9366215dca7e

Author: Eric Anholt <eric at anholt.net>
Date:   Thu Aug 20 13:29:58 2020 -0700

freedreno/ir3: Enable the i/o vectorizer on UBOs.

This will merge loads of UBO components together into vec4 loads.  At the
same time, it improves the alignment information on our loads, fixing the
regression from the vec3 loads fix.

shader-db results:
total instructions in shared programs: 12829370 -> 8755851 (-31.75%)
total cat6 in shared programs: 145840 -> 97027 (-33.47%)

Overall results from before the vec3 fix:
total instructions in shared programs: 8019997 -> 8755851 (9.18%)
total cat6 in shared programs: 87683 -> 97027 (10.66%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6612>

---

 src/freedreno/ir3/ir3_nir.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 64daa685705..2628746a302 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -145,6 +145,36 @@ ir3_get_compiler_options(struct ir3_compiler *compiler)
 	return &options;
 }
 
+static bool
+ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
+		unsigned bit_size,
+		unsigned num_components,
+		nir_intrinsic_instr *low,
+		nir_intrinsic_instr *high)
+{
+	assert(bit_size >= 8);
+	if (bit_size != 32)
+		return false;
+	unsigned byte_size = bit_size / 8;
+
+	int size = num_components * byte_size;
+
+	/* Don't care about alignment past vec4. */
+	assert(util_is_power_of_two_nonzero(align_mul));
+	align_mul = MIN2(align_mul, 16);
+	align_offset &= 15;
+
+	/* Our offset alignment should always be at least 4 bytes */
+	if (align_mul < 4)
+		return false;
+
+	unsigned worst_start_offset = 16 - align_mul + align_offset;
+	if (worst_start_offset + size > 16)
+		return false;
+
+	return true;
+}
+
 #define OPT(nir, pass, ...) ({                             \
    bool this_progress = false;                             \
    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
@@ -188,6 +218,9 @@ ir3_optimize_loop(nir_shader *s)
 		progress |= OPT(s, nir_lower_pack);
 		progress |= OPT(s, nir_opt_constant_folding);
 
+		progress |= OPT(s, nir_opt_load_store_vectorize, nir_var_mem_ubo,
+				ir3_nir_should_vectorize_mem, 0);
+
 		if (lower_flrp != 0) {
 			if (OPT(s, nir_lower_flrp,
 					lower_flrp,