Mesa (main): pan/bi: Interpolate varyings at 16-bit

Mon May 30 22:27:58 UTC 2022

Module: Mesa
Branch: main
Commit: 017050062760220d269d81da1b1e06df5d63f191
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=017050062760220d269d81da1b1e06df5d63f191

Author: Alyssa Rosenzweig <alyssa at collabora.com>
Date:   Fri May 27 14:39:14 2022 -0400

pan/bi: Interpolate varyings at 16-bit

On Bifrost, we have a single "load float varying" instruction that controls the
bit size of the result, allowing us to fold a f2f16 into the load. However, the
larger benefit is that 16-bit varying loads are interpolated at 16-bit. Arm
claims that the varying unit has a 32-bit data path, allowing 16-bit varyings to
be interpolated in half the cycles of 32-bit varyings. This change should therefore
improve performance for workloads that are bound on the varying unit. This means we want to
be aggressive about 16-bit varying loads, even if it costs some extra f2f32
instructions.

glmark2 total score on Mali-G52 up from 1173fps to 1218fps with particular wins
in -brefract, -bshadow, -bjellyfish, and -bshading.

total instructions in shared programs: 2432246 -> 2423668 (-0.35%)
instructions in affected programs: 516056 -> 507478 (-1.66%)
helped: 3641
HURT: 432
helped stats (abs) min: 1.0 max: 12.0 x̄: 2.91 x̃: 2
helped stats (rel) min: 0.08% max: 54.55% x̄: 9.88% x̃: 5.71%
HURT stats (abs)   min: 1.0 max: 42.0 x̄: 4.71 x̃: 4
HURT stats (rel)   min: 0.23% max: 200.00% x̄: 12.58% x̃: 6.37%
95% mean confidence interval for instructions value: -2.21 -2.00
95% mean confidence interval for instructions %-change: -7.92% -7.07%
Instructions are helped.

total tuples in shared programs: 1941309 -> 1934647 (-0.34%)
tuples in affected programs: 353169 -> 346507 (-1.89%)
helped: 3233
HURT: 453
helped stats (abs) min: 1.0 max: 14.0 x̄: 2.46 x̃: 2
helped stats (rel) min: 0.12% max: 50.00% x̄: 9.90% x̃: 5.56%
HURT stats (abs)   min: 1.0 max: 25.0 x̄: 2.85 x̃: 2
HURT stats (rel)   min: 0.22% max: 150.00% x̄: 8.96% x̃: 5.26%
95% mean confidence interval for tuples value: -1.89 -1.72
95% mean confidence interval for tuples %-change: -8.01% -7.15%
Tuples are helped.

total clauses in shared programs: 357354 -> 356610 (-0.21%)
clauses in affected programs: 25794 -> 25050 (-2.88%)
helped: 994
HURT: 317
helped stats (abs) min: 1.0 max: 3.0 x̄: 1.16 x̃: 1
helped stats (rel) min: 1.49% max: 33.33% x̄: 10.78% x̃: 10.00%
HURT stats (abs)   min: 1.0 max: 4.0 x̄: 1.31 x̃: 1
HURT stats (rel)   min: 1.19% max: 50.00% x̄: 13.56% x̃: 8.33%
95% mean confidence interval for clauses value: -0.63 -0.50
95% mean confidence interval for clauses %-change: -5.63% -4.16%
Clauses are helped.

total cycles in shared programs: 167697.96 -> 167431.15 (-0.16%)
cycles in affected programs: 12638.29 -> 12371.48 (-2.11%)
helped: 2652
HURT: 350
helped stats (abs) min: 0.04166399999999726 max: 0.75 x̄: 0.11 x̃: 0
helped stats (rel) min: 0.12% max: 100.00% x̄: 14.39% x̃: 5.04%
HURT stats (abs)   min: 0.041665999999999315 max: 0.5833329999999997 x̄: 0.11 x̃: 0
HURT stats (rel)   min: 0.00% max: 75.00% x̄: 7.90% x̃: 4.71%
95% mean confidence interval for cycles value: -0.09 -0.08
95% mean confidence interval for cycles %-change: -12.56% -11.02%
Cycles are helped.

total arith in shared programs: 74169.46 -> 73891.71 (-0.37%)
arith in affected programs: 13885.87 -> 13608.12 (-2.00%)
helped: 3215
HURT: 445
helped stats (abs) min: 0.04166399999999726 max: 0.5416680000000014 x̄: 0.10 x̃: 0
helped stats (rel) min: 0.12% max: 100.00% x̄: 14.16% x̃: 6.67%
HURT stats (abs)   min: 0.041665999999999315 max: 1.125 x̄: 0.12 x̃: 0
HURT stats (rel)   min: 0.00% max: 100.00% x̄: 9.76% x̃: 5.49%
95% mean confidence interval for arith value: -0.08 -0.07
95% mean confidence interval for arith %-change: -11.91% -10.59%
Arith are helped.

total texture in shared programs: 11936 -> 11931 (-0.04%)
texture in affected programs: 20 -> 15 (-25.00%)
helped: 10
HURT: 0
helped stats (abs) min: 0.5 max: 0.5 x̄: 0.50 x̃: 0
helped stats (rel) min: 14.29% max: 100.00% x̄: 45.71% x̃: 33.33%
95% mean confidence interval for texture value: -0.50 -0.50
95% mean confidence interval for texture %-change: -73.16% -18.26%
Texture are helped.

total vary in shared programs: 4180.88 -> 3447.19 (-17.55%)
vary in affected programs: 2109.88 -> 1376.19 (-34.77%)
helped: 2202
HURT: 39
helped stats (abs) min: 0.0625 max: 1.4375 x̄: 0.34 x̃: 0
helped stats (rel) min: 2.38% max: 66.67% x̄: 40.43% x̃: 50.00%
HURT stats (abs)   min: 0.125 max: 0.375 x̄: 0.26 x̃: 0
HURT stats (rel)   min: 0.00% max: 300.00% x̄: 92.54% x̃: 23.08%
95% mean confidence interval for vary value: -0.34 -0.32
95% mean confidence interval for vary %-change: -39.22% -37.01%
Vary are helped.

total quadwords in shared programs: 1689664 -> 1684852 (-0.28%)
quadwords in affected programs: 265522 -> 260710 (-1.81%)
helped: 2864
HURT: 447
helped stats (abs) min: 1.0 max: 14.0 x̄: 2.10 x̃: 2
helped stats (rel) min: 0.15% max: 31.58% x̄: 6.05% x̃: 4.65%
HURT stats (abs)   min: 1.0 max: 22.0 x̄: 2.67 x̃: 2
HURT stats (rel)   min: 0.27% max: 38.46% x̄: 6.79% x̃: 4.55%
95% mean confidence interval for quadwords value: -1.54 -1.37
95% mean confidence interval for quadwords %-change: -4.55% -4.08%
Quadwords are helped.

total threads in shared programs: 53656 -> 53688 (0.06%)
threads in affected programs: 32 -> 64 (100.00%)
helped: 32
HURT: 0
helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1
helped stats (rel) min: 100.00% max: 100.00% x̄: 100.00% x̃: 100.00%
95% mean confidence interval for threads value: 1.00 1.00
95% mean confidence interval for threads %-change: 100.00% 100.00%
Threads are helped.

total preloads in shared programs: 116212 -> 103476 (-10.96%)
preloads in affected programs: 45222 -> 32486 (-28.16%)
helped: 3022
HURT: 11
helped stats (abs) min: 1.0 max: 11.0 x̄: 4.23 x̃: 4
helped stats (rel) min: 7.14% max: 68.75% x̄: 30.39% x̃: 25.00%
HURT stats (abs)   min: 2.0 max: 4.0 x̄: 3.45 x̃: 4
HURT stats (rel)   min: 14.29% max: 50.00% x̄: 25.93% x̃: 25.00%
95% mean confidence interval for preloads value: -4.26 -4.14
95% mean confidence interval for preloads %-change: -30.68% -29.69%
Preloads are helped.

Signed-off-by: Alyssa Rosenzweig <alyssa at collabora.com>
Tested-by: Chris Healy <cphealy at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16752>

---

 src/panfrost/bifrost/bifrost_compile.c | 75 +++++++++++++++++++++++++++++++++-
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 305d6e19db0..b39e4b1eb5b 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -4748,6 +4748,76 @@ bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
         }
 }
 
+/*
+ * Build a bit mask of varyings (by location) that are flatshaded. This
+ * information is needed by lower_mediump_io, as we don't yet support 16-bit
+ * flat varyings.
+ *
+ * Also varyings that are used as texture coordinates should be kept at fp32 so
+ * the texture instruction may be promoted to VAR_TEX. In general this is a good
+ * idea, as fp16 texture coordinates are not supported by the hardware and are
+ * usually inappropriate. (There are even CTS bugs relevant to both points.)
+ *
+ * TODO: If we compacted the varyings with some fixup code in the vertex shader,
+ * we could implement 16-bit flat varyings. Consider if this case matters.
+ *
+ * TODO: The texture coordinate handling could be less heavyhanded.
+ */
+static bool
+bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data) /* b is unused */
+{
+        uint64_t *mask = data; /* caller-owned mask, one bit per location */
+
+        if (instr->type != nir_instr_type_tex) /* only tex instructions matter */
+                return false;
+
+        nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+        int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
+        if (coord_idx < 0) /* presumably: no coord source, nothing to gather */
+                return false;
+
+        nir_src src = tex->src[coord_idx].src;
+        assert(src.is_ssa);
+
+        nir_ssa_scalar x = nir_ssa_scalar_resolved(src.ssa, 0); /* component 0 */
+        nir_ssa_scalar y = nir_ssa_scalar_resolved(src.ssa, 1); /* component 1 */
+
+        if (x.def != y.def) /* both components must come from one def */
+                return false;
+
+        nir_instr *parent = x.def->parent_instr;
+
+        if (parent->type != nir_instr_type_intrinsic)
+                return false;
+
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
+
+        if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
+                return false; /* coordinate is not an interpolated input load */
+
+        nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+        *mask |= BITFIELD64_BIT(sem.location); /* record the varying's location */
+        return false; /* every path returns false; the only effect is on *mask */
+}
+
+static uint64_t
+bi_fp32_varying_mask(nir_shader *nir) /* returns locations to keep at fp32 */
+{
+        uint64_t mask = 0;
+
+        assert(nir->info.stage == MESA_SHADER_FRAGMENT); /* fragment shaders only */
+
+        nir_foreach_shader_in_variable(var, nir) {
+                if (var->data.interpolation == INTERP_MODE_FLAT) /* flat input */
+                        mask |= BITFIELD64_BIT(var->data.location);
+        }
+        /* Also OR in the locations gathered by bi_gather_texcoords. */
+        nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all, &mask);
+
+        return mask; /* the visible caller complements this for nir_lower_mediump_io */
+}
+
 static void
 bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
 {
@@ -4790,8 +4860,9 @@ bi_finalize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
         NIR_PASS_V(nir, nir_opt_constant_folding);
 
         if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-                NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
-                                ~0, false);
+                NIR_PASS_V(nir, nir_lower_mediump_io,
+                           nir_var_shader_in | nir_var_shader_out,
+                           ~bi_fp32_varying_mask(nir), false);
         } else {
                 if (gpu_id >= 0x9000) {
                         NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,