Mesa (main): broadcom/compiler: implement nir_intrinsic_load_subgroup_id correctly

Tue Jun 29 07:03:32 UTC 2021

Module: Mesa
Branch: main
Commit: 30dec8b414ef6113ba36726e3e72915a7b7b5288
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=30dec8b414ef6113ba36726e3e72915a7b7b5288

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Tue Jun 22 10:55:04 2021 +0200

broadcom/compiler: implement nir_intrinsic_load_subgroup_id correctly

For some reason, this was implemented with the bulk of the compute
shader enablement, but this intrinsic is specific to subgroups and
thus was not really used. Also, its implementation was not correct,
since it was returning the element index within the subgroup, not
the subgroup index itself, which is the index of the batch in the
dispatch.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11620>

---

 src/broadcom/compiler/nir_to_vir.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index fced8e168de..de1cc9e6077 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -2773,6 +2773,13 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
         }
 }
 
+static inline struct qreg
+emit_load_local_invocation_index(struct v3d_compile *c)
+{
+        return vir_SHR(c, c->cs_payload[1],
+                       vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
+}
+
 static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
@@ -3034,12 +3041,6 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 }
                 break;
 
-        case nir_intrinsic_load_local_invocation_index:
-                ntq_store_dest(c, &instr->dest, 0,
-                               vir_SHR(c, c->cs_payload[1],
-                                       vir_uniform_ui(c, 32 - c->local_invocation_index_bits)));
-                break;
-
         case nir_intrinsic_load_workgroup_id: {
                 struct qreg x = vir_AND(c, c->cs_payload[0],
                                          vir_uniform_ui(c, 0xffff));
@@ -3066,10 +3067,24 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
         }
 
-        case nir_intrinsic_load_subgroup_id:
-                ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
+        case nir_intrinsic_load_local_invocation_index:
+                ntq_store_dest(c, &instr->dest, 0,
+                               emit_load_local_invocation_index(c));
                 break;
 
+        case nir_intrinsic_load_subgroup_id: {
+                /* This is basically the batch index, which is the Local
+                 * Invocation Index divided by the SIMD width.
+                 */
+                STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));
+                const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
+                struct qreg lii = emit_load_local_invocation_index(c);
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_SHR(c, lii,
+                                       vir_uniform_ui(c, divide_shift)));
+                break;
+        }
+
         case nir_intrinsic_load_per_vertex_input: {
                 /* The vertex shader writes all its used outputs into
                  * consecutive VPM offsets, so if any output component is