Mesa (main): radv: lower primitive shading rate in NIR

Mon Jul 12 18:11:13 UTC 2021

Module: Mesa
Branch: main
Commit: ee79b87c62f0187daac3f8498db924429e0b5204
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ee79b87c62f0187daac3f8498db924429e0b5204

Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date:   Thu Jun 24 15:59:45 2021 +0200

radv: lower primitive shading rate in NIR

This allows more potential compiler optimizations if the value is a
constant or from a scalar load.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11579>

---

 src/amd/compiler/aco_instruction_selection.cpp | 28 +-----------
 src/amd/vulkan/radv_nir_to_llvm.c              | 25 +---------
 src/amd/vulkan/radv_shader.c                   | 63 ++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 51 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 403c785f5c6..2af31108aae 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -10355,33 +10355,7 @@ static void export_vs_psiz_layer_viewport_vrs(isel_context *ctx, int *next_pos)
       }
    }
    if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) {
-      Builder bld(ctx->program, ctx->block);
-      Temp cond;
-
-      /* xRate = (shadingRate & (Horizontal2Pixels | Horizontal4Pixels)) ? 0x1 : 0x0; */
-      Temp x_rate = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(12u),
-                             Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]));
-      cond = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), Operand(x_rate));
-      x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
-                        bld.copy(bld.def(v1), Operand(0u)),
-                        bld.copy(bld.def(v1), Operand(1u)), cond);
-
-      /* yRate = (shadingRate & (Vertical2Pixels | Vertical4Pixels)) ? 0x1 : 0x0; */
-      Temp y_rate = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(3u),
-                             Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]));
-      cond = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), Operand(y_rate));
-      y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
-                        bld.copy(bld.def(v1), Operand(0u)),
-                        bld.copy(bld.def(v1), Operand(1u)), cond);
-
-      /* Bits [2:3] = VRS rate X
-       * Bits [4:5] = VRS rate Y
-       * HW shading rate = (xRate << 2) | (yRate << 4)
-       */
-      y_rate = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(4u), Operand(y_rate));
-      Temp out = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), Operand(x_rate), Operand(2u), Operand(y_rate));
-
-      exp->operands[1] = Operand(out);
+      exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]);
       exp->enabled_mask |= 0x2;
    } else if (ctx->options->force_vrs_rates) {
       /* Bits [2:3] = VRS rate X
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index caccff0b097..dff9f635756 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -1331,30 +1331,7 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_v
       }
 
       if (outinfo->writes_primitive_shading_rate) {
-         LLVMValueRef v = ac_to_integer(&ctx->ac, primitive_shading_rate);
-         LLVMValueRef cond;
-
-         /* xRate = (shadingRate & (Horizontal2Pixels | Horizontal4Pixels)) ? 0x1 : 0x0; */
-         LLVMValueRef x_rate =
-            LLVMBuildAnd(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 4 | 8, false), "");
-         cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, x_rate, ctx->ac.i32_0, "");
-         x_rate = LLVMBuildSelect(ctx->ac.builder, cond, ctx->ac.i32_1, ctx->ac.i32_0, "");
-
-         /* yRate = (shadingRate & (Vertical2Pixels | Vertical4Pixels)) ? 0x1 : 0x0; */
-         LLVMValueRef y_rate =
-            LLVMBuildAnd(ctx->ac.builder, v, LLVMConstInt(ctx->ac.i32, 1 | 2, false), "");
-         cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, y_rate, ctx->ac.i32_0, "");
-         y_rate = LLVMBuildSelect(ctx->ac.builder, cond, ctx->ac.i32_1, ctx->ac.i32_0, "");
-
-         /* Bits [2:3] = VRS rate X
-          * Bits [4:5] = VRS rate Y
-          * HW shading rate = (xRate << 2) | (yRate << 4)
-          */
-         v = LLVMBuildOr(
-            ctx->ac.builder,
-            LLVMBuildShl(ctx->ac.builder, x_rate, LLVMConstInt(ctx->ac.i32, 2, false), ""),
-            LLVMBuildShl(ctx->ac.builder, y_rate, LLVMConstInt(ctx->ac.i32, 4, false), ""), "");
-         pos_args[1].out[1] = ac_to_float(&ctx->ac, v);
+         pos_args[1].out[1] = primitive_shading_rate;
       } else if (ctx->args->options->force_vrs_rates) {
          /* Bits [2:3] = VRS rate X
           * Bits [4:5] = VRS rate Y
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 72d793480e7..f1cb00f27ca 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -369,6 +369,62 @@ lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key,
    return progress;
 }
 
+static bool
+radv_lower_primitive_shading_rate(nir_shader *nir)
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   bool progress = false;
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   /* Iterate in reverse order since there should be only one deref store to PRIMITIVE_SHADING_RATE
+    * after lower_io_to_temporaries for vertex shaders.
+    */
+   nir_foreach_block_reverse(block, impl) {
+      nir_foreach_instr_reverse(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         if (intr->intrinsic != nir_intrinsic_store_deref)
+            continue;
+
+         nir_variable *var = nir_intrinsic_get_var(intr, 0);
+         if (var->data.mode != nir_var_shader_out ||
+             var->data.location != VARYING_SLOT_PRIMITIVE_SHADING_RATE)
+            continue;
+
+         b.cursor = nir_before_instr(instr);
+
+         nir_ssa_def *val = nir_ssa_for_src(&b, intr->src[1], 1);
+
+         /* x_rate = (shadingRate & (Horizontal2Pixels | Horizontal4Pixels)) ? 0x1 : 0x0; */
+         nir_ssa_def *x_rate = nir_iand(&b, val, nir_imm_int(&b, 12));
+         x_rate = nir_b2i32(&b, nir_ine(&b, x_rate, nir_imm_int(&b, 0)));
+
+         /* y_rate = (shadingRate & (Vertical2Pixels | Vertical4Pixels)) ? 0x1 : 0x0; */
+         nir_ssa_def *y_rate = nir_iand(&b, val, nir_imm_int(&b, 3));
+         y_rate = nir_b2i32(&b, nir_ine(&b, y_rate, nir_imm_int(&b, 0)));
+
+         /* Bits [2:3] = VRS rate X
+          * Bits [4:5] = VRS rate Y
+          * HW shading rate = (x_rate << 2) | (y_rate << 4)
+          */
+         nir_ssa_def *out = nir_ior(&b, nir_ishl(&b, x_rate, nir_imm_int(&b, 2)),
+                                        nir_ishl(&b, y_rate, nir_imm_int(&b, 4)));
+
+         nir_instr_rewrite_src(&intr->instr, &intr->src[1], nir_src_for_ssa(out));
+
+         progress = true;
+         if (nir->info.stage == MESA_SHADER_VERTEX)
+            return progress;
+      }
+   }
+
+   return progress;
+}
+
 nir_shader *
 radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module *module,
                            const char *entrypoint_name, gl_shader_stage stage,
@@ -699,6 +755,13 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module *
     */
    NIR_PASS_V(nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
 
+   /* Lower primitive shading rate to match HW requirements. */
+   if ((nir->info.stage == MESA_SHADER_VERTEX ||
+        nir->info.stage == MESA_SHADER_GEOMETRY) &&
+       nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE)) {
+      NIR_PASS_V(nir, radv_lower_primitive_shading_rate);
+   }
+
    /* Indirect lowering must be called after the radv_optimize_nir() loop
     * has been called at least once. Otherwise indirect lowering can
     * bloat the instruction count of the loop and cause it to be