Mesa (main): radv,aco,ac/llvm: use nir_op_f{sin,cos}_amd

Thu Jul 7 23:01:08 UTC 2022

Module: Mesa
Branch: main
Commit: 48578713b78e20bd1706cf65838fa6586ba35a63
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=48578713b78e20bd1706cf65838fa6586ba35a63

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Mon May  3 11:10:06 2021 +0100

radv,aco,ac/llvm: use nir_op_f{sin,cos}_amd

This lets NIR optimize the multiplication, particularly sin/cos(a * #b).

fossil-db (Sienna Cichlid):
Totals from 12306 (7.58% of 162293) affected shaders:
MaxWaves: 224814 -> 224834 (+0.01%)
Instrs: 17365273 -> 17338758 (-0.15%); split: -0.16%, +0.00%
CodeSize: 93478488 -> 93354912 (-0.13%); split: -0.14%, +0.01%
VGPRs: 752080 -> 752072 (-0.00%); split: -0.00%, +0.00%
SpillSGPRs: 8440 -> 8410 (-0.36%)
Latency: 200402154 -> 200279405 (-0.06%); split: -0.06%, +0.00%
InvThroughput: 37588077 -> 37545545 (-0.11%); split: -0.11%, +0.00%
VClause: 293863 -> 293874 (+0.00%); split: -0.03%, +0.03%
SClause: 619539 -> 619064 (-0.08%); split: -0.09%, +0.01%
Copies: 1151591 -> 1151641 (+0.00%); split: -0.04%, +0.05%
Branches: 506434 -> 506437 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 877609 -> 877517 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 711938 -> 711940 (+0.00%); split: -0.00%, +0.00%

fossil-db (LLVM, Sienna Cichlid):
Totals from 4377 (3.59% of 121873) affected shaders:
SGPRs: 358960 -> 359176 (+0.06%); split: -0.18%, +0.25%
VGPRs: 319832 -> 319720 (-0.04%); split: -0.18%, +0.15%
SpillSGPRs: 46983 -> 47007 (+0.05%); split: -0.99%, +1.04%
CodeSize: 30872812 -> 30764512 (-0.35%); split: -0.39%, +0.04%
MaxWaves: 73814 -> 73904 (+0.12%); split: +0.25%, -0.13%

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10587>

---

 src/amd/compiler/aco_instruction_selection.cpp       | 19 +++++++------------
 src/amd/compiler/aco_instruction_selection_setup.cpp |  4 ++--
 src/amd/llvm/ac_nir_to_llvm.c                        | 10 ++++++++++
 src/amd/vulkan/radv_shader.c                         | 17 +++++++++++++++++
 4 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 34e82a11b7f..442b187fc8f 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -2750,27 +2750,22 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
       }
       break;
    }
-   case nir_op_fsin:
-   case nir_op_fcos: {
+   case nir_op_fsin_amd:
+   case nir_op_fcos_amd: {
       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
       aco_ptr<Instruction> norm;
       if (dst.regClass() == v2b) {
-         Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3118u));
-         Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), half_pi, src);
          aco_opcode opcode =
-            instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
-         bld.vop1(opcode, Definition(dst), tmp);
+            instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
+         bld.vop1(opcode, Definition(dst), src);
       } else if (dst.regClass() == v1) {
-         Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3e22f983u));
-         Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
-
          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
          if (ctx->options->gfx_level < GFX9)
-            tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
+            src = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), src);
 
          aco_opcode opcode =
-            instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
-         bld.vop1(opcode, Definition(dst), tmp);
+            instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
+         bld.vop1(opcode, Definition(dst), src);
       } else {
          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
       }
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 99ebec2031e..719c5b8b978 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -501,8 +501,8 @@ init_context(isel_context* ctx, nir_shader* shader)
                case nir_op_fceil:
                case nir_op_ftrunc:
                case nir_op_fround_even:
-               case nir_op_fsin:
-               case nir_op_fcos:
+               case nir_op_fsin_amd:
+               case nir_op_fcos_amd:
                case nir_op_f2f16:
                case nir_op_f2f16_rtz:
                case nir_op_f2f16_rtne:
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index fb09193443b..53f6242ae33 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -854,6 +854,16 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
       result =
          emit_intrin_1f_param(&ctx->ac, "llvm.cos", ac_to_float_type(&ctx->ac, def_type), src[0]);
       break;
+   case nir_op_fsin_amd:
+   case nir_op_fcos_amd:
+      /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
+      if (ctx->ac.gfx_level < GFX9)
+         src[0] = emit_intrin_1f_param_scalar(&ctx->ac, "llvm.amdgcn.fract",
+                                              ac_to_float_type(&ctx->ac, def_type), src[0]);
+      result =
+         emit_intrin_1f_param(&ctx->ac, instr->op == nir_op_fsin_amd ? "llvm.amdgcn.sin" : "llvm.amdgcn.cos",
+                              ac_to_float_type(&ctx->ac, def_type), src[0]);
+      break;
    case nir_op_fsqrt:
       result =
          emit_intrin_1f_param(&ctx->ac, "llvm.sqrt", ac_to_float_type(&ctx->ac, def_type), src[0]);
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 3977ac031bb..afe76d6e65e 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -639,6 +639,21 @@ radv_lower_ms_workgroup_id(nir_shader *nir)
    return progress;
 }
 
+static bool
+is_sincos(const nir_instr *instr, const void *_)
+{
+   return instr->type == nir_instr_type_alu &&
+          (nir_instr_as_alu(instr)->op == nir_op_fsin || nir_instr_as_alu(instr)->op == nir_op_fcos);
+}
+
+static nir_ssa_def *
+lower_sincos(struct nir_builder *b, nir_instr *instr, void *_)
+{
+   nir_alu_instr *sincos = nir_instr_as_alu(instr);
+   nir_ssa_def *src = nir_fmul_imm(b, nir_ssa_for_alu_src(b, sincos, 0), 0.15915493667125702);
+   return sincos->op == nir_op_fsin ? nir_fsin_amd(b, src) : nir_fcos_amd(b, src);
+}
+
 nir_shader *
 radv_shader_spirv_to_nir(struct radv_device *device, const struct radv_pipeline_stage *stage,
                          const struct radv_pipeline_key *key)
@@ -849,6 +864,8 @@ radv_shader_spirv_to_nir(struct radv_device *device, const struct radv_pipeline_
       }
 
       NIR_PASS(_, nir, nir_lower_doubles, NULL, lower_doubles);
+
+      NIR_PASS(_, nir, nir_shader_lower_instructions, &is_sincos, &lower_sincos, NULL);
    }
 
    NIR_PASS(_, nir, nir_lower_system_values);