Mesa (master): aco: Fix reductions on GFX10.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Oct 29 01:02:11 UTC 2019


Module: Mesa
Branch: master
Commit: 3865448012b16d0e98e706e1b462242a754436c7
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3865448012b16d0e98e706e1b462242a754436c7

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Thu Sep 12 19:28:52 2019 +0100

aco: Fix reductions on GFX10.

Fixes p_reduce (all cluster sizes), p_inclusive_scan and p_exclusive_scan
with all reduction operations.

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>

---

 src/amd/compiler/aco_ir.h                  |  2 +-
 src/amd/compiler/aco_lower_to_hw_instr.cpp | 92 ++++++++++++++++++++++++++----
 src/amd/compiler/aco_reduce_assign.cpp     | 19 +++---
 3 files changed, 95 insertions(+), 18 deletions(-)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 739ef869e6a..90fc3c6fe36 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -841,7 +841,7 @@ enum ReduceOp {
  * Operand(2): vector temporary
  * Definition(0): result
  * Definition(1): scalar temporary
- * Definition(2): scalar identity temporary
+ * Definition(2): scalar identity temporary (not used to store identity on GFX10)
  * Definition(3): scc clobber
  * Definition(4): vcc clobber
  *
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 39585111954..2cd451e48c5 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -85,6 +85,22 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1, Ph
    }
 }
 
+void emit_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1,
+             aco_opcode op, Format format, bool clobber_vcc, unsigned size)
+{
+   aco_ptr<Instruction> instr;
+   if (format == Format::VOP3)
+      instr.reset(create_instruction<VOP3A_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
+   else
+      instr.reset(create_instruction<VOP2_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
+   instr->operands[0] = Operand(src0, src0.reg >= 256 ? v1 : s1);
+   instr->operands[1] = Operand(src1, v1);
+   instr->definitions[0] = Definition(dst, v1);
+   if (clobber_vcc)
+      instr->definitions[1] = Definition(vcc, s2);
+   ctx->instructions.emplace_back(std::move(instr));
+}
+
 uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
 {
    switch (op) {
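
The new emit_op helper mirrors emit_dpp_op, but emits a plain (non-DPP)
VALU instruction computing dst = op(src0, src1). The src0.reg >= 256 check
lets src0 be either an SGPR or a VGPR, which the GFX10 paths below use to
fold a v_readlane_b32 result back into the vector temporary. A hypothetical
call, with names as in emit_reduction and v_add_f32 standing in for the
combine opcode:

    /* lane-wise tmp = v_add_f32(tmp, vtmp), one dword, no VCC clobber */
    emit_op(ctx, tmp, tmp, vtmp, aco_opcode::v_add_f32, Format::VOP2, false, 1);
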
@@ -236,12 +252,12 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
    Operand vcndmask_identity[2] = {identity[0], identity[1]};
 
    /* First, copy the source to tmp and set inactive lanes to the identity */
-   // note: this clobbers SCC!
    bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));
 
    for (unsigned i = 0; i < src.size(); i++) {
-      /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 */
-      if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
+      /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32
+       * except on GFX10, where v_writelane_b32 can take a literal. */
+      if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) {
          bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
          identity[i] = Operand(PhysReg{sitmp+i}, s1);
 
@@ -283,6 +299,16 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
          exec_restored = true;
          emit_vopn(ctx, dst.physReg(), vtmp, tmp, src.regClass(), reduce_opcode, format, should_clobber_vcc);
          dst_written = true;
+      } else if (ctx->program->chip_class >= GFX10) {
+         assert(cluster_size == 64);
+         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
+         for (unsigned i = 0; i < src.size(); i++)
+            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
+         emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
+
+         for (unsigned i = 0; i < src.size(); i++)
+            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+         emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
       } else {
          assert(cluster_size == 64);
          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
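
The GFX10 branch above works because the earlier quad_perm/mirror steps
leave every lane of a 16-lane row holding that row's full reduction, and
because of how v_permlanex16_b32 selects lanes (per our reading of the RDNA
ISA: src1 and src2 concatenate into sixteen 4-bit lane selects, and each
lane reads the selected lane of the opposite 16-lane half of its 32-lane
group, so the all-zero select used here broadcasts lane 0 of the other
half). A minimal host-side model in plain C++, not ACO code, with integer
addition standing in for the reduction operator:

    #include <cassert>
    #include <cstdint>

    int main()
    {
       uint32_t tmp[64], vtmp[64];
       /* state after the butterfly: every lane of a 16-lane row holds
        * that row's reduction of the inputs 1..64 */
       for (unsigned i = 0; i < 64; i++) {
          tmp[i] = 0;
          for (unsigned j = i & ~15u; j < (i & ~15u) + 16; j++)
             tmp[i] += j + 1;
       }

       /* v_permlanex16_b32, lane select all zero: read lane 0 of the
        * opposite 16-lane half of the 32-lane group */
       for (unsigned i = 0; i < 64; i++)
          vtmp[i] = tmp[(i & ~15u) ^ 16u];
       for (unsigned i = 0; i < 64; i++)    /* emit_op: tmp = op(tmp, vtmp) */
          tmp[i] += vtmp[i];

       /* v_readlane_b32 lane 31, then emit_op: tmp = op(sitmp, tmp) */
       uint32_t sitmp = tmp[31];            /* reduction of lanes 0..31 */
       for (unsigned i = 0; i < 64; i++)
          tmp[i] = sitmp + tmp[i];

       assert(tmp[63] == 64 * 65 / 2);      /* lane 63: the full wave sum */
       return 0;
    }

Only the upper half needs the correct value (the p_reduce result is taken
from the last lane); the lower lanes end up double-counting sitmp, which is
harmless here.
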
@@ -292,11 +318,38 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
       }
       break;
    case aco_opcode::p_exclusive_scan:
-      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
-                  dpp_wf_sr1, 0xf, 0xf, true, src.size());
+      if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
+         /* shift rows right */
+         for (unsigned i = 0; i < src.size(); i++) {
+            bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), dpp_row_sr(1), 0xf, 0xf, true);
+         }
+
+         /* fill in the gaps in rows 1 and 3 */
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u));
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u));
+         for (unsigned i = 0; i < src.size(); i++) {
+            Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
+                                         Definition(PhysReg{vtmp+i}, v1),
+                                         Operand(PhysReg{tmp+i}, v1),
+                                         Operand(0xffffffffu), Operand(0xffffffffu)).instr;
+            static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
+         }
+         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
+
+         /* fill in the gap in row 2 */
+         for (unsigned i = 0; i < src.size(); i++) {
+            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+            bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+         }
+         std::swap(tmp, vtmp);
+      } else {
+         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
+                     dpp_wf_sr1, 0xf, 0xf, true, src.size());
+      }
       for (unsigned i = 0; i < src.size(); i++) {
         if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
-            assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
+            if (ctx->program->chip_class < GFX10)
+               assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
             bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
                      identity[i], Operand(0u));
          }
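
The wf_sr1 replacement can be sanity-checked the same way. row_sr(1) with
bound_ctrl shifts each 16-lane row right by one and zero-fills lane 0 of
every row; the masked v_permlanex16_b32 (select all 0xf, i.e. lane 15 of
the opposite half, with FI set so the inactive source lanes are still
readable) patches lanes 16 and 48; and the readlane(31)/writelane(32) pair
patches lane 32. Together that is a whole-wave shift right by one. A
host-side sketch of the three steps:

    #include <cassert>
    #include <cstdint>

    int main()
    {
       uint32_t tmp[64], vtmp[64];
       for (unsigned i = 0; i < 64; i++)
          tmp[i] = i + 100;                /* arbitrary lane values */

       /* v_mov_b32 dpp row_sr(1), bound_ctrl: per-row shift right */
       for (unsigned i = 0; i < 64; i++)
          vtmp[i] = (i % 16) ? tmp[i - 1] : 0;

       /* exec = 0x0001000000010000 (lanes 16 and 48 only);
        * v_permlanex16_b32, select all 0xf, FI: lane 15 of other half */
       vtmp[16] = tmp[15];
       vtmp[48] = tmp[47];

       /* exec = ~0ull; v_readlane_b32 lane 31, v_writelane_b32 lane 32 */
       vtmp[32] = tmp[31];

       /* vtmp == tmp shifted right by one lane, i.e. dpp_wf_sr1 */
       assert(vtmp[0] == 0);
       for (unsigned i = 1; i < 64; i++)
          assert(vtmp[i] == tmp[i - 1]);
       return 0;
    }

The std::swap(tmp, vtmp) above then makes the shifted copy the working
value before the identity is written into lane 0.
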
@@ -312,10 +365,29 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
                   dpp_row_sr(4), 0xf, 0xf, false, src.size(), identity);
       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                   dpp_row_sr(8), 0xf, 0xf, false, src.size(), identity);
-      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
-                  dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
-      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
-                  dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
+      if (ctx->program->chip_class >= GFX10) {
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xffff0000u));
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffff0000u));
+         for (unsigned i = 0; i < src.size(); i++) {
+            Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
+                                         Definition(PhysReg{vtmp+i}, v1),
+                                         Operand(PhysReg{tmp+i}, v1),
+                                         Operand(0xffffffffu), Operand(0xffffffffu)).instr;
+            static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
+         }
+         emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
+
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
+         for (unsigned i = 0; i < src.size(); i++)
+            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+         emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
+      } else {
+         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                     dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
+         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                     dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
+      }
       break;
    default:
       unreachable("Invalid reduction mode");
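
For the scans, the two row_bcast steps are rebuilt from the same pieces
under explicit exec masks. After the row_sr(1)/(2)/(4)/(8) combines, each
lane holds the inclusive prefix of its own 16-lane row; v_permlanex16_b32
with an all-ones select (lane 15 of the opposite half, FI set) under
exec = 0xffff0000ffff0000 folds the previous row's total into rows 1 and 3,
and the readlane(31) step under exec = 0xffffffff00000000 folds the prefix
of the first 32 lanes into the upper half. A host-side model with addition
as the operator:

    #include <cassert>
    #include <cstdint>

    int main()
    {
       uint32_t tmp[64];
       /* state after the per-row shifts: inclusive prefix of the
        * inputs in[i] = i + 1 within each 16-lane row */
       for (unsigned i = 0; i < 64; i++) {
          tmp[i] = 0;
          for (unsigned j = i & ~15u; j <= i; j++)
             tmp[i] += j + 1;
       }

       /* exec = 0xffff0000ffff0000 (rows 1 and 3);
        * v_permlanex16_b32, select all 0xf, FI: lane 15 of the
        * inactive half, i.e. the previous row's total; then op */
       for (unsigned i = 0; i < 64; i++)
          if ((i / 16) & 1)
             tmp[i] += tmp[(i & ~31u) + 15];

       /* exec = 0xffffffff00000000; v_readlane_b32 lane 31 -> sitmp,
        * then emit_op: tmp = op(sitmp, tmp) on the upper half */
       uint32_t sitmp = tmp[31];
       for (unsigned i = 32; i < 64; i++)
          tmp[i] = sitmp + tmp[i];

       for (unsigned i = 0; i < 64; i++)   /* wave-wide inclusive scan */
          assert(tmp[i] == (i + 1) * (i + 2) / 2);
       return 0;
    }
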
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 663a43c539a..66a3ec64c04 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -115,10 +115,13 @@ void setup_reduce_temp(Program* program)
          }
 
          /* same as before, except for the vector temporary instead of the reduce temporary */
+         unsigned cluster_size = static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size;
          bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
                           op == fmin64 || op == fmax64;
+         if (program->chip_class >= GFX10 && cluster_size == 64)
+            need_vtmp = true;
 
-         need_vtmp |= static_cast<Pseudo_reduction_instruction *>(instr)->cluster_size == 32;
+         need_vtmp |= cluster_size == 32;
          vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
          if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
             vtmp = {program->allocateId(), vtmp.regClass()};
@@ -144,12 +147,14 @@ void setup_reduce_temp(Program* program)
          instr->definitions[1] = bld.def(s2);
 
          /* scalar identity temporary */
-         if (instr->opcode == aco_opcode::p_exclusive_scan &&
-             (op == imin32 || op == imin64 ||
-              op == imax32 || op == imax64 ||
-              op == fmin32 || op == fmin64 ||
-              op == fmax32 || op == fmax64 ||
-              op == fmul64)) {
+         bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64;
+         if (instr->opcode == aco_opcode::p_exclusive_scan) {
+            need_sitmp |=
+               (op == imin32 || op == imin64 || op == imax32 || op == imax64 ||
+                op == fmin32 || op == fmin64 || op == fmax32 || op == fmax64 ||
+                op == fmul64);
+         }
+         if (need_sitmp) {
             instr->definitions[2] = bld.def(RegClass(RegType::sgpr, instr->operands[0].size()));
          }
 
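Distilling the allocation logic above into one place (a restatement only;
the ReduceOp values and aco_opcode come from aco_ir.h, and gfx10plus stands
in for the program->chip_class >= GFX10 check):

    /* sketch: when the reduction lowering needs the extra temporaries */
    bool need_vtmp(ReduceOp op, unsigned cluster_size, bool gfx10plus)
    {
       bool need = op == imul32 || op == fadd64 || op == fmul64 ||
                   op == fmin64 || op == fmax64;
       need |= cluster_size == 32;
       /* new: the GFX10 permlanex16 sequences go through a vector temp */
       need |= gfx10plus && cluster_size == 64;
       return need;
    }

    bool need_sitmp(aco_opcode opcode, ReduceOp op, unsigned cluster_size,
                    bool gfx10plus)
    {
       /* new: GFX10 keeps the v_readlane_b32 results in sitmp */
       bool need = gfx10plus && cluster_size == 64;
       if (opcode == aco_opcode::p_exclusive_scan)
          need |= op == imin32 || op == imin64 || op == imax32 ||
                  op == imax64 || op == fmin32 || op == fmin64 ||
                  op == fmax32 || op == fmax64 || op == fmul64;
       return need;
    }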



