Mesa (staging/21.0): aco: create helpers to emit vop3p instructions

Wed Jan 13 19:31:07 UTC 2021

Module: Mesa
Branch: staging/21.0
Commit: 01a7662154fb36337382f0df75019d7034299ba3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=01a7662154fb36337382f0df75019d7034299ba3

Author: Daniel Schürmann <daniel at schuermann.dev>
Date:   Mon Aug 24 19:43:26 2020 +0100

aco: create helpers to emit vop3p instructions

Also make get_alu_src() capable of returning
unswizzled multi-component SGPR sources.

Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6680>
(cherry picked from commit 5ad52ac90630e344650cf9a1b48820432af22680)

---

 .pick_status.json                              |  2 +-
 src/amd/compiler/aco_builder_h.py              |  1 +
 src/amd/compiler/aco_instruction_selection.cpp | 68 +++++++++++++++++++++-----
 src/amd/compiler/aco_opcodes.py                |  3 ++
 4 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 30efc7a7ec7..a9c333ce6f3 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -355,7 +355,7 @@
         "description": "aco: create helpers to emit vop3p instructions",
         "nominated": false,
         "nomination_type": null,
-        "resolution": 4,
+        "resolution": 1,
         "master_sha": null,
         "because_sha": null
     },
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index a08eafd1c26..af0eacdb7ac 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -535,6 +535,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
            ("vop2_sdwa", [Format.VOP2, Format.SDWA], 'SDWA_instruction', itertools.product([1, 2], [2, 3])),
            ("vopc", [Format.VOPC], 'VOPC_instruction', itertools.product([1, 2], [2])),
            ("vop3", [Format.VOP3A], 'VOP3A_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]),
+           ("vop3p", [Format.VOP3P], 'VOP3P_instruction', [(1, 2), (1, 3)]),
            ("vintrp", [Format.VINTRP], 'Interp_instruction', [(1, 2), (1, 3)]),
            ("vop1_dpp", [Format.VOP1, Format.DPP], 'DPP_instruction', [(1, 1)]),
            ("vop2_dpp", [Format.VOP2, Format.DPP], 'DPP_instruction', itertools.product([1, 2], [2, 3])),
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 9c6bbe00003..0dad702e419 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -657,21 +657,20 @@ Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src
 
 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
 {
-   if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
+   if (src.src.ssa->num_components == 1 && size == 1)
       return get_ssa_temp(ctx, src.src.ssa);
 
-   if (src.src.ssa->num_components == size) {
-      bool identity_swizzle = true;
-      for (unsigned i = 0; identity_swizzle && i < size; i++) {
-         if (src.swizzle[i] != i)
-            identity_swizzle = false;
-      }
-      if (identity_swizzle)
-         return get_ssa_temp(ctx, src.src.ssa);
-   }
-
    Temp vec = get_ssa_temp(ctx, src.src.ssa);
    unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
+   bool identity_swizzle = true;
+
+   for (unsigned i = 0; identity_swizzle && i < size; i++) {
+      if (src.swizzle[i] != i)
+         identity_swizzle = false;
+   }
+   if (identity_swizzle)
+      return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
+
    assert(elem_size > 0);
    assert(vec.bytes() % elem_size == 0);
 
@@ -701,6 +700,33 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
    }
 }
 
+Temp get_alu_src_vop3p(struct isel_context *ctx, nir_alu_src src)
+{
+   /* returns v2b or v1 for vop3p usage.
+    * The source expects exactly 2 16bit components
+    * which are within the same dword
+    */
+   assert(src.src.ssa->bit_size == 16);
+   assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
+
+   Temp tmp = get_ssa_temp(ctx, src.src.ssa);
+   if (tmp.size() == 1)
+      return tmp;
+
+   /* the size is larger than 1 dword: check the swizzle */
+   unsigned dword = src.swizzle[0] >> 1;
+
+   /* extract a full dword if possible */
+   if (tmp.bytes() >= (dword + 1) * 4) {
+      return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1));
+   } else {
+      /* This must be a swizzled access to %a.zz where %a is v6b */
+      assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
+      assert(tmp.regClass() == v6b && dword == 1);
+      return emit_extract_vector(ctx, tmp, dword * 2, v2b);
+   }
+}
+
 uint32_t get_alu_src_ub(isel_context *ctx, nir_alu_instr *instr, int src_idx)
 {
    nir_ssa_scalar scalar = nir_ssa_scalar{instr->src[src_idx].src.ssa,
@@ -848,6 +874,26 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode
    }
 }
 
+Builder::Result emit_vop3p_instruction(isel_context *ctx, nir_alu_instr *instr,
+                                       aco_opcode op, Temp dst, bool swap_srcs=false)
+{
+   Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
+   Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
+   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
+      src1 = as_vgpr(ctx, src1);
+   assert(instr->dest.dest.ssa.num_components == 2);
+
+   /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
+   unsigned opsel_lo = (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
+   unsigned opsel_hi = (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
+
+   Builder bld(ctx->program, ctx->block);
+   bld.is_precise = instr->exact;
+   Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
+   emit_split_vector(ctx, dst, 2);
+   return res;
+}
+
 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 {
    Builder bld(ctx->program, ctx->block);
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 82b47e8e1eb..2a8bc8c4cdc 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -134,6 +134,9 @@ class Format(Enum):
                  ('uint8_t', 'row_mask', '0xF'),
                  ('uint8_t', 'bank_mask', '0xF'),
                  ('bool', 'bound_ctrl', 'true')]
+      elif self == Format.VOP3P:
+         return [('uint8_t', 'opsel_lo', None),
+                 ('uint8_t', 'opsel_hi', None)]
       elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
          return [('uint16_t', 'offset', 0),
                  ('memory_sync_info', 'sync', 'memory_sync_info()'),