Mesa (main): aco: Omit p_extract after ds_read with matching bit size.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Sep 28 18:24:25 UTC 2021


Module: Mesa
Branch: main
Commit: d3e0cf3d323347a5bb96e70dbcc19ccae06c5bf8
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d3e0cf3d323347a5bb96e70dbcc19ccae06c5bf8

Author: Timur Kristóf <timur.kristof at gmail.com>
Date:   Fri Aug 27 15:45:59 2021 +0200

aco: Omit p_extract after ds_read with matching bit size.

Fossil DB stats on Sienna Cichlid:

Totals from 135 (0.10% of 128647) affected shaders:
CodeSize: 525184 -> 523704 (-0.28%)
Instrs: 92835 -> 92684 (-0.16%)
Latency: 311528 -> 311055 (-0.15%)
InvThroughput: 86572 -> 86455 (-0.14%)
Copies: 7666 -> 7650 (-0.21%)

Fossil DB stats on Sienna Cichlid with NGGC on:

Totals from 58374 (45.38% of 128647) affected shaders:
CodeSize: 160322912 -> 159622564 (-0.44%)
Instrs: 30755822 -> 30639193 (-0.38%)
Latency: 136713768 -> 136690360 (-0.02%)
InvThroughput: 21739219 -> 21658151 (-0.37%)
Copies: 3297969 -> 3297953 (-0.00%)

Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11560>

---

 src/amd/compiler/aco_optimizer.cpp | 61 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 7b22d76b64a..5f528179fd9 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1688,6 +1688,13 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       }
       break;
    }
+   case aco_opcode::ds_read_u8:
+   case aco_opcode::ds_read_u8_d16:
+   case aco_opcode::ds_read_u16:
+   case aco_opcode::ds_read_u16_d16: {
+      ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
+      break;
+   }
    default: break;
    }
 
@@ -2893,6 +2900,57 @@ apply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    return true;
 }
 
+/* Remove a superfluous zero-extending p_extract of a 32-bit result after an 8/16-bit ds_read:
+ * p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN(). Returns true when the rewrite was applied.
+ */
+bool
+apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
+{
+   /* Bail out unless operand 0 is labelled usedef and has at most one use (this p_extract). */
+   if (!ctx.info[extract->operands[0].tempId()].is_usedef() ||
+       ctx.uses[extract->operands[0].tempId()] > 1)
+      return false;
+
+   /* The usedef label carries the defining instruction; it must be a DS instruction. */
+   Instruction* ds = ctx.info[extract->operands[0].tempId()].instr;
+   if (ds->format != Format::DS)
+      return false;
+
+   unsigned extract_idx = extract->operands[1].constantValue();
+   unsigned bits_extracted = extract->operands[2].constantValue();
+   unsigned sign_ext = extract->operands[3].constantValue();
+   unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
+
+   /* TODO: These are doable, but probably don't occur too often. */
+   if (extract_idx || sign_ext || dst_bitsize != 32)
+      return false;
+
+   unsigned bits_loaded = 0;
+   if (ds->opcode == aco_opcode::ds_read_u8 || ds->opcode == aco_opcode::ds_read_u8_d16)
+      bits_loaded = 8;
+   else if (ds->opcode == aco_opcode::ds_read_u16 || ds->opcode == aco_opcode::ds_read_u16_d16)
+      bits_loaded = 16;
+   else
+      return false;
+
+   /* Shrink the DS load if the extracted bit size is smaller than what is currently loaded. */
+   bits_loaded = MIN2(bits_loaded, bits_extracted);
+
+   /* Pick the plain (non-_d16) opcode of that size so the DS writes the full register. */
+   if (bits_loaded == 8)
+      ds->opcode = aco_opcode::ds_read_u8;
+   else if (bits_loaded == 16)
+      ds->opcode = aco_opcode::ds_read_u16;
+   else
+      unreachable("Forgot to add DS opcode above.");
+
+   /* The DS now produces the exact same thing as the extract, so it takes over its definition. */
+   std::swap(ds->definitions[0], extract->definitions[0]);
+   ctx.uses[extract->definitions[0].tempId()] = 0; /* the old DS result has no users left */
+   ctx.info[ds->definitions[0].tempId()].label = 0; /* drop labels on the DS's new definition */
+   return true;
+}
+
 /* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
 bool
 combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
@@ -3217,6 +3275,9 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    if (instr->isSDWA() || instr->isDPP())
       return;
 
+   if (instr->opcode == aco_opcode::p_extract)
+      apply_ds_extract(ctx, instr);
+
    /* TODO: There are still some peephole optimizations that could be done:
     * - abs(a - b) -> s_absdiff_i32
     * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32



More information about the mesa-commit mailing list