Mesa (staging/21.1): aco: use v1b/v2b for ds_read_u8/ds_read_u16

Thu Jun 10 11:32:25 UTC 2021

Module: Mesa
Branch: staging/21.1
Commit: d9c4ac6c04622962ecfed89c5e763028772e4c40
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d9c4ac6c04622962ecfed89c5e763028772e4c40

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Mon May 31 18:18:24 2021 +0100

aco: use v1b/v2b for ds_read_u8/ds_read_u16

The p_extract_vector isn't necessary.

For ds_read_u8 and ds_read_u16, we used a 32-bit regclass, but didn't load
32 bits, and used dst_hint for vector loads when we shouldn't have.

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4863
Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11113>
(cherry picked from commit 4870d7d829e57a993976d6da497e1202b1df2fa6)

---

 .pick_status.json                              | 2 +-
 src/amd/compiler/aco_instruction_selection.cpp | 5 +----
 src/amd/compiler/aco_opcodes.py                | 5 +++++
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 3fc144afc27..78e29ffecf8 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -661,7 +661,7 @@
         "description": "aco: use v1b/v2b for ds_read_u8/ds_read_u16",
         "nominated": true,
         "nomination_type": 0,
-        "resolution": 0,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null
     },
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 65a23bbda08..ba5c6ca6a58 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3644,7 +3644,7 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info,
 
    const_offset /= const_offset_unit;
 
-   RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
+   RegClass rc = RegClass::get(RegType::vgpr, size);
    Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
    Instruction *instr;
    if (read2)
@@ -3653,9 +3653,6 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info,
       instr = bld.ds(op, Definition(val), offset, m, const_offset);
    instr->ds().sync = info.sync;
 
-   if (size < 4)
-      val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
-
    return val;
 }
 
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index abe852caa36..8d8960ba2dc 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -1687,3 +1687,8 @@ for ver in ['gfx9', 'gfx10']:
             sys.exit(1)
         else:
             op_to_name[key] = op.name
+
+# These instructions write the entire 32-bit VGPR, but it's not clear in Opcode's constructor that
+# it should be 32, since it works accidentally.
+assert(opcodes['ds_read_u8'].definition_size == 32)
+assert(opcodes['ds_read_u16'].definition_size == 32)