Mesa (staging/20.0): intel/compiler: fix cmod propagation optimisations

Wed Jun 3 19:22:00 UTC 2020

Module: Mesa
Branch: staging/20.0
Commit: 8c35c65f669706dcc5debad0a3b08f26163933c4
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8c35c65f669706dcc5debad0a3b08f26163933c4

Author: Yevhenii Kolesnikov <yevhenii.kolesnikov at globallogic.com>
Date:   Fri Jan  3 16:37:00 2020 +0200

intel/compiler: fix cmod propagation optimisations

Knowing following:
 - CMP writes to flag register the result of
   applying cmod to the `src0 - src1`.
   After that it stores the same value to dst.
   Other instructions first store their result to
   dst, and then store cmod(dst) to the flag
   register.
 - inst is either CMP or MOV
 - inst->dst is null
 - inst->src[0] overlaps with scan_inst->dst
 - inst->src[1] is zero
 - scan_inst wrote to a flag register

There can be three possible paths:

 - scan_inst is CMP:

   Considering that src0 is either 0x0 (false),
   or 0xffffffff (true), and src1 is 0x0:

   - If inst's cmod is NZ, we can always remove
     scan_inst: NZ is invariant for false and true. This
     holds even if src0 is NaN: .nz is the only cmod,
     that returns true for NaN.

   - .g is invariant if src0 has a UD type

   - .l is invariant if src0 has a D type

 - scan_inst and inst have the same cmod:

   If scan_inst is anything than CMP, it already
   wrote the appropriate value to the flag register.

 - else:

   We can change cmod of scan_inst to that of inst,
   and remove inst. It is valid as long as we make
   sure that no instruction uses the flag register
   between scan_inst and inst.

Nine new cmod_propagation unit tests:
 - cmp_cmpnz
 - cmp_cmpg
 - plnnz_cmpnz
 - plnnz_cmpz (*)
 - plnnz_sel_cmpz
 - cmp_cmpg_D
 - cmp_cmpg_UD (*)
 - cmp_cmpl_D (*)
 - cmp_cmpl_UD

(*) this would fail without changes to brw_fs_cmod_propagation.

This fixes optimisation that used to be illegal (see issue #2154)

= Before =
 0: linterp.z.f0.0(8) vgrf0:F, g2:F, attr0<0>:F
 1: cmp.nz.f0.0(8) null:F, vgrf0:F, 0f
= After =
 0: linterp.z.f0.0(8) vgrf0:F, g2:F, attr0<0>:F

Now it is optimised as such (note change of cmod in line 0):

= Before =
 0: linterp.z.f0.0(8) vgrf0:F, g2:F, attr0<0>:F
 1: cmp.nz.f0.0(8) null:F, vgrf0:F, 0f
= After =
 0: linterp.nz.f0.0(8) vgrf0:F, g2:F, attr0<0>:F

No shaderdb changes

Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2154

Signed-off-by: Yevhenii Kolesnikov <yevhenii.kolesnikov at globallogic.com>
Reviewed-by: Matt Turner <mattst88 at gmail.com>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3348>
(cherry picked from commit 32b7ba66b0156d9fd40b059f20da79a74451f7fd)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5149>

---

 src/intel/compiler/brw_fs_cmod_propagation.cpp  |  70 +++++-
 src/intel/compiler/test_fs_cmod_propagation.cpp | 275 ++++++++++++++++++++++++
 2 files changed, 336 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 45ea9206014..8f407563d74 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -326,17 +326,69 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
                }
             }
 
-            /* If the instruction generating inst's source also wrote the
-             * flag, and inst is doing a simple .nz comparison, then inst
-             * is redundant - the appropriate value is already in the flag
-             * register.  Delete inst.
+            /* Knowing following:
+             * - CMP writes to flag register the result of
+             *   applying cmod to the `src0 - src1`.
+             *   After that it stores the same value to dst.
+             *   Other instructions first store their result to
+             *   dst, and then store cmod(dst) to the flag
+             *   register.
+             * - inst is either CMP or MOV
+             * - inst->dst is null
+             * - inst->src[0] overlaps with scan_inst->dst
+             * - inst->src[1] is zero
+             * - scan_inst wrote to a flag register
+             *
+             * There can be three possible paths:
+             *
+             * - scan_inst is CMP:
+             *
+             *   Considering that src0 is either 0x0 (false),
+             *   or 0xffffffff (true), and src1 is 0x0:
+             *
+             *   - If inst's cmod is NZ, we can always remove
+             *     scan_inst: NZ is invariant for false and true. This
+             *     holds even if src0 is NaN: .nz is the only cmod,
+             *     that returns true for NaN.
+             *
+             *   - .g is invariant if src0 has a UD type
+             *
+             *   - .l is invariant if src0 has a D type
+             *
+             * - scan_inst and inst have the same cmod:
+             *
+             *   If scan_inst is anything than CMP, it already
+             *   wrote the appropriate value to the flag register.
+             *
+             * - else:
+             *
+             *   We can change cmod of scan_inst to that of inst,
+             *   and remove inst. It is valid as long as we make
+             *   sure that no instruction uses the flag register
+             *   between scan_inst and inst.
              */
-            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
-                !inst->src[0].negate &&
+            if (!inst->src[0].negate &&
                 scan_inst->flags_written()) {
-               inst->remove(block);
-               progress = true;
-               break;
+               if (scan_inst->opcode == BRW_OPCODE_CMP) {
+                  if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
+                      (inst->conditional_mod == BRW_CONDITIONAL_G &&
+                       inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
+                      (inst->conditional_mod == BRW_CONDITIONAL_L &&
+                       inst->src[0].type == BRW_REGISTER_TYPE_D)) {
+                     inst->remove(block);
+                     progress = true;
+                     break;
+                  }
+               } else if (scan_inst->conditional_mod == inst->conditional_mod) {
+                  inst->remove(block);
+                  progress = true;
+                  break;
+               } else if (!read_flag) {
+                  scan_inst->conditional_mod = inst->conditional_mod;
+                  inst->remove(block);
+                  progress = true;
+                  break;
+               }
             }
 
             /* The conditional mod of the CMP/CMPN instructions behaves
diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp
index d05c6897196..bf8a12647c0 100644
--- a/src/intel/compiler/test_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/test_fs_cmod_propagation.cpp
@@ -646,6 +646,281 @@ TEST_F(cmod_propagation_test, andnz_non_one)
    EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
 }
 
+TEST_F(cmod_propagation_test, cmp_cmpnz)
+{
+   const fs_builder &bld = v->bld;
+
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0));
+
+   bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ);
+   bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_NZ);
+
+   /* = Before =
+    * 0: cmp.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f
+    * 1: cmp.nz.f0.0(8) null:F, vgrf0:F, 0f
+    *
+    * = After =
+    * 0: cmp.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_TRUE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_cmpg)
+{
+   const fs_builder &bld = v->bld;
+
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0));
+
+   bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ);
+   bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_G);
+
+   /* = Before =
+    * 0: cmp.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f
+    * 1: cmp.g.f0.0(8) null:F, vgrf0:F, 0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_G, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, plnnz_cmpnz)
+{
+   const fs_builder &bld = v->bld;
+
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0));
+
+   set_condmod(BRW_CONDITIONAL_NZ, bld.PLN(dst0, src0, zero));
+   bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_NZ);
+
+   /* = Before =
+    * 0: pln.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f
+    * 1: cmp.nz.f0.0(8) null:F, vgrf0:F, 0f
+    *
+    * = After =
+    * 0: pln.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_TRUE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_PLN, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, plnnz_cmpz)
+{
+   const fs_builder &bld = v->bld;
+
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0));
+
+   set_condmod(BRW_CONDITIONAL_NZ, bld.PLN(dst0, src0, zero));
+   bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_Z);
+
+   /* = Before =
+    * 0: pln.nz.f0.0(8) vgrf0:F, vgrf1:F, 0f
+    * 1: cmp.z.f0.0(8) null:F, vgrf0:F, 0f
+    *
+    * = After =
+    * 0: pln.z.f0.0(8) vgrf0:F, vgrf1:F, 0f
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_TRUE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_PLN, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_Z, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, plnnz_sel_cmpz)
+{
+   const fs_builder &bld = v->bld;
+
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0));
+
+   set_condmod(BRW_CONDITIONAL_NZ, bld.PLN(dst0, src0, zero));
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst1, src0, zero));
+   bld.CMP(bld.null_reg_f(), dst0, zero, BRW_CONDITIONAL_Z);
+
+   /* = Before =
+    * 0: pln.nz.f0.0(8) vgrf0:F, vgrf2:F, 0f
+    * 1: (+f0.0) sel(8) vgrf1:F, vgrf2:F, 0f
+    * 2: cmp.z.f0.0(8) null:F, vgrf0:F, 0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_PLN, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_Z, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_cmpg_D)
+{
+   const fs_builder &bld = v->bld;
+
+   fs_reg dst0 = v->vgrf(glsl_type::int_type);
+   fs_reg src0 = v->vgrf(glsl_type::int_type);
+   fs_reg zero(brw_imm_d(0));
+   fs_reg one(brw_imm_d(1));
+
+   bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ);
+   bld.CMP(bld.null_reg_d(), dst0, zero, BRW_CONDITIONAL_G);
+
+   /* = Before =
+    * 0: cmp.nz.f0.0(8) vgrf0:D, vgrf1:D, 0d
+    * 1: cmp.g.f0.0(8) null:D, vgrf0:D, 0d
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_G, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_cmpg_UD)
+{
+   const fs_builder &bld = v->bld;
+
+   fs_reg dst0 = v->vgrf(glsl_type::uint_type);
+   fs_reg src0 = v->vgrf(glsl_type::uint_type);
+   fs_reg zero(brw_imm_ud(0));
+
+   bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ);
+   bld.CMP(bld.null_reg_ud(), dst0, zero, BRW_CONDITIONAL_G);
+
+   /* = Before =
+    * 0: cmp.nz.f0.0(8) vgrf0:UD, vgrf1:UD, 0u
+    * 1: cmp.g.f0.0(8) null:UD, vgrf0:UD, 0u
+    *
+    * = After =
+    * 0: cmp.nz.f0.0(8) vgrf0:UD, vgrf1:UD, 0u
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_TRUE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_cmpl_D)
+{
+   const fs_builder &bld = v->bld;
+
+   fs_reg dst0 = v->vgrf(glsl_type::int_type);
+   fs_reg src0 = v->vgrf(glsl_type::int_type);
+   fs_reg zero(brw_imm_d(0));
+
+   bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ);
+   bld.CMP(bld.null_reg_d(), dst0, zero, BRW_CONDITIONAL_L);
+
+   /* = Before =
+    * 0: cmp.nz.f0.0(8) vgrf0:D, vgrf1:D, 0d
+    * 1: cmp.l.f0.0(8) null:D, vgrf0:D, 0d
+    *
+    * = After =
+    * 0: cmp.nz.f0.0(8) vgrf0:D, vgrf1:D, 0d
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_TRUE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_cmpl_UD)
+{
+   const fs_builder &bld = v->bld;
+
+   fs_reg dst0 = v->vgrf(glsl_type::uint_type);
+   fs_reg src0 = v->vgrf(glsl_type::uint_type);
+   fs_reg zero(brw_imm_ud(0));
+
+   bld.CMP(dst0, src0, zero, BRW_CONDITIONAL_NZ);
+   bld.CMP(bld.null_reg_ud(), dst0, zero, BRW_CONDITIONAL_L);
+
+   /* = Before =
+    * 0: cmp.nz.f0.0(8) vgrf0:UD, vgrf1:UD, 0u
+    * 1: cmp.l.f0.0(8) null:UD, vgrf0:UD, 0u
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 1)->conditional_mod);
+}
+
 TEST_F(cmod_propagation_test, andz_one)
 {
    const fs_builder &bld = v->bld;