[Mesa-dev] [PATCH 19/22] i965/vec4: fix SIMD-with lowering for CMP/MOV instructions with conditional modifiers

Thu Jan 5 13:07:39 UTC 2017

From: "Juan A. Suarez Romero" <jasuarez at igalia.com>

When splitting a CMP/MOV instruction with NULL dest, DF sources, and
conditional modifier; we can't use directly the flag registers, as they will
have the wrong results in IVB/VLV after the scalarization.

Rather, we need to store the result in a temporary register, and then use
that register to set proper the flags values.

If a MOV has a null destination register and a conditional modifier, it
can be replaced with a CMP against zero with the same conditional
modifier. By doing this replacement, we can do the SIMD lowering
without any problem.

Signed-off-by: Samuel Iglesias Gonsálvez <siglesias at igalia.com>
Signed-off-by: Juan A. Suarez Romero <jasuarez at igalia.com
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 80 +++++++++++++++++++++++++++++++---
 1 file changed, 74 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index c654c8b..74ab866 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -2190,6 +2190,46 @@ vec4_visitor::lower_simd_width()
        * value of the instruction's dst.
        */
       bool needs_temp = dst_src_regions_overlap(inst);
+
+      /* When splitting instructions with conditional modifiers and NULL
+       * dest we can't rely directly on the flags to store the result. Rather,
+       * we need first to enqueue the result in a temporary register, and then
+       * move those values into flags.
+       */
+      bool inst_df_dst_null =
+         inst->dst.is_null() && inst->exec_data_size() == 8 &&
+         inst->conditional_mod != BRW_CONDITIONAL_NONE;
+
+      if (inst_df_dst_null) {
+         /* If there are other DF instructions with NULL destination,
+          * we need to verify if we can use the temporary register or
+          * if we need an extra lowering step.
+          */
+         assert(inst->opcode == BRW_OPCODE_MOV ||
+                inst->opcode == BRW_OPCODE_CMP);
+
+         /* Replace MOV.XX with null destination with the equivalente CMP.XX
+          * with null destination, so we can lower it as explained before.
+          */
+         if (inst->opcode == BRW_OPCODE_MOV) {
+            vec4_instruction *cmp =
+               new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst_null_df(),
+                                             inst->src[0],
+                                             setup_imm_df(0.0, block, inst));
+            cmp->conditional_mod = inst->conditional_mod;
+            cmp->exec_size = inst->exec_size;
+            cmp->group = inst->group;
+            cmp->size_written = inst->size_written;
+            inst->insert_before(block, cmp);
+            inst->remove(block);
+            inst = cmp;
+         }
+      }
+      dst_reg inst_dst;
+      if (inst_df_dst_null)
+         inst_dst =
+            retype(dst_reg(VGRF, alloc.allocate(1)), BRW_REGISTER_TYPE_F);
+
       for (unsigned n = 0; n < inst->exec_size / lowered_width; n++)  {
          unsigned channel_offset = lowered_width * n;
 
@@ -2212,7 +2252,7 @@ vec4_visitor::lower_simd_width()
          bool d2f_pass = (inst->opcode == VEC4_OPCODE_FROM_DOUBLE && n > 0);
          /* Compute split dst region */
          dst_reg dst;
-         if (needs_temp || d2f_pass) {
+         if (needs_temp || d2f_pass || inst_df_dst_null) {
             unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
             dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
                          inst->dst.type);
@@ -2242,24 +2282,52 @@ vec4_visitor::lower_simd_width()
 
          inst->insert_before(block, linst);
 
+         dst_reg d2f_dst;
+         if (inst_df_dst_null) {
+            unsigned num_regs = DIV_ROUND_UP(lowered_width, type_sz(BRW_REGISTER_TYPE_F));
+            d2f_dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)), BRW_REGISTER_TYPE_F);
+            vec4_instruction *d2f = new(mem_ctx) vec4_instruction(VEC4_OPCODE_FROM_DOUBLE, d2f_dst, src_reg(dst));
+            d2f->group = channel_offset;
+            d2f->exec_size = lowered_width;
+            d2f->size_written = lowered_width * type_sz(d2f_dst.type);
+            d2f->predicate = inst->predicate;
+            inst->insert_before(block, d2f);
+         }
+
          /* If we used a temporary to store the result of the split
           * instruction, copy the result to the original destination
           */
-         if (needs_temp || d2f_pass) {
+         if (needs_temp || d2f_pass || inst_df_dst_null) {
             vec4_instruction *mov;
-            if (d2f_pass)
+            if (d2f_pass) {
                mov = MOV(horiz_offset(inst->dst, n * type_sz(inst->dst.type)), src_reg(dst));
-            else
+               mov->size_written = size_written;
+            } else if (inst_df_dst_null) {
+               mov = MOV(horiz_offset(inst_dst, n * 4), src_reg(d2f_dst));
+               mov->size_written = lowered_width * type_sz(inst_dst.type);
+            } else {
                mov = MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
+               mov->size_written = size_written;
+            }
             mov->exec_size = lowered_width;
             mov->group = channel_offset;
-            mov->size_written = size_written;
             mov->predicate = inst->predicate;
             inst->insert_before(block, mov);
          }
       }
 
-      inst->remove(block);
+      /* For CMP.XX instruction, we need to set the flags correctly. We do
+       * this by comparing the register that has the results against 0.0,
+       * getting the values in the flags.
+       */
+      if (inst_df_dst_null) {
+         inst->dst.type = BRW_REGISTER_TYPE_F;
+         inst->src[0] = src_reg(inst_dst);
+         inst->src[1] = brw_imm_f(0.0f);
+         inst->conditional_mod = BRW_CONDITIONAL_NZ;
+      } else {
+         inst->remove(block);
+      }
       progress = true;
    }
 
-- 
2.9.3