Mesa (staging/19.3): intel/compiler: Move Gen4/5 rounding to visitor

Thu Jan 23 22:40:48 UTC 2020

Module: Mesa
Branch: staging/19.3
Commit: a2db01129edf32a2c5f884d3b1a3f6d3d2414a3e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=a2db01129edf32a2c5f884d3b1a3f6d3d2414a3e

Author: Matt Turner <mattst88 at gmail.com>
Date:   Thu Jan 16 11:17:14 2020 -0800

intel/compiler: Move Gen4/5 rounding to visitor

Gen4/5's rounding instructions operate differently than later Gens'.
They all return the floor of the input and the "Round-increment"
conditional modifier answers whether the result should be incremented by
1.0 to get the appropriate result for the operation (and thus its
behavior is determined by the round opcode; e.g., RNDZ vs RNDE).

Since this requires a second instruciton (a predicated ADD) that
consumes the result of the round instruction, the round instruction
cannot write its result directly to the (write-only) message registers.
By emitting the ADD in the generator, the backend thinks it's safe to
store the round's result directly to the message register file.

To avoid this, we move the emission of the ADD instruction to the NIR
translator so that the backend has the information it needs.

I suspect this also fixes code generated for RNDZ.SAT but since
Gen4/5 don't support GLSL 1.30 which adds the trunc() function, I
couldn't write a piglit test to confirm. My thinking is that if x=-0.5:

      sat(trunc(-0.5)) = 0.0

But on Gen4/5 where sat(trunc(x)) is implemented as

      rndz.r.f0  result, x             // result = floor(x)
                                       // set f0 if increment needed
      (+f0) add  result, result, 1.0   // fixup so result = trunc(x)

then putting saturate on both instructions will give the wrong result.

      floor(-0.5) = -1.0
      sat(floor(-0.5)) = 0.0
      // +1 increment would be needed since floor(-0.5) != trunc(-0.5)
      sat(sat(floor(-0.5)) + 1.0) = 1.0

Fixes: 6f394343b1f ("nir/algebraic: i2f(f2i()) -> trunc()")
Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2355
Reviewed-by: Ian Romanick <ian.d.romanick at intel.com>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3459>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3459>
(cherry picked from commit 88a0523bd2d3c635c41f0d0f6844bd8cf43933cc)

---

 .pick_status.json                   |  2 +-
 src/intel/compiler/brw_eu.h         |  9 ++-------
 src/intel/compiler/brw_eu_emit.c    | 32 ++------------------------------
 src/intel/compiler/brw_fs_nir.cpp   | 12 ++++++++++++
 src/intel/compiler/brw_vec4_nir.cpp | 12 ++++++++++++
 5 files changed, 29 insertions(+), 38 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 9cff666164e..370d0a8707c 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -706,7 +706,7 @@
         "description": "intel/compiler: Move Gen4/5 rounding to visitor",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "master_sha": null,
         "because_sha": "6f394343b1f704f8b98a24add7f4106e72e2db7b"
     },
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index c35725bfe2b..4ba8b8042d3 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -199,9 +199,6 @@ brw_inst *brw_##OP(struct brw_codegen *p,	\
 	      struct brw_reg src1,		\
 	      struct brw_reg src2);
 
-#define ROUND(OP) \
-void brw_##OP(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0);
-
 ALU1(MOV)
 ALU2(SEL)
 ALU1(NOT)
@@ -222,6 +219,8 @@ ALU2(AVG)
 ALU2(MUL)
 ALU1(FRC)
 ALU1(RNDD)
+ALU1(RNDE)
+ALU1(RNDZ)
 ALU2(MAC)
 ALU2(MACH)
 ALU1(LZD)
@@ -244,13 +243,9 @@ ALU2(ADDC)
 ALU2(SUBB)
 ALU2(MAC)
 
-ROUND(RNDZ)
-ROUND(RNDE)
-
 #undef ALU1
 #undef ALU2
 #undef ALU3
-#undef ROUND
 
 
 /* Helpers for SEND instruction:
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index ecd3c34470c..6061728dca5 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1019,33 +1019,6 @@ brw_inst *brw_##OP(struct brw_codegen *p,         \
    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
 }
 
-/* Rounding operations (other than RNDD) require two instructions - the first
- * stores a rounded value (possibly the wrong way) in the dest register, but
- * also sets a per-channel "increment bit" in the flag register.  A predicated
- * add of 1.0 fixes dest to contain the desired result.
- *
- * Sandybridge and later appear to round correctly without an ADD.
- */
-#define ROUND(OP)							      \
-void brw_##OP(struct brw_codegen *p,					      \
-	      struct brw_reg dest,					      \
-	      struct brw_reg src)					      \
-{									      \
-   const struct gen_device_info *devinfo = p->devinfo;					      \
-   brw_inst *rnd, *add;							      \
-   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
-   brw_set_dest(p, rnd, dest);						      \
-   brw_set_src0(p, rnd, src);						      \
-									      \
-   if (devinfo->gen < 6) {							      \
-      /* turn on round-increments */					      \
-      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
-      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
-      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
-   }									      \
-}
-
-
 ALU2(SEL)
 ALU1(NOT)
 ALU2(AND)
@@ -1060,6 +1033,8 @@ ALU2(ROR)
 ALU3(CSEL)
 ALU1(FRC)
 ALU1(RNDD)
+ALU1(RNDE)
+ALU1(RNDZ)
 ALU2(MAC)
 ALU2(MACH)
 ALU1(LZD)
@@ -1079,9 +1054,6 @@ ALU1(CBIT)
 ALU2(ADDC)
 ALU2(SUBB)
 
-ROUND(RNDZ)
-ROUND(RNDE)
-
 brw_inst *
 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
 {
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 2d6b248d5fe..eed863170fe 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1592,6 +1592,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
 
    case nir_op_ftrunc:
       inst = bld.RNDZ(result, op[0]);
+      if (devinfo->gen < 6) {
+         set_condmod(BRW_CONDITIONAL_R, inst);
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.ADD(result, result, brw_imm_f(1.0f)));
+         inst = bld.MOV(result, result); /* for potential saturation */
+      }
       inst->saturate = instr->dest.saturate;
       break;
 
@@ -1614,6 +1620,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
       break;
    case nir_op_fround_even:
       inst = bld.RNDE(result, op[0]);
+      if (devinfo->gen < 6) {
+         set_condmod(BRW_CONDITIONAL_R, inst);
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.ADD(result, result, brw_imm_f(1.0f)));
+         inst = bld.MOV(result, result); /* for potential saturation */
+      }
       inst->saturate = instr->dest.saturate;
       break;
 
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
index 03fda5e9aaa..d32f65a9dbe 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -1379,6 +1379,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 
    case nir_op_ftrunc:
       inst = emit(RNDZ(dst, op[0]));
+      if (devinfo->gen < 6) {
+         inst->conditional_mod = BRW_CONDITIONAL_R;
+         inst = emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f)));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */
+      }
       inst->saturate = instr->dest.saturate;
       break;
 
@@ -1409,6 +1415,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 
    case nir_op_fround_even:
       inst = emit(RNDE(dst, op[0]));
+      if (devinfo->gen < 6) {
+         inst->conditional_mod = BRW_CONDITIONAL_R;
+         inst = emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f)));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */
+      }
       inst->saturate = instr->dest.saturate;
       break;