[Mesa-dev] [PATCH v3 04/44] nir: add support for flushing to zero denorm constants

Samuel Iglesias Gonsálvez siglesias at igalia.com
Wed Feb 6 10:44:33 UTC 2019


v2:
- Refactor conditions and shared function (Connor)
- Move code to nir_eval_const_opcode() (Connor)
- Don't flush to zero on fquantize2f16
  From the Vulkan spec, VK_KHR_shader_float_controls section:

  "3) Do denorm and rounding mode controls apply to OpSpecConstantOp?

  RESOLVED: Yes, except when the opcode is OpQuantizeToF16."

v3:
- Fix bit size (Connor)
- Fix execution mode in nir_loop_analyze (Connor)

Signed-off-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/compiler/nir/nir_constant_expressions.h  |  3 +-
 src/compiler/nir/nir_constant_expressions.py | 71 ++++++++++++++++++--
 src/compiler/nir/nir_loop_analyze.c          | 22 +++---
 src/compiler/nir/nir_opt_constant_folding.c  | 15 +++--
 src/compiler/spirv/spirv_to_nir.c            |  3 +-
 5 files changed, 90 insertions(+), 24 deletions(-)
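
A note for reviewers skimming the generated code: the new
constant_denorm_flush_to_zero() helper works purely on IEEE-754 bit
patterns. The standalone sketch below is illustrative only and is not
part of the patch (the flush_denorm_f32() name and the test program are
made up for this example); it shows the same two checks the helper does
for the 32-bit case: anything below the smallest normal, 0x00800000,
flushes to +0, and a value with the sign bit set and a zero exponent
field flushes to -0.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same 32-bit tests as constant_denorm_flush_to_zero(), on the raw bits. */
static uint32_t
flush_denorm_f32(uint32_t bits)
{
   /* Positive denorms (and +0) sit below the smallest normal, 0x00800000. */
   if (bits < 0x00800000)
      bits = 0;

   /* Negative denorms: sign bit set, exponent field all zeroes. */
   if ((bits & 0x80000000) && !(bits & 0x7f800000))
      bits = 0x80000000;

   return bits;
}

int
main(void)
{
   float denorm = -1e-41f;   /* a negative single-precision denormal */
   uint32_t bits;

   memcpy(&bits, &denorm, sizeof(bits));
   bits = flush_denorm_f32(bits);

   float flushed;
   memcpy(&flushed, &bits, sizeof(flushed));
   printf("%g -> %g\n", denorm, flushed);   /* the result prints as -0 */

   return 0;
}

The 16- and 64-bit cases in the helper differ only in the exponent
masks (0x7c00 and 0x7ff0000000000000), and fquantize2f16 is deliberately
skipped in the template per the spec quote above.
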

diff --git a/src/compiler/nir/nir_constant_expressions.h b/src/compiler/nir/nir_constant_expressions.h
index 1d6bbbc25d3..a2d416abc45 100644
--- a/src/compiler/nir/nir_constant_expressions.h
+++ b/src/compiler/nir/nir_constant_expressions.h
@@ -31,6 +31,7 @@
 #include "nir.h"
 
 nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components,
-                                      unsigned bit_size, nir_const_value *src);
+                                      unsigned bit_size, nir_const_value *src,
+                                      unsigned float_controls_execution_mode);
 
 #endif /* NIR_CONSTANT_EXPRESSIONS_H */
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
index 505cdd8baae..e79590f8359 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -66,6 +66,37 @@ template = """\
 #include "util/bigmath.h"
 #include "nir_constant_expressions.h"
 
+/**
+ * Checks if the provided value is a denorm and flushes it to zero.
+ */
+static nir_const_value
+constant_denorm_flush_to_zero(nir_const_value value, unsigned index, unsigned bit_size)
+{
+   switch(bit_size) {
+   case 64:
+      if (value.u64[index] < 0x0010000000000000)
+         value.u64[index] = 0;
+      if (value.u64[index] & 0x8000000000000000 &&
+          !(value.u64[index] & 0x7ff0000000000000))
+         value.u64[index] = 0x8000000000000000;
+      break;
+   case 32:
+      if (value.u32[index] < 0x00800000)
+         value.u32[index] = 0;
+      if (value.u32[index] & 0x80000000 &&
+          !(value.u32[index] & 0x7f800000))
+         value.u32[index] = 0x80000000;
+      break;
+   case 16:
+      if (value.u16[index] < 0x0400)
+         value.u16[index] = 0;
+      if (value.u16[index] & 0x8000 &&
+          !(value.u16[index] & 0x7c00))
+         value.u16[index] = 0x8000;
+   }
+   return value;
+}
+
 /**
  * Evaluate one component of packSnorm4x8.
  */
@@ -260,7 +291,7 @@ struct ${type}${width}_vec {
 % endfor
 % endfor
 
-<%def name="evaluate_op(op, bit_size)">
+<%def name="evaluate_op(op, bit_size, execution_mode)">
    <%
    output_type = type_add_size(op.output_type, bit_size)
    input_types = [type_add_size(type_, bit_size) for type_ in op.input_types]
@@ -343,6 +374,18 @@ struct ${type}${width}_vec {
          % else:
             _dst_val.${get_const_field(output_type)}[_i] = dst;
          % endif
+
+         % if op.name != "fquantize2f16" and type_base_type(output_type) == "float":
+            % if type_has_size(output_type):
+               if (execution_mode & SHADER_DENORM_FLUSH_TO_ZERO_FP${type_size(output_type)}) {
+                  _dst_val = constant_denorm_flush_to_zero(_dst_val, _i, ${type_size(output_type)});
+               }
+            % else:
+               if (execution_mode & SHADER_DENORM_FLUSH_TO_ZERO_FP${bit_size}) {
+                  _dst_val = constant_denorm_flush_to_zero(_dst_val, _i, bit_size);
+               }
+            % endif
+         % endif
       }
    % else:
       ## In the non-per-component case, create a struct dst with
@@ -375,6 +418,18 @@ struct ${type}${width}_vec {
          % else:
             _dst_val.${get_const_field(output_type)}[${k}] = dst.${"xyzw"[k]};
          % endif
+
+         % if op.name != "fquantize2f16" and type_base_type(output_type) == "float":
+            % if type_has_size(output_type):
+               if (execution_mode & SHADER_DENORM_FLUSH_TO_ZERO_FP${type_size(output_type)}) {
+                  _dst_val = constant_denorm_flush_to_zero(_dst_val, ${k}, ${type_size(output_type)});
+               }
+            % else:
+               if (execution_mode & SHADER_DENORM_FLUSH_TO_ZERO_FP${bit_size}) {
+                  _dst_val = constant_denorm_flush_to_zero(_dst_val, ${k}, bit_size);
+               }
+            % endif
+         % endif
       % endfor
    % endif
 </%def>
@@ -383,7 +438,8 @@ struct ${type}${width}_vec {
 static nir_const_value
 evaluate_${name}(MAYBE_UNUSED unsigned num_components,
                  ${"UNUSED" if op_bit_sizes(op) is None else ""} unsigned bit_size,
-                 MAYBE_UNUSED nir_const_value *_src)
+                 MAYBE_UNUSED nir_const_value *_src,
+                 MAYBE_UNUSED unsigned execution_mode)
 {
    nir_const_value _dst_val = { {0, } };
 
@@ -391,7 +447,7 @@ evaluate_${name}(MAYBE_UNUSED unsigned num_components,
       switch (bit_size) {
       % for bit_size in op_bit_sizes(op):
       case ${bit_size}: {
-         ${evaluate_op(op, bit_size)}
+         ${evaluate_op(op, bit_size, execution_mode)}
          break;
       }
       % endfor
@@ -400,7 +456,7 @@ evaluate_${name}(MAYBE_UNUSED unsigned num_components,
          unreachable("unknown bit width");
       }
    % else:
-      ${evaluate_op(op, 0)}
+      ${evaluate_op(op, 0, execution_mode)}
    % endif
 
    return _dst_val;
@@ -409,12 +465,13 @@ evaluate_${name}(MAYBE_UNUSED unsigned num_components,
 
 nir_const_value
 nir_eval_const_opcode(nir_op op, unsigned num_components,
-                      unsigned bit_width, nir_const_value *src)
+                      unsigned bit_width, nir_const_value *src,
+                      unsigned float_controls_execution_mode)
 {
    switch (op) {
 % for name in sorted(opcodes.keys()):
    case nir_op_${name}:
-      return evaluate_${name}(num_components, bit_width, src);
+      return evaluate_${name}(num_components, bit_width, src, float_controls_execution_mode);
 % endfor
    default:
       unreachable("shouldn't get here");
@@ -424,6 +481,8 @@ nir_eval_const_opcode(nir_op op, unsigned num_components,
 from mako.template import Template
 
 print(Template(template).render(opcodes=opcodes, type_sizes=type_sizes,
+                                type_base_type=type_base_type,
+                                type_size=type_size,
                                 type_has_size=type_has_size,
                                 type_add_size=type_add_size,
                                 op_bit_sizes=op_bit_sizes,
diff --git a/src/compiler/nir/nir_loop_analyze.c b/src/compiler/nir/nir_loop_analyze.c
index 6deb6cb9627..9026a4f406e 100644
--- a/src/compiler/nir/nir_loop_analyze.c
+++ b/src/compiler/nir/nir_loop_analyze.c
@@ -469,7 +469,8 @@ static bool
 test_iterations(int32_t iter_int, nir_const_value *step,
                 nir_const_value *limit, nir_op cond_op, unsigned bit_size,
                 nir_alu_type induction_base_type,
-                nir_const_value *initial, bool limit_rhs, bool invert_cond)
+                nir_const_value *initial, bool limit_rhs, bool invert_cond,
+                unsigned execution_mode)
 {
    assert(nir_op_infos[cond_op].num_inputs == 2);
 
@@ -497,19 +498,20 @@ test_iterations(int32_t iter_int, nir_const_value *step,
     */
    nir_const_value mul_src[2] = { iter_src, *step };
    nir_const_value mul_result =
-      nir_eval_const_opcode(mul_op, 1, bit_size, mul_src);
+      nir_eval_const_opcode(mul_op, 1, bit_size, mul_src, execution_mode);
 
    /* Add the initial value to the accumulated induction variable total */
    nir_const_value add_src[2] = { mul_result, *initial };
    nir_const_value add_result =
-      nir_eval_const_opcode(add_op, 1, bit_size, add_src);
+      nir_eval_const_opcode(add_op, 1, bit_size, add_src, execution_mode);
 
    nir_const_value src[2] = { { {0, } }, { {0, } } };
    src[limit_rhs ? 0 : 1] = add_result;
    src[limit_rhs ? 1 : 0] = *limit;
 
    /* Evaluate the loop exit condition */
-   nir_const_value result = nir_eval_const_opcode(cond_op, 1, bit_size, src);
+   nir_const_value result = nir_eval_const_opcode(cond_op, 1, bit_size, src,
+                                                  execution_mode);
 
    return invert_cond ? (result.u32[0] == 0) : (result.u32[0] != 0);
 }
@@ -517,7 +519,8 @@ test_iterations(int32_t iter_int, nir_const_value *step,
 static int
 calculate_iterations(nir_const_value *initial, nir_const_value *step,
                      nir_const_value *limit, nir_loop_variable *alu_def,
-                     nir_alu_instr *cond_alu, bool limit_rhs, bool invert_cond)
+                     nir_alu_instr *cond_alu, bool limit_rhs, bool invert_cond,
+                     unsigned execution_mode)
 {
    assert(initial != NULL && step != NULL && limit != NULL);
 
@@ -584,7 +587,7 @@ calculate_iterations(nir_const_value *initial, nir_const_value *step,
 
       if (test_iterations(iter_bias, step, limit, cond_alu->op, bit_size,
                           induction_base_type, initial,
-                          limit_rhs, invert_cond)) {
+                          limit_rhs, invert_cond, execution_mode)) {
          return iter_bias > 0 ? iter_bias - trip_offset : iter_bias;
       }
    }
@@ -599,7 +602,7 @@ calculate_iterations(nir_const_value *initial, nir_const_value *step,
  * loop.
  */
 static void
-find_trip_count(loop_info_state *state)
+find_trip_count(loop_info_state *state, unsigned execution_mode)
 {
    bool trip_count_known = true;
    nir_loop_terminator *limiting_terminator = NULL;
@@ -670,7 +673,8 @@ find_trip_count(loop_info_state *state)
                                                &limit_val,
                                                basic_ind->ind->alu_def, alu,
                                                limit_rhs,
-                                               terminator->continue_from_then);
+                                               terminator->continue_from_then,
+                                               execution_mode);
 
          /* Where we not able to calculate the iteration count */
          if (iterations == -1) {
@@ -801,7 +805,7 @@ get_loop_info(loop_info_state *state, nir_function_impl *impl)
       return;
 
    /* Run through each of the terminators and try to compute a trip-count */
-   find_trip_count(state);
+   find_trip_count(state, impl->function->shader->info.shader_float_controls_execution_mode);
 
    nir_foreach_block_in_cf_node(block, &state->loop->cf_node) {
       if (force_unroll_heuristics(state, block)) {
diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c
index 83be0d78dbd..10bbf553d45 100644
--- a/src/compiler/nir/nir_opt_constant_folding.c
+++ b/src/compiler/nir/nir_opt_constant_folding.c
@@ -39,7 +39,7 @@ struct constant_fold_state {
 };
 
 static bool
-constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
+constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx, unsigned execution_mode)
 {
    nir_const_value src[NIR_MAX_VEC_COMPONENTS];
 
@@ -108,7 +108,7 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
 
    nir_const_value dest =
       nir_eval_const_opcode(instr->op, instr->dest.dest.ssa.num_components,
-                            bit_size, src);
+                            bit_size, src, execution_mode);
 
    nir_load_const_instr *new_instr =
       nir_load_const_instr_create(mem_ctx,
@@ -161,14 +161,14 @@ constant_fold_intrinsic_instr(nir_intrinsic_instr *instr)
 }
 
 static bool
-constant_fold_block(nir_block *block, void *mem_ctx)
+constant_fold_block(nir_block *block, void *mem_ctx, unsigned execution_mode)
 {
    bool progress = false;
 
    nir_foreach_instr_safe(instr, block) {
       switch (instr->type) {
       case nir_instr_type_alu:
-         progress |= constant_fold_alu_instr(nir_instr_as_alu(instr), mem_ctx);
+         progress |= constant_fold_alu_instr(nir_instr_as_alu(instr), mem_ctx, execution_mode);
          break;
       case nir_instr_type_intrinsic:
          progress |=
@@ -184,13 +184,13 @@ constant_fold_block(nir_block *block, void *mem_ctx)
 }
 
 static bool
-nir_opt_constant_folding_impl(nir_function_impl *impl)
+nir_opt_constant_folding_impl(nir_function_impl *impl, unsigned execution_mode)
 {
    void *mem_ctx = ralloc_parent(impl);
    bool progress = false;
 
    nir_foreach_block(block, impl) {
-      progress |= constant_fold_block(block, mem_ctx);
+      progress |= constant_fold_block(block, mem_ctx, execution_mode);
    }
 
    if (progress) {
@@ -209,10 +209,11 @@ bool
 nir_opt_constant_folding(nir_shader *shader)
 {
    bool progress = false;
+   unsigned execution_mode = shader->info.shader_float_controls_execution_mode;
 
    nir_foreach_function(function, shader) {
       if (function->impl)
-         progress |= nir_opt_constant_folding_impl(function->impl);
+         progress |= nir_opt_constant_folding_impl(function->impl, execution_mode);
    }
 
    return progress;
diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c
index 3f23e799431..c1703d98bc1 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -1966,7 +1966,8 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
          }
 
          val->constant->values[0] =
-            nir_eval_const_opcode(op, num_components, bit_size, src);
+            nir_eval_const_opcode(op, num_components, bit_size, src,
+                                  b->shader->info.shader_float_controls_execution_mode);
          break;
       } /* default */
       }
-- 
2.19.1


