[Mesa-dev] [PATCH 51/56] intel/fs: Implement reduce and scan opeprations

Wed Mar 7 14:35:39 UTC 2018

Acked-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Reviewed-by: Iago Toral Quiroga <itoral at igalia.com>
---
 src/intel/compiler/brw_fs_nir.cpp | 154 ++++++++++++++++++++++++++++++++++++++
 src/intel/compiler/brw_reg.h      |   8 ++
 2 files changed, 162 insertions(+)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index d2d32f9..e25fd38 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3722,6 +3722,71 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
    }
 }
 
+static fs_reg
+brw_nir_reduction_op_identity(const fs_builder &bld,
+                              nir_op op, brw_reg_type type)
+{
+   nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
+   switch (type_sz(type)) {
+   case 2:
+      assert(type != BRW_REGISTER_TYPE_HF);
+      return retype(brw_imm_uw(value.u16[0]), type);
+   case 4:
+      return retype(brw_imm_ud(value.u32[0]), type);
+   case 8:
+      if (type == BRW_REGISTER_TYPE_DF)
+         return setup_imm_df(bld, value.f64[0]);
+      else
+         return retype(brw_imm_u64(value.u64[0]), type);
+   default:
+      unreachable("Invalid type size");
+   }
+}
+
+static opcode
+brw_op_for_nir_reduction_op(nir_op op)
+{
+   switch (op) {
+   case nir_op_iadd: return BRW_OPCODE_ADD;
+   case nir_op_fadd: return BRW_OPCODE_ADD;
+   case nir_op_imul: return BRW_OPCODE_MUL;
+   case nir_op_fmul: return BRW_OPCODE_MUL;
+   case nir_op_imin: return BRW_OPCODE_SEL;
+   case nir_op_umin: return BRW_OPCODE_SEL;
+   case nir_op_fmin: return BRW_OPCODE_SEL;
+   case nir_op_imax: return BRW_OPCODE_SEL;
+   case nir_op_umax: return BRW_OPCODE_SEL;
+   case nir_op_fmax: return BRW_OPCODE_SEL;
+   case nir_op_iand: return BRW_OPCODE_AND;
+   case nir_op_ior:  return BRW_OPCODE_OR;
+   case nir_op_ixor: return BRW_OPCODE_XOR;
+   default:
+      unreachable("Invalid reduction operation");
+   }
+}
+
+static brw_conditional_mod
+brw_cond_mod_for_nir_reduction_op(nir_op op)
+{
+   switch (op) {
+   case nir_op_iadd: return BRW_CONDITIONAL_NONE;
+   case nir_op_fadd: return BRW_CONDITIONAL_NONE;
+   case nir_op_imul: return BRW_CONDITIONAL_NONE;
+   case nir_op_fmul: return BRW_CONDITIONAL_NONE;
+   case nir_op_imin: return BRW_CONDITIONAL_L;
+   case nir_op_umin: return BRW_CONDITIONAL_L;
+   case nir_op_fmin: return BRW_CONDITIONAL_L;
+   case nir_op_imax: return BRW_CONDITIONAL_GE;
+   case nir_op_umax: return BRW_CONDITIONAL_GE;
+   case nir_op_fmax: return BRW_CONDITIONAL_GE;
+   case nir_op_iand: return BRW_CONDITIONAL_NONE;
+   case nir_op_ior:  return BRW_CONDITIONAL_NONE;
+   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
+   default:
+      unreachable("Invalid reduction operation");
+   }
+}
+
 void
 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
 {
@@ -4523,6 +4588,95 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_reduce: {
+      fs_reg src = get_nir_src(instr->src[0]);
+      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+      unsigned cluster_size = nir_intrinsic_cluster_size(instr);
+      if (cluster_size == 0 || cluster_size > dispatch_width)
+         cluster_size = dispatch_width;
+
+      /* Figure out the source type */
+      src.type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+                        nir_src_bit_size(instr->src[0])));
+
+      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+      opcode brw_op = brw_op_for_nir_reduction_op(redop);
+      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+      /* Set up a register for all of our scratching around and initialize it
+       * to reduction operation's identity value.
+       */
+      fs_reg scan = bld.vgrf(src.type);
+      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
+
+      dest.type = src.type;
+      if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
+         /* In this case, CLUSTER_BROADCAST instruction isn't needed because
+          * the distance between clusters is at least 2 GRFs.  In this case,
+          * we don't need the weird striding of the CLUSTER_BROADCAST
+          * instruction and can just do regular MOVs.
+          */
+         assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
+         const unsigned groups =
+            (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
+         const unsigned group_size = dispatch_width / groups;
+         for (unsigned i = 0; i < groups; i++) {
+            const unsigned cluster = (i * group_size) / cluster_size;
+            const unsigned comp = cluster * cluster_size + (cluster_size - 1);
+            bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
+                                         component(scan, comp));
+         }
+      } else {
+         bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
+                  brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
+      }
+      break;
+   }
+
+   case nir_intrinsic_inclusive_scan:
+   case nir_intrinsic_exclusive_scan: {
+      fs_reg src = get_nir_src(instr->src[0]);
+      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+
+      /* Figure out the source type */
+      src.type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+                        nir_src_bit_size(instr->src[0])));
+
+      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+      opcode brw_op = brw_op_for_nir_reduction_op(redop);
+      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+      /* Set up a register for all of our scratching around and initialize it
+       * to reduction operation's identity value.
+       */
+      fs_reg scan = bld.vgrf(src.type);
+      const fs_builder allbld = bld.exec_all();
+      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
+         /* Exclusive scan is a bit harder because we have to do an annoying
+          * shift of the contents before we can begin.  To make things worse,
+          * we can't do this with a normal stride; we have to use indirects.
+          */
+         fs_reg shifted = bld.vgrf(src.type);
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                         brw_imm_w(-1));
+         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
+         allbld.group(1, 0).MOV(component(shifted, 0), identity);
+         scan = shifted;
+      }
+
+      bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
+
+      bld.MOV(retype(dest, src.type), scan);
+      break;
+   }
+
    default:
       unreachable("unknown intrinsic");
    }
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index c414081..7ad144b 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -590,6 +590,14 @@ brw_imm_df(double df)
 }
 
 static inline struct brw_reg
+brw_imm_u64(uint64_t u64)
+{
+   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UQ);
+   imm.u64 = u64;
+   return imm;
+}
+
+static inline struct brw_reg
 brw_imm_f(float f)
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F);
-- 
2.5.0.400.gff86faf