[Mesa-dev] [PATCH 9/9] i965/vec4: Use NIR to do GS input remapping

Sat May 6 06:17:10 UTC 2017

On 05/05/17 21:24, Jason Ekstrand wrote:
> On Fri, May 5, 2017 at 1:04 AM, Alejandro Piñeiro
> <apinheiro at igalia.com <mailto:apinheiro at igalia.com>> wrote:
>
>     On 05/05/17 04:11, Jason Ekstrand wrote:
>     > We're already doing this in the FS back-end.  This just does the
>     same
>     > thing in the vec4 back-end.
>     > ---
>     >  src/intel/compiler/brw_nir.c               |  3 --
>     >  src/intel/compiler/brw_vec4.cpp            | 44 -------------------
>     >  src/intel/compiler/brw_vec4.h              |  2 -
>     >  src/intel/compiler/brw_vec4_gs_nir.cpp     | 10 ++---
>     >  src/intel/compiler/brw_vec4_gs_visitor.cpp | 70
>     +++++++++++++++++++++---------
>     >  src/intel/compiler/brw_vec4_gs_visitor.h   |  4 +-
>     >  src/intel/compiler/gen6_gs_visitor.cpp     |  4 +-
>     >  7 files changed, 56 insertions(+), 81 deletions(-)
>     >
>     > diff --git a/src/intel/compiler/brw_nir.c
>     b/src/intel/compiler/brw_nir.c
>     > index d92e8fa..8a9f89f 100644
>     > --- a/src/intel/compiler/brw_nir.c
>     > +++ b/src/intel/compiler/brw_nir.c
>     > @@ -342,9 +342,6 @@ brw_nir_lower_vue_inputs(nir_shader *nir,
>     bool is_scalar,
>     >     /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
>     >     nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
>     >
>     > -   if (nir->stage == MESA_SHADER_GEOMETRY && !is_scalar)
>     > -      return;
>     > -
>     >     /* This pass needs actual constants */
>     >     nir_opt_constant_folding(nir);
>     >
>     > diff --git a/src/intel/compiler/brw_vec4.cpp
>     b/src/intel/compiler/brw_vec4.cpp
>     > index b387321..ad92951 100644
>     > --- a/src/intel/compiler/brw_vec4.cpp
>     > +++ b/src/intel/compiler/brw_vec4.cpp
>     > @@ -1693,50 +1693,6 @@ attribute_to_hw_reg(int attr,
>     brw_reg_type type, bool interleaved)
>     >     return reg;
>     >  }
>     >
>     > -
>     > -/**
>     > - * Replace each register of type ATTR in this->instructions
>     with a reference
>     > - * to a fixed HW register.
>     > - *
>     > - * If interleaved is true, then each attribute takes up half a
>     register, with
>     > - * register N containing attribute 2*N in its first half and
>     attribute 2*N+1
>     > - * in its second half (this corresponds to the payload setup
>     used by geometry
>     > - * shaders in "single" or "dual instanced" dispatch mode).  If
>     interleaved is
>     > - * false, then each attribute takes up a whole register, with
>     register N
>     > - * containing attribute N (this corresponds to the payload
>     setup used by
>     > - * vertex shaders, and by geometry shaders in "dual object"
>     dispatch mode).
>     > - */
>     > -void
>     > -vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
>     > -                                          bool interleaved)
>     > -{
>     > -   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
>     > -      for (int i = 0; i < 3; i++) {
>     > -         if (inst->src[i].file != ATTR)
>     > -            continue;
>     > -
>     > -         int grf = attribute_map[inst->src[i].nr +
>     > -                                 inst->src[i].offset / REG_SIZE];
>     > -         assert(inst->src[i].offset % REG_SIZE == 0);
>     > -
>     > -         /* All attributes used in the shader need to have been
>     assigned a
>     > -          * hardware register by the caller
>     > -          */
>     > -         assert(grf != 0);
>     > -
>     > -         struct brw_reg reg =
>     > -            attribute_to_hw_reg(grf, inst->src[i].type,
>     interleaved);
>     > -         reg.swizzle = inst->src[i].swizzle;
>     > -         if (inst->src[i].abs)
>     > -            reg = brw_abs(reg);
>     > -         if (inst->src[i].negate)
>     > -            reg = negate(reg);
>     > -
>     > -         inst->src[i] = reg;
>     > -      }
>     > -   }
>     > -}
>     > -
>     >  int
>     >  vec4_vs_visitor::setup_attributes(int payload_reg)
>     >  {
>     > diff --git a/src/intel/compiler/brw_vec4.h
>     b/src/intel/compiler/brw_vec4.h
>     > index 89adfaa..0f92f34 100644
>     > --- a/src/intel/compiler/brw_vec4.h
>     > +++ b/src/intel/compiler/brw_vec4.h
>     > @@ -367,8 +367,6 @@ public:
>     >
>     >  protected:
>     >     void emit_vertex();
>     > -   void lower_attributes_to_hw_regs(const int *attribute_map,
>     > -                                    bool interleaved);
>     >     void setup_payload_interference(struct ra_graph *g, int
>     first_payload_node,
>     >                                     int reg_node_count);
>     >     virtual void setup_payload() = 0;
>     > diff --git a/src/intel/compiler/brw_vec4_gs_nir.cpp
>     b/src/intel/compiler/brw_vec4_gs_nir.cpp
>     > index ed8c03b..577f587 100644
>     > --- a/src/intel/compiler/brw_vec4_gs_nir.cpp
>     > +++ b/src/intel/compiler/brw_vec4_gs_nir.cpp
>     > @@ -66,8 +66,10 @@
>     vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
>     >        nir_const_value *vertex =
>     nir_src_as_const_value(instr->src[0]);
>     >        nir_const_value *offset_reg =
>     nir_src_as_const_value(instr->src[1]);
>     >
>     > +      const unsigned input_array_stride =
>     prog_data->urb_read_length * 2;
>
>     The change of using the input_array_stride instead of
>     BRW_VARYING_SLOT_COUNT is not evident (at least to me). There is a
>     comment explaining the stride purpose on
>     vec4_gs_visitor::setup_varying_inputs, but perhaps a note here
>     would be
>     welcomed. You can drop this suggestion if you prefer.
>
>
> We were using BRW_VARYING_SLOT_COUNT before because we were doing a
> generic thing here that we would later remap using
> input_array_stride.  Now we're just using input_array_stride
> directly.  I think if this weren't a change, you would probably not
> notice. :-)  How about:
>
> All of the inputs for the first vertex come in first followed by all
> inputs for the second vertex etc.  The stride between vertices is
> governed by the URB read length.

Sounds good to me.

>
> --Jason
>  
>
>     Other than that:
>     Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com
>     <mailto:apinheiro at igalia.com>>
>

This still applies (in case you missed).

>
>     > +
>     >        if (nir_dest_bit_size(instr->dest) == 64) {
>     > -         src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT *
>     vertex->u32[0] +
>     > +         src = src_reg(ATTR, input_array_stride * vertex->u32[0] +
>     >                         instr->const_index[0] + offset_reg->u32[0],
>     >                         glsl_type::dvec4_type);
>     >
>     > @@ -85,15 +87,11 @@
>     vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
>     >           /* Make up a type...we have no way of knowing... */
>     >           const glsl_type *const type =
>     glsl_type::ivec(instr->num_components);
>     >
>     > -         src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT *
>     vertex->u32[0] +
>     > +         src = src_reg(ATTR, input_array_stride * vertex->u32[0] +
>     >                         instr->const_index[0] + offset_reg->u32[0],
>     >                         type);
>     >           src.swizzle =
>     BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
>     >
>     > -         /* gl_PointSize is passed in the .w component of the
>     VUE header */
>     > -         if (instr->const_index[0] == VARYING_SLOT_PSIZ)
>     > -            src.swizzle = BRW_SWIZZLE_WWWW;
>     > -
>     >           dest = get_nir_dest(instr->dest, src.type);
>     >           dest.writemask =
>     brw_writemask_for_size(instr->num_components);
>     >           emit(MOV(dest, src));
>     > diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp
>     b/src/intel/compiler/brw_vec4_gs_visitor.cpp
>     > index 4a8b5be..516ab0b 100644
>     > --- a/src/intel/compiler/brw_vec4_gs_visitor.cpp
>     > +++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp
>     > @@ -29,6 +29,7 @@
>     >
>     >  #include "brw_vec4_gs_visitor.h"
>     >  #include "gen6_gs_visitor.h"
>     > +#include "brw_cfg.h"
>     >  #include "brw_fs.h"
>     >  #include "brw_nir.h"
>     >  #include "common/gen_debug.h"
>     > @@ -72,9 +73,36 @@
>     vec4_gs_visitor::make_reg_for_system_value(int location)
>     >     return reg;
>     >  }
>     >
>     > +static inline struct brw_reg
>     > +attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved)
>     > +{
>     > +   struct brw_reg reg;
>     >
>     > +   unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type));
>     > +   if (interleaved) {
>     > +      reg = stride(brw_vecn_grf(width, attr / 2, (attr % 2) *
>     4), 0, width, 1);
>     > +   } else {
>     > +      reg = brw_vecn_grf(width, attr, 0);
>     > +   }
>     > +
>     > +   reg.type = type;
>     > +   return reg;
>     > +}
>     > +
>     > +/**
>     > + * Replace each register of type ATTR in this->instructions
>     with a reference
>     > + * to a fixed HW register.
>     > + *
>     > + * If interleaved is true, then each attribute takes up half a
>     register, with
>     > + * register N containing attribute 2*N in its first half and
>     attribute 2*N+1
>     > + * in its second half (this corresponds to the payload setup
>     used by geometry
>     > + * shaders in "single" or "dual instanced" dispatch mode).  If
>     interleaved is
>     > + * false, then each attribute takes up a whole register, with
>     register N
>     > + * containing attribute N (this corresponds to the payload
>     setup used by
>     > + * vertex shaders, and by geometry shaders in "dual object"
>     dispatch mode).
>     > + */
>     >  int
>     > -vec4_gs_visitor::setup_varying_inputs(int payload_reg, int
>     *attribute_map,
>     > +vec4_gs_visitor::setup_varying_inputs(int payload_reg,
>     >                                        int attributes_per_reg)
>     >  {
>     >     /* For geometry shaders there are N copies of the input
>     attributes, where N
>     > @@ -89,12 +117,24 @@ vec4_gs_visitor::setup_varying_inputs(int
>     payload_reg, int *attribute_map,
>     >     assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
>     >     unsigned input_array_stride = prog_data->urb_read_length * 2;
>     >
>     > -   for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
>     > -      int varying = c->input_vue_map.slot_to_varying[slot];
>     > -      for (unsigned vertex = 0; vertex < num_input_vertices;
>     vertex++) {
>     > -         attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
>     > -            attributes_per_reg * payload_reg +
>     input_array_stride * vertex +
>     > -            slot;
>     > +   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
>     > +      for (int i = 0; i < 3; i++) {
>     > +         if (inst->src[i].file != ATTR)
>     > +            continue;
>     > +
>     > +         assert(inst->src[i].offset % REG_SIZE == 0);
>     > +         int grf = payload_reg * attributes_per_reg +
>     > +                   inst->src[i].nr + inst->src[i].offset /
>     REG_SIZE;
>     > +
>     > +         struct brw_reg reg =
>     > +            attribute_to_hw_reg(grf, inst->src[i].type,
>     attributes_per_reg > 1);
>     > +         reg.swizzle = inst->src[i].swizzle;
>     > +         if (inst->src[i].abs)
>     > +            reg = brw_abs(reg);
>     > +         if (inst->src[i].negate)
>     > +            reg = negate(reg);
>     > +
>     > +         inst->src[i] = reg;
>     >        }
>     >     }
>     >
>     > @@ -103,25 +143,15 @@ vec4_gs_visitor::setup_varying_inputs(int
>     payload_reg, int *attribute_map,
>     >     return payload_reg + regs_used;
>     >  }
>     >
>     > -
>     >  void
>     >  vec4_gs_visitor::setup_payload()
>     >  {
>     > -   int attribute_map[BRW_VARYING_SLOT_COUNT *
>     MAX_GS_INPUT_VERTICES];
>     > -
>     >     /* If we are in dual instanced or single mode, then
>     attributes are going
>     >      * to be interleaved, so one register contains two attribute
>     slots.
>     >      */
>     >     int attributes_per_reg =
>     >        prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT
>     ? 1 : 2;
>     >
>     > -   /* If a geometry shader tries to read from an input that
>     wasn't written by
>     > -    * the vertex shader, that produces undefined results, but
>     it shouldn't
>     > -    * crash anything.  So initialize attribute_map to
>     zeros--that ensures that
>     > -    * these undefined results are read from r0.
>     > -    */
>     > -   memset(attribute_map, 0, sizeof(attribute_map));
>     > -
>     >     int reg = 0;
>     >
>     >     /* The payload always contains important data in r0, which
>     contains
>     > @@ -132,13 +162,11 @@ vec4_gs_visitor::setup_payload()
>     >
>     >     /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
>     >     if (gs_prog_data->include_primitive_id)
>     > -      attribute_map[VARYING_SLOT_PRIMITIVE_ID] =
>     attributes_per_reg * reg++;
>     > +      reg++;
>     >
>     >     reg = setup_uniforms(reg);
>     >
>     > -   reg = setup_varying_inputs(reg, attribute_map,
>     attributes_per_reg);
>     > -
>     > -   lower_attributes_to_hw_regs(attribute_map,
>     attributes_per_reg > 1);
>     > +   reg = setup_varying_inputs(reg, attributes_per_reg);
>     >
>     >     this->first_non_payload_grf = reg;
>     >  }
>     > diff --git a/src/intel/compiler/brw_vec4_gs_visitor.h
>     b/src/intel/compiler/brw_vec4_gs_visitor.h
>     > index 09221f9..6b21a33 100644
>     > --- a/src/intel/compiler/brw_vec4_gs_visitor.h
>     > +++ b/src/intel/compiler/brw_vec4_gs_visitor.h
>     > @@ -33,6 +33,7 @@
>     >  #include "brw_vec4.h"
>     >
>     >  #define MAX_GS_INPUT_VERTICES 6
>     > +#define MAX_GS_INPUT_VARYINGS 32
>     >
>     >  #ifdef __cplusplus
>     >  namespace brw {
>     > @@ -64,8 +65,7 @@ protected:
>     >     virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
>     >
>     >  protected:
>     > -   int setup_varying_inputs(int payload_reg, int *attribute_map,
>     > -                            int attributes_per_reg);
>     > +   int setup_varying_inputs(int payload_reg, int
>     attributes_per_reg);
>     >     void emit_control_data_bits();
>     >     void set_stream_control_data_bits(unsigned stream_id);
>     >
>     > diff --git a/src/intel/compiler/gen6_gs_visitor.cpp
>     b/src/intel/compiler/gen6_gs_visitor.cpp
>     > index 075bc4a..afc6056 100644
>     > --- a/src/intel/compiler/gen6_gs_visitor.cpp
>     > +++ b/src/intel/compiler/gen6_gs_visitor.cpp
>     > @@ -516,9 +516,7 @@ gen6_gs_visitor::setup_payload()
>     >
>     >     reg = setup_uniforms(reg);
>     >
>     > -   reg = setup_varying_inputs(reg, attribute_map,
>     attributes_per_reg);
>     > -
>     > -   lower_attributes_to_hw_regs(attribute_map, true);
>     > +   reg = setup_varying_inputs(reg, attributes_per_reg);
>     >
>     >     this->first_non_payload_grf = reg;
>     >  }
>
>
>