[Mesa-dev] [PATCH 6/7] i965: Rewrite FS input handling to use the new NIR intrinsics.

Wed Jul 20 00:02:24 UTC 2016

On Tuesday, July 19, 2016 1:39:23 PM PDT Jason Ekstrand wrote:
> On Mon, Jul 18, 2016 at 1:26 PM, Kenneth Graunke <kenneth at whitecape.org>
> wrote:
> 
> > This eliminates the need to walk the list of input variables, recurse
> > into their types (via logic largely redundant with nir_lower_io), and
> > interpolate all possible inputs up front.  The backend no longer has
> > to care about variables at all, which eliminates complications from
> > trying to pack multiple variables into the same location.  Instead,
> > each intrinsic specifies exactly what's needed.
> >
> > This should unblock Timothy's work on GL_ARB_enhanced_layouts.
> >
> > Each load_interpolated_input intrinsic corresponds to PLN instructions,
> > while load_barycentric_at_* intrinsics correspond to pixel interpolator
> > messages.  The pixel/centroid/sample barycentric intrinsics simply refer
> > to payload fields (delta_xy[]), and don't actually generate any code.
> >
> > Because we use a single intrinsic for both centroid-qualified variables
> > and interpolateAtCentroid(), they become indistinguishable.  We stop
> > sending pixel interpolator messages for those, and instead use the
> > payload provided data, which should be considerably faster.
> >
> > On Broadwell:
> >
> > total instructions in shared programs: 9067751 -> 9067570 (-0.00%)
> > instructions in affected programs: 145902 -> 145721 (-0.12%)
> > helped: 422
> > HURT: 209
> >
> > total spills in shared programs: 2849 -> 2899 (1.76%)
> > spills in affected programs: 760 -> 810 (6.58%)
> > helped: 0
> > HURT: 10
> >
> > total fills in shared programs: 3910 -> 3950 (1.02%)
> > fills in affected programs: 617 -> 657 (6.48%)
> > helped: 0
> > HURT: 10
> >
> > LOST:   3
> > GAINED: 3
> >
> > The differences mostly appear to be slight changes in MOVs.
> >
> > Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> > ---
> >  src/mesa/drivers/dri/i965/brw_fs.cpp     | 175 ++++---------
> >  src/mesa/drivers/dri/i965/brw_fs.h       |   9 +-
> >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 410
> > ++++++++++++++++---------------
> >  src/mesa/drivers/dri/i965/brw_nir.c      |  16 +-
> >  4 files changed, 269 insertions(+), 341 deletions(-)
> >
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > index 94127bc..06007fe 100644
> > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > @@ -1067,21 +1067,27 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg
> > wpos)
> >     bld.MOV(wpos, this->wpos_w);
> >  }
> >
> > -static enum brw_barycentric_mode
> > -barycentric_mode(enum glsl_interp_mode mode,
> > -                 bool is_centroid, bool is_sample)
> > +enum brw_barycentric_mode
> > +brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
> >  {
> > -   unsigned bary;
> > -
> >     /* Barycentric modes don't make sense for flat inputs. */
> >     assert(mode != INTERP_MODE_FLAT);
> >
> > -   if (is_sample) {
> > -      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> > -   } else if (is_centroid) {
> > -      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> > -   } else {
> > +   unsigned bary;
> > +   switch (op) {
> > +   case nir_intrinsic_load_barycentric_pixel:
> > +   case nir_intrinsic_load_barycentric_at_offset:
> >        bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
> > +      break;
> > +   case nir_intrinsic_load_barycentric_centroid:
> > +      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> > +      break;
> > +   case nir_intrinsic_load_barycentric_sample:
> > +   case nir_intrinsic_load_barycentric_at_sample:
> > +      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> > +      break;
> > +   default:
> > +      assert(!"invalid intrinsic");
> >     }
> >
> >     if (mode == INTERP_MODE_NOPERSPECTIVE)
> > @@ -1101,107 +1107,6 @@ centroid_to_pixel(enum brw_barycentric_mode bary)
> >     return (enum brw_barycentric_mode) ((unsigned) bary - 1);
> >  }
> >
> > -void
> > -fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
> > -                                       const glsl_type *type,
> > -                                       glsl_interp_mode
> > interpolation_mode,
> > -                                       int *location, bool mod_centroid,
> > -                                       bool mod_sample)
> > -{
> > -   assert(stage == MESA_SHADER_FRAGMENT);
> > -   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> > -
> > -   if (type->is_array() || type->is_matrix()) {
> > -      const glsl_type *elem_type = glsl_get_array_element(type);
> > -      const unsigned length = glsl_get_length(type);
> > -
> > -      for (unsigned i = 0; i < length; i++) {
> > -         emit_general_interpolation(attr, name, elem_type,
> > interpolation_mode,
> > -                                    location, mod_centroid, mod_sample);
> > -      }
> > -   } else if (type->is_record()) {
> > -      for (unsigned i = 0; i < type->length; i++) {
> > -         const glsl_type *field_type = type->fields.structure[i].type;
> > -         emit_general_interpolation(attr, name, field_type,
> > interpolation_mode,
> > -                                    location, mod_centroid, mod_sample);
> > -      }
> > -   } else {
> > -      assert(type->is_scalar() || type->is_vector());
> > -
> > -      if (prog_data->urb_setup[*location] == -1) {
> > -         /* If there's no incoming setup data for this slot, don't
> > -          * emit interpolation for it.
> > -          */
> > -         *attr = offset(*attr, bld, type->vector_elements);
> > -         (*location)++;
> > -         return;
> > -      }
> > -
> > -      attr->type = brw_type_for_base_type(type->get_scalar_type());
> > -
> > -      if (interpolation_mode == INTERP_MODE_FLAT) {
> > -         /* Constant interpolation (flat shading) case. The SF has
> > -          * handed us defined values in only the constant offset
> > -          * field of the setup reg.
> > -          */
> > -         unsigned vector_elements = type->vector_elements;
> > -
> > -         /* Data starts at suboffet 3 in 32-bit units (12 bytes), so it
> > is not
> > -          * 64-bit aligned and the current implementation fails to read
> > the
> > -          * data properly. Instead, when there is a double input varying,
> > -          * read it as vector of floats with twice the number of
> > components.
> > -          */
> > -         if (attr->type == BRW_REGISTER_TYPE_DF) {
> > -            vector_elements *= 2;
> > -            attr->type = BRW_REGISTER_TYPE_F;
> > -         }
> > -         for (unsigned int i = 0; i < vector_elements; i++) {
> > -            struct brw_reg interp = interp_reg(*location, i);
> > -            interp = suboffset(interp, 3);
> > -            interp.type = attr->type;
> > -            bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
> > -            *attr = offset(*attr, bld, 1);
> > -         }
> > -      } else {
> > -         /* Smooth/noperspective interpolation case. */
> > -         enum brw_barycentric_mode bary =
> > -            barycentric_mode(interpolation_mode, mod_centroid,
> > mod_sample);
> > -
> > -         for (unsigned int i = 0; i < type->vector_elements; i++) {
> > -            fs_reg interp(interp_reg(*location, i));
> > -            if (devinfo->needs_unlit_centroid_workaround && mod_centroid)
> > {
> > -               /* Get the pixel/sample mask into f0 so that we know
> > -                * which pixels are lit.  Then, for each channel that is
> > -                * unlit, replace the centroid data with non-centroid
> > -                * data.
> > -                */
> > -               bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> > -
> > -               fs_inst *inst;
> > -               inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> > -                               delta_xy[centroid_to_pixel(bary)], interp);
> > -               inst->predicate = BRW_PREDICATE_NORMAL;
> > -               inst->predicate_inverse = true;
> > -               inst->no_dd_clear = true;
> > -
> > -               inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> > -                               delta_xy[bary], interp);
> > -               inst->predicate = BRW_PREDICATE_NORMAL;
> > -               inst->predicate_inverse = false;
> > -               inst->no_dd_check = true;
> > -            } else {
> > -               bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary], interp);
> > -            }
> > -            if (devinfo->gen < 6 && interpolation_mode ==
> > INTERP_MODE_SMOOTH) {
> > -               bld.MUL(*attr, *attr, this->pixel_w);
> > -            }
> > -            *attr = offset(*attr, bld, 1);
> > -         }
> > -      }
> > -      (*location)++;
> > -   }
> > -}
> > -
> >  fs_reg *
> >  fs_visitor::emit_frontfacing_interpolation()
> >  {
> > @@ -6327,6 +6232,10 @@ fs_visitor::run_cs()
> >  /**
> >   * Return a bitfield where bit n is set if barycentric interpolation mode
> > n
> >   * (see enum brw_barycentric_mode) is needed by the fragment shader.
> > + *
> > + * We examine the load_barycentric intrinsics rather than looking at input
> > + * variables so that we catch interpolateAtCentroid() messages too, which
> > + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
> >   */
> >  static unsigned
> >  brw_compute_barycentric_interp_modes(const struct brw_device_info
> > *devinfo,
> > @@ -6334,29 +6243,37 @@ brw_compute_barycentric_interp_modes(const struct
> > brw_device_info *devinfo,
> >  {
> >     unsigned barycentric_interp_modes = 0;
> >
> > -   nir_foreach_variable(var, &shader->inputs) {
> > -      /* Ignore WPOS; it doesn't require interpolation. */
> > -      if (var->data.location == VARYING_SLOT_POS)
> > +   nir_foreach_function(f, shader) {
> > +      if (!f->impl)
> >           continue;
> >
> > -      /* Flat inputs don't need barycentric modes. */
> > -      if (var->data.interpolation == INTERP_MODE_FLAT)
> > -         continue;
> > +      nir_foreach_block(block, f->impl) {
> > +         nir_foreach_instr(instr, block) {
> > +            if (instr->type != nir_instr_type_intrinsic)
> > +               continue;
> >
> > -      /* Determine the set (or sets) of barycentric coordinates needed to
> > -       * interpolate this variable.  Note that when
> > -       * brw->needs_unlit_centroid_workaround is set, centroid
> > interpolation
> > -       * uses PIXEL interpolation for unlit pixels and CENTROID
> > interpolation
> > -       * for lit pixels, so we need both sets of barycentric coordinates.
> > -       */
> > -      enum brw_barycentric_mode bary_mode =
> > -         barycentric_mode((glsl_interp_mode) var->data.interpolation,
> > -                          var->data.centroid, var->data.sample);
> > +            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
> > +            if (intrin->intrinsic !=
> > nir_intrinsic_load_interpolated_input)
> > +               continue;
> >
> 
> Any particular reason why you're looking at the source of the load rather
> than just looking for the load_barycentric intrinsic directly?
> 
> 
> > +
> > +            /* Ignore WPOS; it doesn't require interpolation. */
> > +            if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
> > +               continue;
> >
> 
> Ugh...

This is why.  gl_FragCoord still generates a load_interpolated_input
with a load_barycentric_pixel, and I want to skip that.  (It might be
the only user of pixel coordinates, and if so, we don't want them.)

You need the load_interpolated_input intrinsic to know which input it is.

It's pretty clear that gl_FragCoord doesn't behave anything like a
normal input, so it ought to be a system value.  With that cleaned up,
we could simplify this.

That ended up being more work than I expected, so I was hoping to do
it as a follow-on series, to unblock Tim sooner rather than later.

> >
> > -      barycentric_interp_modes |= 1 << bary_mode;
> > +            intrin =
> > nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
> > +            enum glsl_interp_mode interp = (enum glsl_interp_mode)
> > +               nir_intrinsic_interp_mode(intrin);
> > +            nir_intrinsic_op bary_op = intrin->intrinsic;
> > +            enum brw_barycentric_mode bary =
> > +               brw_barycentric_mode(interp, bary_op);
> >
> > -      if (var->data.centroid && devinfo->needs_unlit_centroid_workaround)
> > -         barycentric_interp_modes |= 1 << centroid_to_pixel(bary_mode);
> > +            barycentric_interp_modes |= 1 << bary;
> > +
> > +            if (devinfo->needs_unlit_centroid_workaround &&
> > +                bary_op == nir_intrinsic_load_barycentric_centroid)
> > +               barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
> > +         }
> > +      }
> >     }
> >
> >     return barycentric_interp_modes;
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h
> > b/src/mesa/drivers/dri/i965/brw_fs.h
> > index 7998f51..574475f 100644
> > --- a/src/mesa/drivers/dri/i965/brw_fs.h
> > +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> > @@ -174,11 +174,6 @@ public:
> >     fs_reg *emit_samplepos_setup();
> >     fs_reg *emit_sampleid_setup();
> >     fs_reg *emit_samplemaskin_setup();
> > -   void emit_general_interpolation(fs_reg *attr, const char *name,
> > -                                   const glsl_type *type,
> > -                                   glsl_interp_mode interpolation_mode,
> > -                                   int *location, bool mod_centroid,
> > -                                   bool mod_sample);
> >     fs_reg *emit_vs_system_value(int location);
> >     void emit_interpolation_setup_gen4();
> >     void emit_interpolation_setup_gen6();
> > @@ -195,7 +190,6 @@ public:
> >     bool opt_zero_samples();
> >
> >     void emit_nir_code();
> > -   void nir_setup_inputs();
> >     void nir_setup_single_output_varying(fs_reg *reg, const glsl_type
> > *type,
> >                                          unsigned *location);
> >     void nir_setup_outputs();
> > @@ -511,3 +505,6 @@ void shuffle_64bit_data_for_32bit_write(const
> > brw::fs_builder &bld,
> >                                          uint32_t components);
> >  fs_reg setup_imm_df(const brw::fs_builder &bld,
> >                      double v);
> > +
> > +enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode,
> > +                                               nir_intrinsic_op op);
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > index 6265dc6..610c151 100644
> > --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > @@ -36,7 +36,6 @@ fs_visitor::emit_nir_code()
> >     /* emit the arrays used for inputs and outputs - load/store intrinsics
> > will
> >      * be converted to reads/writes of these arrays
> >      */
> > -   nir_setup_inputs();
> >     nir_setup_outputs();
> >     nir_setup_uniforms();
> >     nir_emit_system_values();
> > @@ -50,38 +49,6 @@ fs_visitor::emit_nir_code()
> >  }
> >
> >  void
> > -fs_visitor::nir_setup_inputs()
> > -{
> > -   if (stage != MESA_SHADER_FRAGMENT)
> > -      return;
> > -
> > -   nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
> > -
> > -   nir_foreach_variable(var, &nir->inputs) {
> > -      fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
> > -
> > -      fs_reg reg;
> > -      if (var->data.location == VARYING_SLOT_POS) {
> > -         emit_fragcoord_interpolation(input);
> > -      } else if (var->data.location == VARYING_SLOT_LAYER) {
> > -         struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER,
> > 1), 3);
> > -         reg.type = BRW_REGISTER_TYPE_D;
> > -         bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D),
> > reg);
> > -      } else if (var->data.location == VARYING_SLOT_VIEWPORT) {
> > -         struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT,
> > 2), 3);
> > -         reg.type = BRW_REGISTER_TYPE_D;
> > -         bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D),
> > reg);
> > -      } else {
> > -         int location = var->data.location;
> > -         emit_general_interpolation(&input, var->name, var->type,
> > -                                    (glsl_interp_mode)
> > var->data.interpolation,
> > -                                    &location, var->data.centroid,
> > -                                    var->data.sample);
> > -      }
> > -   }
> > -}
> > -
> > -void
> >  fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
> >                                              const glsl_type *type,
> >                                              unsigned *location)
> > @@ -3063,7 +3030,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder
> > &bld,
> >                                    nir_intrinsic_instr *instr)
> >  {
> >     assert(stage == MESA_SHADER_FRAGMENT);
> > -   const struct brw_wm_prog_key *wm_key = (const struct brw_wm_prog_key
> > *) key;
> >
> >     fs_reg dest;
> >     if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> > @@ -3120,189 +3086,245 @@ fs_visitor::nir_emit_fs_intrinsic(const
> > fs_builder &bld,
> >        break;
> >     }
> >
> > -   case nir_intrinsic_interp_var_at_centroid:
> > -   case nir_intrinsic_interp_var_at_sample:
> > -   case nir_intrinsic_interp_var_at_offset: {
> > -      /* Handle ARB_gpu_shader5 interpolation intrinsics
> > -       *
> > -       * It's worth a quick word of explanation as to why we handle the
> > full
> > -       * variable-based interpolation intrinsic rather than a lowered
> > version
> > -       * with like we do for other inputs.  We have to do that because
> > the way
> > -       * we set up inputs doesn't allow us to use the already setup
> > inputs for
> > -       * interpolation.  At the beginning of the shader, we go through
> > all of
> > -       * the input variables and do the initial interpolation and put it
> > in
> > -       * the nir_inputs array based on its location as determined in
> > -       * nir_lower_io.  If the input isn't used, dead code cleans up and
> > -       * everything works fine.  However, when we get to the
> > ARB_gpu_shader5
> > -       * interpolation intrinsics, we need to reinterpolate the input
> > -       * differently.  If we used an intrinsic that just had an index it
> > would
> > -       * only give us the offset into the nir_inputs array.  However,
> > this is
> > -       * useless because that value is post-interpolation and we need
> > -       * pre-interpolation.  In order to get the actual location of the
> > bits
> > -       * we get from the vertex fetching hardware, we need the variable.
> > -       */
> > -      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> > -      const glsl_interp_mode interpolation =
> > -         (glsl_interp_mode) instr->variables[0]->var->data.interpolation;
> > +   case nir_intrinsic_load_input: {
> > +      /* load_input is only used for flat inputs */
> > +      unsigned base = nir_intrinsic_base(instr);
> > +      unsigned component = nir_intrinsic_component(instr);
> > +      unsigned num_components = instr->num_components;
> > +      enum brw_reg_type type = dest.type;
> >
> > -      switch (instr->intrinsic) {
> > -      case nir_intrinsic_interp_var_at_centroid:
> > -         emit_pixel_interpolater_send(bld,
> > -                                      FS_OPCODE_INTERPOLATE_AT_CENTROID,
> > -                                      dst_xy,
> > -                                      fs_reg(), /* src */
> > -                                      brw_imm_ud(0u),
> > -                                      interpolation);
> > -         break;
> > +      /* Special case fields in the VUE header */
> > +      if (base == VARYING_SLOT_LAYER)
> > +         component = 1;
> > +      else if (base == VARYING_SLOT_VIEWPORT)
> > +         component = 2;
> >
> > -      case nir_intrinsic_interp_var_at_sample: {
> > -         if (!wm_key->multisample_fbo) {
> > -            /* From the ARB_gpu_shader5 specification:
> > -             * "If multisample buffers are not available, the input
> > varying
> > -             *  will be evaluated at the center of the pixel."
> > -             */
> > -            emit_pixel_interpolater_send(bld,
> > -
> >  FS_OPCODE_INTERPOLATE_AT_CENTROID,
> > -                                         dst_xy,
> > -                                         fs_reg(), /* src */
> > -                                         brw_imm_ud(0u),
> > -                                         interpolation);
> > -            break;
> > -         }
> > +      if (nir_dest_bit_size(instr->dest) == 64) {
> > +         /* const_index is in 32-bit type size units that could not be
> > aligned
> > +          * with DF. We need to read the double vector as if it was a
> > float
> > +          * vector of twice the number of components to fetch the right
> > data.
> > +          */
> > +         type = BRW_REGISTER_TYPE_F;
> > +         num_components *= 2;
> > +      }
> >
> > -         nir_const_value *const_sample =
> > nir_src_as_const_value(instr->src[0]);
> > +      for (unsigned int i = 0; i < num_components; i++) {
> > +         struct brw_reg interp = interp_reg(base, component + i);
> > +         interp = suboffset(interp, 3);
> > +         bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
> > +                  retype(fs_reg(interp), type));
> > +      }
> >
> > -         if (const_sample) {
> > -            unsigned msg_data = const_sample->i32[0] << 4;
> > +      if (nir_dest_bit_size(instr->dest) == 64) {
> > +         shuffle_32bit_load_result_to_64bit_data(bld,
> > +                                                 dest,
> > +                                                 retype(dest, type),
> > +                                                 instr->num_components);
> > +      }
> > +      break;
> > +   }
> > +
> > +   case nir_intrinsic_load_barycentric_pixel:
> > +   case nir_intrinsic_load_barycentric_centroid:
> > +   case nir_intrinsic_load_barycentric_sample:
> > +      /* Do nothing - load_interpolated_input handling will handle it
> > later. */
> > +      break;
> >
> 
> I'm very courious why you made this choice.  It seems odd to me.

I originally tried emitting MOVs here.  The data layout is a bit funky:

    R3: X coordinates for Slots 0-7
    R4: Y coordinates for Slots 0-7
    R5: X coordinates for Slots 8-15
    R5: Y coordinates for Slots 8-15

For SIMD8, you could do two separate MOVs (move X, then Y).  You might be
able to do a compressed mov(16)...but would need to whack the channel
enables into 1Q for both halves (if that's possible) or use NoMask
(which I wanted to avoid).

For SIMD16...you can't move both X's with a single MOV.  A <16,8,1>
region would cover it, but sources can't span more than two adjacent
registers.  So that doesn't work.  It might take four MOVs.  Maybe
only two if I could use the above trick.

Generating MOVs also means that we're relying on the optimizer to
remove them...which it didn't seem to be.  It ended up being much
easier to just point LINTERP at delta_xy[] directly.

> > +   case nir_intrinsic_load_barycentric_at_sample: {
> > +      const glsl_interp_mode interpolation =
> > +         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
> > +
> > +      nir_const_value *const_sample =
> > nir_src_as_const_value(instr->src[0]);
> > +
> > +      if (const_sample) {
> > +         unsigned msg_data = const_sample->i32[0] << 4;
> > +
> > +         emit_pixel_interpolater_send(bld,
> > +                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > +                                      dest,
> > +                                      fs_reg(), /* src */
> > +                                      brw_imm_ud(msg_data),
> > +                                      interpolation);
> > +      } else {
> > +         const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
> > +                                          BRW_REGISTER_TYPE_UD);
> > +
> > +         if (nir_src_is_dynamically_uniform(instr->src[0])) {
> > +            const fs_reg sample_id = bld.emit_uniformize(sample_src);
> > +            const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > +            bld.exec_all().group(1, 0)
> > +               .SHL(msg_data, sample_id, brw_imm_ud(4u));
> >              emit_pixel_interpolater_send(bld,
> >                                           FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > -                                         dst_xy,
> > +                                         dest,
> >                                           fs_reg(), /* src */
> > -                                         brw_imm_ud(msg_data),
> > +                                         msg_data,
> >                                           interpolation);
> >           } else {
> > -            const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
> > -                                             BRW_REGISTER_TYPE_UD);
> > -
> > -            if (nir_src_is_dynamically_uniform(instr->src[0])) {
> > -               const fs_reg sample_id = bld.emit_uniformize(sample_src);
> > -               const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > -               bld.exec_all().group(1, 0)
> > -                  .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > +            /* Make a loop that sends a message to the pixel interpolater
> > +             * for the sample number in each live channel. If there are
> > +             * multiple channels with the same sample number then these
> > +             * will be handled simultaneously with a single interation of
> > +             * the loop.
> > +             */
> > +            bld.emit(BRW_OPCODE_DO);
> > +
> > +            /* Get the next live sample number into sample_id_reg */
> > +            const fs_reg sample_id = bld.emit_uniformize(sample_src);
> > +
> > +            /* Set the flag register so that we can perform the send
> > +             * message on all channels that have the same sample number
> > +             */
> > +            bld.CMP(bld.null_reg_ud(),
> > +                    sample_src, sample_id,
> > +                    BRW_CONDITIONAL_EQ);
> > +            const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > +            bld.exec_all().group(1, 0)
> > +               .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > +            fs_inst *inst =
> >                 emit_pixel_interpolater_send(bld,
> >
> >  FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > -                                            dst_xy,
> > +                                            dest,
> >                                              fs_reg(), /* src */
> >                                              msg_data,
> >                                              interpolation);
> > -            } else {
> > -               /* Make a loop that sends a message to the pixel
> > interpolater
> > -                * for the sample number in each live channel. If there are
> > -                * multiple channels with the same sample number then these
> > -                * will be handled simultaneously with a single interation
> > of
> > -                * the loop.
> > -                */
> > -               bld.emit(BRW_OPCODE_DO);
> > -
> > -               /* Get the next live sample number into sample_id_reg */
> > -               const fs_reg sample_id = bld.emit_uniformize(sample_src);
> > +            set_predicate(BRW_PREDICATE_NORMAL, inst);
> >
> > -               /* Set the flag register so that we can perform the send
> > -                * message on all channels that have the same sample number
> > -                */
> > -               bld.CMP(bld.null_reg_ud(),
> > -                       sample_src, sample_id,
> > -                       BRW_CONDITIONAL_EQ);
> > -               const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > -               bld.exec_all().group(1, 0)
> > -                  .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > -               fs_inst *inst =
> > -                  emit_pixel_interpolater_send(bld,
> > -
> >  FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > -                                               dst_xy,
> > -                                               fs_reg(), /* src */
> > -                                               msg_data,
> > -                                               interpolation);
> > -               set_predicate(BRW_PREDICATE_NORMAL, inst);
> > -
> > -               /* Continue the loop if there are any live channels left */
> > -               set_predicate_inv(BRW_PREDICATE_NORMAL,
> > -                                 true, /* inverse */
> > -                                 bld.emit(BRW_OPCODE_WHILE));
> > -            }
> > +            /* Continue the loop if there are any live channels left */
> > +            set_predicate_inv(BRW_PREDICATE_NORMAL,
> > +                              true, /* inverse */
> > +                              bld.emit(BRW_OPCODE_WHILE));
> >           }
> > -
> > -         break;
> >        }
> > +      break;
> > +   }
> >
> > -      case nir_intrinsic_interp_var_at_offset: {
> > -         nir_const_value *const_offset =
> > nir_src_as_const_value(instr->src[0]);
> > +   case nir_intrinsic_load_barycentric_at_offset: {
> > +      const glsl_interp_mode interpolation =
> > +         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
> >
> > -         if (const_offset) {
> > -            unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) &
> > 0xf;
> > -            unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) &
> > 0xf;
> > +      nir_const_value *const_offset =
> > nir_src_as_const_value(instr->src[0]);
> >
> > -            emit_pixel_interpolater_send(bld,
> > -
> >  FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> > -                                         dst_xy,
> > -                                         fs_reg(), /* src */
> > -                                         brw_imm_ud(off_x | (off_y << 4)),
> > -                                         interpolation);
> > -         } else {
> > -            fs_reg src = vgrf(glsl_type::ivec2_type);
> > -            fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> > -                                       BRW_REGISTER_TYPE_F);
> > -            for (int i = 0; i < 2; i++) {
> > -               fs_reg temp = vgrf(glsl_type::float_type);
> > -               bld.MUL(temp, offset(offset_src, bld, i),
> > brw_imm_f(16.0f));
> > -               fs_reg itemp = vgrf(glsl_type::int_type);
> > -               /* float to int */
> > -               bld.MOV(itemp, temp);
> > -
> > -               /* Clamp the upper end of the range to +7/16.
> > -                * ARB_gpu_shader5 requires that we support a maximum
> > offset
> > -                * of +0.5, which isn't representable in a S0.4 value -- if
> > -                * we didn't clamp it, we'd end up with -8/16, which is the
> > -                * opposite of what the shader author wanted.
> > -                *
> > -                * This is legal due to ARB_gpu_shader5's quantization
> > -                * rules:
> > -                *
> > -                * "Not all values of <offset> may be supported; x and y
> > -                * offsets may be rounded to fixed-point values with the
> > -                * number of fraction bits given by the
> > -                * implementation-dependent constant
> > -                * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> > -                */
> > -               set_condmod(BRW_CONDITIONAL_L,
> > -                           bld.SEL(offset(src, bld, i), itemp,
> > brw_imm_d(7)));
> > -            }
> > +      if (const_offset) {
> > +         unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
> > +         unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
> >
> > -            const enum opcode opcode =
> > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> > -            emit_pixel_interpolater_send(bld,
> > -                                         opcode,
> > -                                         dst_xy,
> > -                                         src,
> > -                                         brw_imm_ud(0u),
> > -                                         interpolation);
> > +         emit_pixel_interpolater_send(bld,
> > +
> > FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> > +                                      dest,
> > +                                      fs_reg(), /* src */
> > +                                      brw_imm_ud(off_x | (off_y << 4)),
> > +                                      interpolation);
> > +      } else {
> > +         fs_reg src = vgrf(glsl_type::ivec2_type);
> > +         fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> > +                                    BRW_REGISTER_TYPE_F);
> > +         for (int i = 0; i < 2; i++) {
> > +            fs_reg temp = vgrf(glsl_type::float_type);
> > +            bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
> > +            fs_reg itemp = vgrf(glsl_type::int_type);
> > +            /* float to int */
> > +            bld.MOV(itemp, temp);
> > +
> > +            /* Clamp the upper end of the range to +7/16.
> > +             * ARB_gpu_shader5 requires that we support a maximum offset
> > +             * of +0.5, which isn't representable in a S0.4 value -- if
> > +             * we didn't clamp it, we'd end up with -8/16, which is the
> > +             * opposite of what the shader author wanted.
> > +             *
> > +             * This is legal due to ARB_gpu_shader5's quantization
> > +             * rules:
> > +             *
> > +             * "Not all values of <offset> may be supported; x and y
> > +             * offsets may be rounded to fixed-point values with the
> > +             * number of fraction bits given by the
> > +             * implementation-dependent constant
> > +             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> > +             */
> > +            set_condmod(BRW_CONDITIONAL_L,
> > +                        bld.SEL(offset(src, bld, i), itemp,
> > brw_imm_d(7)));
> >           }
> > +
> > +         const enum opcode opcode =
> > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> > +         emit_pixel_interpolater_send(bld,
> > +                                      opcode,
> > +                                      dest,
> > +                                      src,
> > +                                      brw_imm_ud(0u),
> > +                                      interpolation);
> > +      }
> > +      break;
> > +   }
> > +
> > +   case nir_intrinsic_load_interpolated_input: {
> > +      if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
> > +         emit_fragcoord_interpolation(dest);
> >           break;
> >        }
> >
> > -      default:
> > -         unreachable("Invalid intrinsic");
> > +      assert(instr->src[0].ssa &&
> > +             instr->src[0].ssa->parent_instr->type ==
> > nir_instr_type_intrinsic);
> > +      nir_intrinsic_instr *bary_intrinsic =
> > +         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
> > +      nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
> > +      enum glsl_interp_mode interp_mode =
> > +         (enum glsl_interp_mode)
> > nir_intrinsic_interp_mode(bary_intrinsic);
> > +      fs_reg dst_xy;
> > +
> > +      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
> > +          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
> > +         /* Use the result of the PI message */
> > +         dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
> > +      } else {
> > +         /* Use the delta_xy values computed from the payload */
> > +         enum brw_barycentric_mode bary =
> > +            brw_barycentric_mode(interp_mode, bary_intrin);
> > +
> > +         dst_xy = this->delta_xy[bary];
> >        }
> >
> > -      for (unsigned j = 0; j < instr->num_components; j++) {
> > -         fs_reg src = interp_reg(instr->variables[0]->var->data.location,
> > j);
> > -         src.type = dest.type;
> > +      for (unsigned int i = 0; i < instr->num_components; i++) {
> > +         fs_reg interp =
> > +            fs_reg(interp_reg(nir_intrinsic_base(instr),
> > +                              nir_intrinsic_component(instr) + i));
> > +         interp.type = BRW_REGISTER_TYPE_F;
> > +         dest.type = BRW_REGISTER_TYPE_F;
> >
> > -         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
> > -         dest = offset(dest, bld, 1);
> > +         if (devinfo->needs_unlit_centroid_workaround &&
> > +             bary_intrin == nir_intrinsic_load_barycentric_centroid) {
> > +
> > +            /* Get the pixel/sample mask into f0 so that we know which
> > +             * pixels are lit.  Then, for each channel that is unlit,
> > +             * replace the centroid data with non-centroid data.
> > +             */
> > +            bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> > +
> > +            fs_reg dest_i = offset(dest, bld, i);
> > +            fs_reg dst_xy_pixel =
> > +               delta_xy[brw_barycentric_mode(interp_mode,
> > +                  nir_intrinsic_load_barycentric_pixel)];
> > +
> > +            fs_inst *inst;
> > +            inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy_pixel,
> > interp);
> > +            inst->predicate = BRW_PREDICATE_NORMAL;
> > +            inst->predicate_inverse = true;
> > +            inst->no_dd_clear = true;
> > +
> > +            inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy, interp);
> > +            inst->predicate = BRW_PREDICATE_NORMAL;
> > +            inst->predicate_inverse = false;
> > +            inst->no_dd_check = true;
> > +         } else if (devinfo->gen < 6 && interp_mode ==
> > INTERP_MODE_SMOOTH) {
> > +            fs_reg tmp = vgrf(glsl_type::float_type);
> > +            bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
> > +            bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
> > +         } else {
> > +            bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy,
> > interp);
> > +         }
> >
> 
> Ugh... I think I see why you're handling the barycentric load here now.
> Eventually, I think we could probably be doing this better but I'm not
> seeing an easy path at the moment.

Yeah, perhaps.  I don't think it's that bad as is, though.

> >        }
> >        break;
> >     }
> > +
> >     default:
> >        nir_emit_intrinsic(bld, instr);
> >        break;
> > @@ -3869,26 +3891,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder
> > &bld, nir_intrinsic_instr *instr
> >     }
> >
> >     case nir_intrinsic_load_input: {
> > -      fs_reg src;
> > +      fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
> >        unsigned num_components = instr->num_components;
> >        enum brw_reg_type type = dest.type;
> >
> > -      if (stage == MESA_SHADER_VERTEX) {
> > -         src = fs_reg(ATTR, instr->const_index[0], dest.type);
> > -      } else {
> > -         assert(type_sz(type) >= 4);
> > -         if (type == BRW_REGISTER_TYPE_DF) {
> > -            /* const_index is in 32-bit type size units that could not be
> > aligned
> > -             * with DF. We need to read the double vector as if it was a
> > float
> > -             * vector of twice the number of components to fetch the
> > right data.
> > -             */
> > -            dest = retype(dest, BRW_REGISTER_TYPE_F);
> > -            num_components *= 2;
> > -         }
> > -         src = offset(retype(nir_inputs, dest.type), bld,
> > -                      instr->const_index[0]);
> > -      }
> >
> 
> This hunk seems odd... It doesn't look like it really has anything to do
> with FS.  Is that else clause really FS-only or does it apply to GS and
> tess stages?

Yep, this is FS specific.  TCS/TES/GS input intrinsics are all handled
by nir_emit_{tcs,tes,gs}_intrinsic.  This code in the generic intrinsic
handler is only for VS and FS.  And the VS code obviously didn't include
this hunk.

Sorry this was immensely non-obvious.  This raises a good point, though:
after this patch, I can move the remaining code to nir_emit_vs_intrinsic
so it's clear it only applies to the VS.

> > -
> >        nir_const_value *const_offset =
> > nir_src_as_const_value(instr->src[0]);
> >        assert(const_offset && "Indirect input loads not allowed");
> >        src = offset(src, bld, const_offset->u32[0]);
> > diff --git a/src/mesa/drivers/dri/i965/brw_nir.c
> > b/src/mesa/drivers/dri/i965/brw_nir.c
> > index caf9fe0..d1a823a 100644
> > --- a/src/mesa/drivers/dri/i965/brw_nir.c
> > +++ b/src/mesa/drivers/dri/i965/brw_nir.c
> > @@ -30,7 +30,8 @@ static bool
> >  is_input(nir_intrinsic_instr *intrin)
> >  {
> >     return intrin->intrinsic == nir_intrinsic_load_input ||
> > -          intrin->intrinsic == nir_intrinsic_load_per_vertex_input;
> > +          intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
> > +          intrin->intrinsic == nir_intrinsic_load_interpolated_input;
> >  }
> >
> >  static bool
> > @@ -282,9 +283,16 @@ brw_nir_lower_tes_inputs(nir_shader *nir, const
> > struct brw_vue_map *vue_map)
> >  void
> >  brw_nir_lower_fs_inputs(nir_shader *nir)
> >  {
> > -   nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
> > VARYING_SLOT_VAR0,
> > -                            type_size_scalar);
> > -   nir_lower_io(nir, nir_var_shader_in, type_size_scalar, false);
> > +   foreach_list_typed(nir_variable, var, node, &nir->inputs) {
> > +      var->data.driver_location = var->data.location;
> > +   }
> > +
> > +   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, true);
> > +
> > +   /* This pass needs actual constants */
> > +   nir_opt_constant_folding(nir);
> > +
> > +   add_const_offset_to_base(nir, nir_var_shader_in);
> >  }
> >
> >  void
> > --
> > 2.9.0
> >
> > _______________________________________________
> > mesa-dev mailing list
> > mesa-dev at lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> >
> 

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: This is a digitally signed message part.
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20160719/10f4de09/attachment-0001.sig>