<div dir="ltr"><br><div class="gmail_extra"><br><div class="gmail_quote">On Tue, Jul 19, 2016 at 5:02 PM, Kenneth Graunke <span dir="ltr"><<a href="mailto:kenneth@whitecape.org" target="_blank">kenneth@whitecape.org</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5">On Tuesday, July 19, 2016 1:39:23 PM PDT Jason Ekstrand wrote:<br>
> On Mon, Jul 18, 2016 at 1:26 PM, Kenneth Graunke <<a href="mailto:kenneth@whitecape.org">kenneth@whitecape.org</a>><br>
> wrote:<br>
><br>
> > This eliminates the need to walk the list of input variables, recurse<br>
> > into their types (via logic largely redundant with nir_lower_io), and<br>
> > interpolate all possible inputs up front.  The backend no longer has<br>
> > to care about variables at all, which eliminates complications from<br>
> > trying to pack multiple variables into the same location.  Instead,<br>
> > each intrinsic specifies exactly what's needed.<br>
> ><br>
> > This should unblock Timothy's work on GL_ARB_enhanced_layouts.<br>
> ><br>
> > Each load_interpolated_input intrinsic corresponds to PLN instructions,<br>
> > while load_barycentric_at_* intrinsics correspond to pixel interpolator<br>
> > messages.  The pixel/centroid/sample barycentric intrinsics simply refer<br>
> > to payload fields (delta_xy[]), and don't actually generate any code.<br>
> ><br>
> > Because we use a single intrinsic for both centroid-qualified variables<br>
> > and interpolateAtCentroid(), they become indistinguishable.  We stop<br>
> > sending pixel interpolator messages for those, and instead use the<br>
> > payload provided data, which should be considerably faster.<br>
> ><br>
> > On Broadwell:<br>
> ><br>
> > total instructions in shared programs: 9067751 -> 9067570 (-0.00%)<br>
> > instructions in affected programs: 145902 -> 145721 (-0.12%)<br>
> > helped: 422<br>
> > HURT: 209<br>
> ><br>
> > total spills in shared programs: 2849 -> 2899 (1.76%)<br>
> > spills in affected programs: 760 -> 810 (6.58%)<br>
> > helped: 0<br>
> > HURT: 10<br>
> ><br>
> > total fills in shared programs: 3910 -> 3950 (1.02%)<br>
> > fills in affected programs: 617 -> 657 (6.48%)<br>
> > helped: 0<br>
> > HURT: 10<br>
> ><br>
> > LOST:   3<br>
> > GAINED: 3<br>
> ><br>
> > The differences mostly appear to be slight changes in MOVs.<br>
> ><br>
> > Signed-off-by: Kenneth Graunke <<a href="mailto:kenneth@whitecape.org">kenneth@whitecape.org</a>><br>
> > ---<br>
> >  src/mesa/drivers/dri/i965/brw_fs.cpp     | 175 ++++---------<br>
> >  src/mesa/drivers/dri/i965/brw_fs.h       |   9 +-<br>
> >  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 410<br>
> > ++++++++++++++++---------------<br>
> >  src/mesa/drivers/dri/i965/brw_nir.c      |  16 +-<br>
> >  4 files changed, 269 insertions(+), 341 deletions(-)<br>
> ><br>
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
> > b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
> > index 94127bc..06007fe 100644<br>
> > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
> > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
> > @@ -1067,21 +1067,27 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg<br>
> > wpos)<br>
> >     bld.MOV(wpos, this->wpos_w);<br>
> >  }<br>
> ><br>
> > -static enum brw_barycentric_mode<br>
> > -barycentric_mode(enum glsl_interp_mode mode,<br>
> > -                 bool is_centroid, bool is_sample)<br>
> > +enum brw_barycentric_mode<br>
> > +brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)<br>
> >  {<br>
> > -   unsigned bary;<br>
> > -<br>
> >     /* Barycentric modes don't make sense for flat inputs. */<br>
> >     assert(mode != INTERP_MODE_FLAT);<br>
> ><br>
> > -   if (is_sample) {<br>
> > -      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;<br>
> > -   } else if (is_centroid) {<br>
> > -      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;<br>
> > -   } else {<br>
> > +   unsigned bary;<br>
> > +   switch (op) {<br>
> > +   case nir_intrinsic_load_barycentric_pixel:<br>
> > +   case nir_intrinsic_load_barycentric_at_offset:<br>
> >        bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;<br>
> > +      break;<br>
> > +   case nir_intrinsic_load_barycentric_centroid:<br>
> > +      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;<br>
> > +      break;<br>
> > +   case nir_intrinsic_load_barycentric_sample:<br>
> > +   case nir_intrinsic_load_barycentric_at_sample:<br>
> > +      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;<br>
> > +      break;<br>
> > +   default:<br>
> > +      assert(!"invalid intrinsic");<br>
> >     }<br>
> ><br>
> >     if (mode == INTERP_MODE_NOPERSPECTIVE)<br>
> > @@ -1101,107 +1107,6 @@ centroid_to_pixel(enum brw_barycentric_mode bary)<br>
> >     return (enum brw_barycentric_mode) ((unsigned) bary - 1);<br>
> >  }<br>
> ><br>
> > -void<br>
> > -fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,<br>
> > -                                       const glsl_type *type,<br>
> > -                                       glsl_interp_mode<br>
> > interpolation_mode,<br>
> > -                                       int *location, bool mod_centroid,<br>
> > -                                       bool mod_sample)<br>
> > -{<br>
> > -   assert(stage == MESA_SHADER_FRAGMENT);<br>
> > -   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;<br>
> > -<br>
> > -   if (type->is_array() || type->is_matrix()) {<br>
> > -      const glsl_type *elem_type = glsl_get_array_element(type);<br>
> > -      const unsigned length = glsl_get_length(type);<br>
> > -<br>
> > -      for (unsigned i = 0; i < length; i++) {<br>
> > -         emit_general_interpolation(attr, name, elem_type,<br>
> > interpolation_mode,<br>
> > -                                    location, mod_centroid, mod_sample);<br>
> > -      }<br>
> > -   } else if (type->is_record()) {<br>
> > -      for (unsigned i = 0; i < type->length; i++) {<br>
> > -         const glsl_type *field_type = type->fields.structure[i].type;<br>
> > -         emit_general_interpolation(attr, name, field_type,<br>
> > interpolation_mode,<br>
> > -                                    location, mod_centroid, mod_sample);<br>
> > -      }<br>
> > -   } else {<br>
> > -      assert(type->is_scalar() || type->is_vector());<br>
> > -<br>
> > -      if (prog_data->urb_setup[*location] == -1) {<br>
> > -         /* If there's no incoming setup data for this slot, don't<br>
> > -          * emit interpolation for it.<br>
> > -          */<br>
> > -         *attr = offset(*attr, bld, type->vector_elements);<br>
> > -         (*location)++;<br>
> > -         return;<br>
> > -      }<br>
> > -<br>
> > -      attr->type = brw_type_for_base_type(type->get_scalar_type());<br>
> > -<br>
> > -      if (interpolation_mode == INTERP_MODE_FLAT) {<br>
> > -         /* Constant interpolation (flat shading) case. The SF has<br>
> > -          * handed us defined values in only the constant offset<br>
> > -          * field of the setup reg.<br>
> > -          */<br>
> > -         unsigned vector_elements = type->vector_elements;<br>
> > -<br>
> > -         /* Data starts at suboffet 3 in 32-bit units (12 bytes), so it<br>
> > is not<br>
> > -          * 64-bit aligned and the current implementation fails to read<br>
> > the<br>
> > -          * data properly. Instead, when there is a double input varying,<br>
> > -          * read it as vector of floats with twice the number of<br>
> > components.<br>
> > -          */<br>
> > -         if (attr->type == BRW_REGISTER_TYPE_DF) {<br>
> > -            vector_elements *= 2;<br>
> > -            attr->type = BRW_REGISTER_TYPE_F;<br>
> > -         }<br>
> > -         for (unsigned int i = 0; i < vector_elements; i++) {<br>
> > -            struct brw_reg interp = interp_reg(*location, i);<br>
> > -            interp = suboffset(interp, 3);<br>
> > -            interp.type = attr->type;<br>
> > -            bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));<br>
> > -            *attr = offset(*attr, bld, 1);<br>
> > -         }<br>
> > -      } else {<br>
> > -         /* Smooth/noperspective interpolation case. */<br>
> > -         enum brw_barycentric_mode bary =<br>
> > -            barycentric_mode(interpolation_mode, mod_centroid,<br>
> > mod_sample);<br>
> > -<br>
> > -         for (unsigned int i = 0; i < type->vector_elements; i++) {<br>
> > -            fs_reg interp(interp_reg(*location, i));<br>
> > -            if (devinfo->needs_unlit_centroid_workaround && mod_centroid)<br>
> > {<br>
> > -               /* Get the pixel/sample mask into f0 so that we know<br>
> > -                * which pixels are lit.  Then, for each channel that is<br>
> > -                * unlit, replace the centroid data with non-centroid<br>
> > -                * data.<br>
> > -                */<br>
> > -               bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);<br>
> > -<br>
> > -               fs_inst *inst;<br>
> > -               inst = bld.emit(FS_OPCODE_LINTERP, *attr,<br>
> > -                               delta_xy[centroid_to_pixel(bary)], interp);<br>
> > -               inst->predicate = BRW_PREDICATE_NORMAL;<br>
> > -               inst->predicate_inverse = true;<br>
> > -               inst->no_dd_clear = true;<br>
> > -<br>
> > -               inst = bld.emit(FS_OPCODE_LINTERP, *attr,<br>
> > -                               delta_xy[bary], interp);<br>
> > -               inst->predicate = BRW_PREDICATE_NORMAL;<br>
> > -               inst->predicate_inverse = false;<br>
> > -               inst->no_dd_check = true;<br>
> > -            } else {<br>
> > -               bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary], interp);<br>
> > -            }<br>
> > -            if (devinfo->gen < 6 && interpolation_mode ==<br>
> > INTERP_MODE_SMOOTH) {<br>
> > -               bld.MUL(*attr, *attr, this->pixel_w);<br>
> > -            }<br>
> > -            *attr = offset(*attr, bld, 1);<br>
> > -         }<br>
> > -      }<br>
> > -      (*location)++;<br>
> > -   }<br>
> > -}<br>
> > -<br>
> >  fs_reg *<br>
> >  fs_visitor::emit_frontfacing_interpolation()<br>
> >  {<br>
> > @@ -6327,6 +6232,10 @@ fs_visitor::run_cs()<br>
> >  /**<br>
> >   * Return a bitfield where bit n is set if barycentric interpolation mode<br>
> > n<br>
> >   * (see enum brw_barycentric_mode) is needed by the fragment shader.<br>
> > + *<br>
> > + * We examine the load_barycentric intrinsics rather than looking at input<br>
> > + * variables so that we catch interpolateAtCentroid() messages too, which<br>
> > + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.<br>
> >   */<br>
> >  static unsigned<br>
> >  brw_compute_barycentric_interp_modes(const struct brw_device_info<br>
> > *devinfo,<br>
> > @@ -6334,29 +6243,37 @@ brw_compute_barycentric_interp_modes(const struct<br>
> > brw_device_info *devinfo,<br>
> >  {<br>
> >     unsigned barycentric_interp_modes = 0;<br>
> ><br>
> > -   nir_foreach_variable(var, &shader->inputs) {<br>
> > -      /* Ignore WPOS; it doesn't require interpolation. */<br>
> > -      if (var->data.location == VARYING_SLOT_POS)<br>
> > +   nir_foreach_function(f, shader) {<br>
> > +      if (!f->impl)<br>
> >           continue;<br>
> ><br>
> > -      /* Flat inputs don't need barycentric modes. */<br>
> > -      if (var->data.interpolation == INTERP_MODE_FLAT)<br>
> > -         continue;<br>
> > +      nir_foreach_block(block, f->impl) {<br>
> > +         nir_foreach_instr(instr, block) {<br>
> > +            if (instr->type != nir_instr_type_intrinsic)<br>
> > +               continue;<br>
> ><br>
> > -      /* Determine the set (or sets) of barycentric coordinates needed to<br>
> > -       * interpolate this variable.  Note that when<br>
> > -       * brw->needs_unlit_centroid_workaround is set, centroid<br>
> > interpolation<br>
> > -       * uses PIXEL interpolation for unlit pixels and CENTROID<br>
> > interpolation<br>
> > -       * for lit pixels, so we need both sets of barycentric coordinates.<br>
> > -       */<br>
> > -      enum brw_barycentric_mode bary_mode =<br>
> > -         barycentric_mode((glsl_interp_mode) var->data.interpolation,<br>
> > -                          var->data.centroid, var->data.sample);<br>
> > +            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);<br>
> > +            if (intrin->intrinsic !=<br>
> > nir_intrinsic_load_interpolated_input)<br>
> > +               continue;<br>
> ><br>
><br>
> Any particular reason why you're looking at the source of the load rather<br>
> than just looking for the load_barycentric intrinsic directly?<br>
><br>
><br>
> > +<br>
> > +            /* Ignore WPOS; it doesn't require interpolation. */<br>
> > +            if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)<br>
> > +               continue;<br>
> ><br>
><br>
> Ugh...<br>
<br>
</div></div>This is why.  gl_FragCoord still generates a load_interpolated_input<br>
with a load_barycentric_pixel, and I want to skip that.  (It might be<br>
the only user of the pixel barycentric coordinates, and if so, we don't want to set them up.)<br>
<br>
You need the load_interpolated_input intrinsic to know which input it is.<br>
<br>
It's pretty clear that gl_FragCoord doesn't behave anything like a<br>
normal input, so it ought to be a system value.  With that cleaned up,<br>
we could simplify this.<br>
<br>
That ended up being more work than I expected, so I was hoping to do<br>
it as a follow-on series, to unblock Tim sooner rather than later.<br></blockquote><div><br></div><div>I'm ok with the "unblock Tim ASAP plan".  I would like to see gl_FragCoord as a system value eventually though.  It just seems so much cleaner.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<div><div class="h5"><br>
> ><br>
> > -      barycentric_interp_modes |= 1 << bary_mode;<br>
> > +            intrin =<br>
> > nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);<br>
> > +            enum glsl_interp_mode interp = (enum glsl_interp_mode)<br>
> > +               nir_intrinsic_interp_mode(intrin);<br>
> > +            nir_intrinsic_op bary_op = intrin->intrinsic;<br>
> > +            enum brw_barycentric_mode bary =<br>
> > +               brw_barycentric_mode(interp, bary_op);<br>
> ><br>
> > -      if (var->data.centroid && devinfo->needs_unlit_centroid_workaround)<br>
> > -         barycentric_interp_modes |= 1 << centroid_to_pixel(bary_mode);<br>
> > +            barycentric_interp_modes |= 1 << bary;<br>
> > +<br>
> > +            if (devinfo->needs_unlit_centroid_workaround &&<br>
> > +                bary_op == nir_intrinsic_load_barycentric_centroid)<br>
> > +               barycentric_interp_modes |= 1 << centroid_to_pixel(bary);<br>
> > +         }<br>
> > +      }<br>
> >     }<br>
> ><br>
> >     return barycentric_interp_modes;<br>
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h<br>
> > b/src/mesa/drivers/dri/i965/brw_fs.h<br>
> > index 7998f51..574475f 100644<br>
> > --- a/src/mesa/drivers/dri/i965/brw_fs.h<br>
> > +++ b/src/mesa/drivers/dri/i965/brw_fs.h<br>
> > @@ -174,11 +174,6 @@ public:<br>
> >     fs_reg *emit_samplepos_setup();<br>
> >     fs_reg *emit_sampleid_setup();<br>
> >     fs_reg *emit_samplemaskin_setup();<br>
> > -   void emit_general_interpolation(fs_reg *attr, const char *name,<br>
> > -                                   const glsl_type *type,<br>
> > -                                   glsl_interp_mode interpolation_mode,<br>
> > -                                   int *location, bool mod_centroid,<br>
> > -                                   bool mod_sample);<br>
> >     fs_reg *emit_vs_system_value(int location);<br>
> >     void emit_interpolation_setup_gen4();<br>
> >     void emit_interpolation_setup_gen6();<br>
> > @@ -195,7 +190,6 @@ public:<br>
> >     bool opt_zero_samples();<br>
> ><br>
> >     void emit_nir_code();<br>
> > -   void nir_setup_inputs();<br>
> >     void nir_setup_single_output_varying(fs_reg *reg, const glsl_type<br>
> > *type,<br>
> >                                          unsigned *location);<br>
> >     void nir_setup_outputs();<br>
> > @@ -511,3 +505,6 @@ void shuffle_64bit_data_for_32bit_write(const<br>
> > brw::fs_builder &bld,<br>
> >                                          uint32_t components);<br>
> >  fs_reg setup_imm_df(const brw::fs_builder &bld,<br>
> >                      double v);<br>
> > +<br>
> > +enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode,<br>
> > +                                               nir_intrinsic_op op);<br>
> > diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp<br>
> > b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp<br>
> > index 6265dc6..610c151 100644<br>
> > --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp<br>
> > +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp<br>
> > @@ -36,7 +36,6 @@ fs_visitor::emit_nir_code()<br>
> >     /* emit the arrays used for inputs and outputs - load/store intrinsics<br>
> > will<br>
> >      * be converted to reads/writes of these arrays<br>
> >      */<br>
> > -   nir_setup_inputs();<br>
> >     nir_setup_outputs();<br>
> >     nir_setup_uniforms();<br>
> >     nir_emit_system_values();<br>
> > @@ -50,38 +49,6 @@ fs_visitor::emit_nir_code()<br>
> >  }<br>
> ><br>
> >  void<br>
> > -fs_visitor::nir_setup_inputs()<br>
> > -{<br>
> > -   if (stage != MESA_SHADER_FRAGMENT)<br>
> > -      return;<br>
> > -<br>
> > -   nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);<br>
> > -<br>
> > -   nir_foreach_variable(var, &nir->inputs) {<br>
> > -      fs_reg input = offset(nir_inputs, bld, var->data.driver_location);<br>
> > -<br>
> > -      fs_reg reg;<br>
> > -      if (var->data.location == VARYING_SLOT_POS) {<br>
> > -         emit_fragcoord_interpolation(input);<br>
> > -      } else if (var->data.location == VARYING_SLOT_LAYER) {<br>
> > -         struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER,<br>
> > 1), 3);<br>
> > -         reg.type = BRW_REGISTER_TYPE_D;<br>
> > -         bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D),<br>
> > reg);<br>
> > -      } else if (var->data.location == VARYING_SLOT_VIEWPORT) {<br>
> > -         struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT,<br>
> > 2), 3);<br>
> > -         reg.type = BRW_REGISTER_TYPE_D;<br>
> > -         bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D),<br>
> > reg);<br>
> > -      } else {<br>
> > -         int location = var->data.location;<br>
> > -         emit_general_interpolation(&input, var->name, var->type,<br>
> > -                                    (glsl_interp_mode)<br>
> > var->data.interpolation,<br>
> > -                                    &location, var->data.centroid,<br>
> > -                                    var->data.sample);<br>
> > -      }<br>
> > -   }<br>
> > -}<br>
> > -<br>
> > -void<br>
> >  fs_visitor::nir_setup_single_output_varying(fs_reg *reg,<br>
> >                                              const glsl_type *type,<br>
> >                                              unsigned *location)<br>
> > @@ -3063,7 +3030,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder<br>
> > &bld,<br>
> >                                    nir_intrinsic_instr *instr)<br>
> >  {<br>
> >     assert(stage == MESA_SHADER_FRAGMENT);<br>
> > -   const struct brw_wm_prog_key *wm_key = (const struct brw_wm_prog_key<br>
> > *) key;<br>
> ><br>
> >     fs_reg dest;<br>
> >     if (nir_intrinsic_infos[instr->intrinsic].has_dest)<br>
> > @@ -3120,189 +3086,245 @@ fs_visitor::nir_emit_fs_intrinsic(const<br>
> > fs_builder &bld,<br>
> >        break;<br>
> >     }<br>
> ><br>
> > -   case nir_intrinsic_interp_var_at_centroid:<br>
> > -   case nir_intrinsic_interp_var_at_sample:<br>
> > -   case nir_intrinsic_interp_var_at_offset: {<br>
> > -      /* Handle ARB_gpu_shader5 interpolation intrinsics<br>
> > -       *<br>
> > -       * It's worth a quick word of explanation as to why we handle the<br>
> > full<br>
> > -       * variable-based interpolation intrinsic rather than a lowered<br>
> > version<br>
> > -       * with like we do for other inputs.  We have to do that because<br>
> > the way<br>
> > -       * we set up inputs doesn't allow us to use the already setup<br>
> > inputs for<br>
> > -       * interpolation.  At the beginning of the shader, we go through<br>
> > all of<br>
> > -       * the input variables and do the initial interpolation and put it<br>
> > in<br>
> > -       * the nir_inputs array based on its location as determined in<br>
> > -       * nir_lower_io.  If the input isn't used, dead code cleans up and<br>
> > -       * everything works fine.  However, when we get to the<br>
> > ARB_gpu_shader5<br>
> > -       * interpolation intrinsics, we need to reinterpolate the input<br>
> > -       * differently.  If we used an intrinsic that just had an index it<br>
> > would<br>
> > -       * only give us the offset into the nir_inputs array.  However,<br>
> > this is<br>
> > -       * useless because that value is post-interpolation and we need<br>
> > -       * pre-interpolation.  In order to get the actual location of the<br>
> > bits<br>
> > -       * we get from the vertex fetching hardware, we need the variable.<br>
> > -       */<br>
> > -      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);<br>
> > -      const glsl_interp_mode interpolation =<br>
> > -         (glsl_interp_mode) instr->variables[0]->var->data.interpolation;<br>
> > +   case nir_intrinsic_load_input: {<br>
> > +      /* load_input is only used for flat inputs */<br>
> > +      unsigned base = nir_intrinsic_base(instr);<br>
> > +      unsigned component = nir_intrinsic_component(instr);<br>
> > +      unsigned num_components = instr->num_components;<br>
> > +      enum brw_reg_type type = dest.type;<br>
> ><br>
> > -      switch (instr->intrinsic) {<br>
> > -      case nir_intrinsic_interp_var_at_centroid:<br>
> > -         emit_pixel_interpolater_send(bld,<br>
> > -                                      FS_OPCODE_INTERPOLATE_AT_CENTROID,<br>
> > -                                      dst_xy,<br>
> > -                                      fs_reg(), /* src */<br>
> > -                                      brw_imm_ud(0u),<br>
> > -                                      interpolation);<br>
> > -         break;<br>
> > +      /* Special case fields in the VUE header */<br>
> > +      if (base == VARYING_SLOT_LAYER)<br>
> > +         component = 1;<br>
> > +      else if (base == VARYING_SLOT_VIEWPORT)<br>
> > +         component = 2;<br>
> ><br>
> > -      case nir_intrinsic_interp_var_at_sample: {<br>
> > -         if (!wm_key->multisample_fbo) {<br>
> > -            /* From the ARB_gpu_shader5 specification:<br>
> > -             * "If multisample buffers are not available, the input<br>
> > varying<br>
> > -             *  will be evaluated at the center of the pixel."<br>
> > -             */<br>
> > -            emit_pixel_interpolater_send(bld,<br>
> > -<br>
> >  FS_OPCODE_INTERPOLATE_AT_CENTROID,<br>
> > -                                         dst_xy,<br>
> > -                                         fs_reg(), /* src */<br>
> > -                                         brw_imm_ud(0u),<br>
> > -                                         interpolation);<br>
> > -            break;<br>
> > -         }<br>
> > +      if (nir_dest_bit_size(instr->dest) == 64) {<br>
> > +         /* const_index is in 32-bit type size units that could not be<br>
> > aligned<br>
> > +          * with DF. We need to read the double vector as if it was a<br>
> > float<br>
> > +          * vector of twice the number of components to fetch the right<br>
> > data.<br>
> > +          */<br>
> > +         type = BRW_REGISTER_TYPE_F;<br>
> > +         num_components *= 2;<br>
> > +      }<br>
> ><br>
> > -         nir_const_value *const_sample =<br>
> > nir_src_as_const_value(instr->src[0]);<br>
> > +      for (unsigned int i = 0; i < num_components; i++) {<br>
> > +         struct brw_reg interp = interp_reg(base, component + i);<br>
> > +         interp = suboffset(interp, 3);<br>
> > +         bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),<br>
> > +                  retype(fs_reg(interp), type));<br>
> > +      }<br>
> ><br>
> > -         if (const_sample) {<br>
> > -            unsigned msg_data = const_sample->i32[0] << 4;<br>
> > +      if (nir_dest_bit_size(instr->dest) == 64) {<br>
> > +         shuffle_32bit_load_result_to_64bit_data(bld,<br>
> > +                                                 dest,<br>
> > +                                                 retype(dest, type),<br>
> > +                                                 instr->num_components);<br>
> > +      }<br>
> > +      break;<br>
> > +   }<br>
> > +<br>
> > +   case nir_intrinsic_load_barycentric_pixel:<br>
> > +   case nir_intrinsic_load_barycentric_centroid:<br>
> > +   case nir_intrinsic_load_barycentric_sample:<br>
> > +      /* Do nothing - load_interpolated_input handling will handle it<br>
> > later. */<br>
> > +      break;<br>
> ><br>
><br>
> I'm very curious why you made this choice.  It seems odd to me.<br>
<br>
</div></div>I originally tried emitting MOVs here.  The data layout is a bit funky:<br>
<br>
    R3: X coordinates for Slots 0-7<br>
    R4: Y coordinates for Slots 0-7<br>
    R5: X coordinates for Slots 8-15<br>
    R6: Y coordinates for Slots 8-15<br>
<br>
For SIMD8, you could do two separate MOVs (move X, then Y).  You might be<br>
able to do a compressed mov(16)...but would need to whack the channel<br>
enables into 1Q for both halves (if that's possible) or use NoMask<br>
(which I wanted to avoid).<br>
<br>
For SIMD16...you can't move both X's with a single MOV.  A <16,8,1><br>
region would cover it, but sources can't span more than two adjacent<br>
registers.  So that doesn't work.  It might take four MOVs.  Maybe<br>
only two if I could use the above trick.<br>
<br>
Generating MOVs also means that we're relying on the optimizer to<br>
remove them...which it didn't seem to be.  It ended up being much<br>
easier to just point LINTERP at delta_xy[] directly.<br></blockquote><div><br></div><div>Right... I forgot about the odd interleaving.  That seems like a reasonable reason for doing it this way.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<div><div class="h5"><br>
> > +   case nir_intrinsic_load_barycentric_at_sample: {<br>
> > +      const glsl_interp_mode interpolation =<br>
> > +         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);<br>
> > +<br>
> > +      nir_const_value *const_sample =<br>
> > nir_src_as_const_value(instr->src[0]);<br>
> > +<br>
> > +      if (const_sample) {<br>
> > +         unsigned msg_data = const_sample->i32[0] << 4;<br>
> > +<br>
> > +         emit_pixel_interpolater_send(bld,<br>
> > +                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,<br>
> > +                                      dest,<br>
> > +                                      fs_reg(), /* src */<br>
> > +                                      brw_imm_ud(msg_data),<br>
> > +                                      interpolation);<br>
> > +      } else {<br>
> > +         const fs_reg sample_src = retype(get_nir_src(instr->src[0]),<br>
> > +                                          BRW_REGISTER_TYPE_UD);<br>
> > +<br>
> > +         if (nir_src_is_dynamically_uniform(instr->src[0])) {<br>
> > +            const fs_reg sample_id = bld.emit_uniformize(sample_src);<br>
> > +            const fs_reg msg_data = vgrf(glsl_type::uint_type);<br>
> > +            bld.exec_all().group(1, 0)<br>
> > +               .SHL(msg_data, sample_id, brw_imm_ud(4u));<br>
> >              emit_pixel_interpolater_send(bld,<br>
> >                                           FS_OPCODE_INTERPOLATE_AT_SAMPLE,<br>
> > -                                         dst_xy,<br>
> > +                                         dest,<br>
> >                                           fs_reg(), /* src */<br>
> > -                                         brw_imm_ud(msg_data),<br>
> > +                                         msg_data,<br>
> >                                           interpolation);<br>
> >           } else {<br>
> > -            const fs_reg sample_src = retype(get_nir_src(instr->src[0]),<br>
> > -                                             BRW_REGISTER_TYPE_UD);<br>
> > -<br>
> > -            if (nir_src_is_dynamically_uniform(instr->src[0])) {<br>
> > -               const fs_reg sample_id = bld.emit_uniformize(sample_src);<br>
> > -               const fs_reg msg_data = vgrf(glsl_type::uint_type);<br>
> > -               bld.exec_all().group(1, 0)<br>
> > -                  .SHL(msg_data, sample_id, brw_imm_ud(4u));<br>
> > +            /* Make a loop that sends a message to the pixel interpolater<br>
> > +             * for the sample number in each live channel. If there are<br>
> > +             * multiple channels with the same sample number then these<br>
> > +             * will be handled simultaneously with a single interation of<br>
> > +             * the loop.<br>
> > +             */<br>
> > +            bld.emit(BRW_OPCODE_DO);<br>
> > +<br>
> > +            /* Get the next live sample number into sample_id_reg */<br>
> > +            const fs_reg sample_id = bld.emit_uniformize(sample_src);<br>
> > +<br>
> > +            /* Set the flag register so that we can perform the send<br>
> > +             * message on all channels that have the same sample number<br>
> > +             */<br>
> > +            bld.CMP(bld.null_reg_ud(),<br>
> > +                    sample_src, sample_id,<br>
> > +                    BRW_CONDITIONAL_EQ);<br>
> > +            const fs_reg msg_data = vgrf(glsl_type::uint_type);<br>
> > +            bld.exec_all().group(1, 0)<br>
> > +               .SHL(msg_data, sample_id, brw_imm_ud(4u));<br>
> > +            fs_inst *inst =<br>
> >                 emit_pixel_interpolater_send(bld,<br>
> ><br>
> >  FS_OPCODE_INTERPOLATE_AT_SAMPLE,<br>
> > -                                            dst_xy,<br>
> > +                                            dest,<br>
> >                                              fs_reg(), /* src */<br>
> >                                              msg_data,<br>
> >                                              interpolation);<br>
> > -            } else {<br>
> > -               /* Make a loop that sends a message to the pixel<br>
> > interpolater<br>
> > -                * for the sample number in each live channel. If there are<br>
> > -                * multiple channels with the same sample number then these<br>
> > -                * will be handled simultaneously with a single interation<br>
> > of<br>
> > -                * the loop.<br>
> > -                */<br>
> > -               bld.emit(BRW_OPCODE_DO);<br>
> > -<br>
> > -               /* Get the next live sample number into sample_id_reg */<br>
> > -               const fs_reg sample_id = bld.emit_uniformize(sample_src);<br>
> > +            set_predicate(BRW_PREDICATE_NORMAL, inst);<br>
> ><br>
> > -               /* Set the flag register so that we can perform the send<br>
> > -                * message on all channels that have the same sample number<br>
> > -                */<br>
> > -               bld.CMP(bld.null_reg_ud(),<br>
> > -                       sample_src, sample_id,<br>
> > -                       BRW_CONDITIONAL_EQ);<br>
> > -               const fs_reg msg_data = vgrf(glsl_type::uint_type);<br>
> > -               bld.exec_all().group(1, 0)<br>
> > -                  .SHL(msg_data, sample_id, brw_imm_ud(4u));<br>
> > -               fs_inst *inst =<br>
> > -                  emit_pixel_interpolater_send(bld,<br>
> > -<br>
> >  FS_OPCODE_INTERPOLATE_AT_SAMPLE,<br>
> > -                                               dst_xy,<br>
> > -                                               fs_reg(), /* src */<br>
> > -                                               msg_data,<br>
> > -                                               interpolation);<br>
> > -               set_predicate(BRW_PREDICATE_NORMAL, inst);<br>
> > -<br>
> > -               /* Continue the loop if there are any live channels left */<br>
> > -               set_predicate_inv(BRW_PREDICATE_NORMAL,<br>
> > -                                 true, /* inverse */<br>
> > -                                 bld.emit(BRW_OPCODE_WHILE));<br>
> > -            }<br>
> > +            /* Continue the loop if there are any live channels left */<br>
> > +            set_predicate_inv(BRW_PREDICATE_NORMAL,<br>
> > +                              true, /* inverse */<br>
> > +                              bld.emit(BRW_OPCODE_WHILE));<br>
> >           }<br>
> > -<br>
> > -         break;<br>
> >        }<br>
> > +      break;<br>
> > +   }<br>
> ><br>
> > -      case nir_intrinsic_interp_var_at_offset: {<br>
> > -         nir_const_value *const_offset =<br>
> > nir_src_as_const_value(instr->src[0]);<br>
> > +   case nir_intrinsic_load_barycentric_at_offset: {<br>
> > +      const glsl_interp_mode interpolation =<br>
> > +         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);<br>
> ><br>
> > -         if (const_offset) {<br>
> > -            unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) &<br>
> > 0xf;<br>
> > -            unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) &<br>
> > 0xf;<br>
> > +      nir_const_value *const_offset =<br>
> > nir_src_as_const_value(instr->src[0]);<br>
> ><br>
> > -            emit_pixel_interpolater_send(bld,<br>
> > -<br>
> >  FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,<br>
> > -                                         dst_xy,<br>
> > -                                         fs_reg(), /* src */<br>
> > -                                         brw_imm_ud(off_x | (off_y << 4)),<br>
> > -                                         interpolation);<br>
> > -         } else {<br>
> > -            fs_reg src = vgrf(glsl_type::ivec2_type);<br>
> > -            fs_reg offset_src = retype(get_nir_src(instr->src[0]),<br>
> > -                                       BRW_REGISTER_TYPE_F);<br>
> > -            for (int i = 0; i < 2; i++) {<br>
> > -               fs_reg temp = vgrf(glsl_type::float_type);<br>
> > -               bld.MUL(temp, offset(offset_src, bld, i),<br>
> > brw_imm_f(16.0f));<br>
> > -               fs_reg itemp = vgrf(glsl_type::int_type);<br>
> > -               /* float to int */<br>
> > -               bld.MOV(itemp, temp);<br>
> > -<br>
> > -               /* Clamp the upper end of the range to +7/16.<br>
> > -                * ARB_gpu_shader5 requires that we support a maximum<br>
> > offset<br>
> > -                * of +0.5, which isn't representable in a S0.4 value -- if<br>
> > -                * we didn't clamp it, we'd end up with -8/16, which is the<br>
> > -                * opposite of what the shader author wanted.<br>
> > -                *<br>
> > -                * This is legal due to ARB_gpu_shader5's quantization<br>
> > -                * rules:<br>
> > -                *<br>
> > -                * "Not all values of <offset> may be supported; x and y<br>
> > -                * offsets may be rounded to fixed-point values with the<br>
> > -                * number of fraction bits given by the<br>
> > -                * implementation-dependent constant<br>
> > -                * FRAGMENT_INTERPOLATION_OFFSET_BITS"<br>
> > -                */<br>
> > -               set_condmod(BRW_CONDITIONAL_L,<br>
> > -                           bld.SEL(offset(src, bld, i), itemp,<br>
> > brw_imm_d(7)));<br>
> > -            }<br>
> > +      if (const_offset) {<br>
> > +         unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;<br>
> > +         unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;<br>
> ><br>
> > -            const enum opcode opcode =<br>
> > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;<br>
> > -            emit_pixel_interpolater_send(bld,<br>
> > -                                         opcode,<br>
> > -                                         dst_xy,<br>
> > -                                         src,<br>
> > -                                         brw_imm_ud(0u),<br>
> > -                                         interpolation);<br>
> > +         emit_pixel_interpolater_send(bld,<br>
> > +<br>
> > FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,<br>
> > +                                      dest,<br>
> > +                                      fs_reg(), /* src */<br>
> > +                                      brw_imm_ud(off_x | (off_y << 4)),<br>
> > +                                      interpolation);<br>
> > +      } else {<br>
> > +         fs_reg src = vgrf(glsl_type::ivec2_type);<br>
> > +         fs_reg offset_src = retype(get_nir_src(instr->src[0]),<br>
> > +                                    BRW_REGISTER_TYPE_F);<br>
> > +         for (int i = 0; i < 2; i++) {<br>
> > +            fs_reg temp = vgrf(glsl_type::float_type);<br>
> > +            bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));<br>
> > +            fs_reg itemp = vgrf(glsl_type::int_type);<br>
> > +            /* float to int */<br>
> > +            bld.MOV(itemp, temp);<br>
> > +<br>
> > +            /* Clamp the upper end of the range to +7/16.<br>
> > +             * ARB_gpu_shader5 requires that we support a maximum offset<br>
> > +             * of +0.5, which isn't representable in a S0.4 value -- if<br>
> > +             * we didn't clamp it, we'd end up with -8/16, which is the<br>
> > +             * opposite of what the shader author wanted.<br>
> > +             *<br>
> > +             * This is legal due to ARB_gpu_shader5's quantization<br>
> > +             * rules:<br>
> > +             *<br>
> > +             * "Not all values of <offset> may be supported; x and y<br>
> > +             * offsets may be rounded to fixed-point values with the<br>
> > +             * number of fraction bits given by the<br>
> > +             * implementation-dependent constant<br>
> > +             * FRAGMENT_INTERPOLATION_OFFSET_BITS"<br>
> > +             */<br>
> > +            set_condmod(BRW_CONDITIONAL_L,<br>
> > +                        bld.SEL(offset(src, bld, i), itemp,<br>
> > brw_imm_d(7)));<br>
> >           }<br>
> > +<br>
> > +         const enum opcode opcode =<br>
> > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;<br>
> > +         emit_pixel_interpolater_send(bld,<br>
> > +                                      opcode,<br>
> > +                                      dest,<br>
> > +                                      src,<br>
> > +                                      brw_imm_ud(0u),<br>
> > +                                      interpolation);<br>
> > +      }<br>
> > +      break;<br>
> > +   }<br>
> > +<br>
> > +   case nir_intrinsic_load_interpolated_input: {<br>
> > +      if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {<br>
> > +         emit_fragcoord_interpolation(dest);<br>
> >           break;<br>
> >        }<br>
> ><br>
> > -      default:<br>
> > -         unreachable("Invalid intrinsic");<br>
> > +      assert(instr->src[0].ssa &&<br>
> > +             instr->src[0].ssa->parent_instr->type ==<br>
> > nir_instr_type_intrinsic);<br>
> > +      nir_intrinsic_instr *bary_intrinsic =<br>
> > +         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);<br>
> > +      nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;<br>
> > +      enum glsl_interp_mode interp_mode =<br>
> > +         (enum glsl_interp_mode)<br>
> > nir_intrinsic_interp_mode(bary_intrinsic);<br>
> > +      fs_reg dst_xy;<br>
> > +<br>
> > +      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||<br>
> > +          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {<br>
> > +         /* Use the result of the PI message */<br>
> > +         dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);<br>
> > +      } else {<br>
> > +         /* Use the delta_xy values computed from the payload */<br>
> > +         enum brw_barycentric_mode bary =<br>
> > +            brw_barycentric_mode(interp_mode, bary_intrin);<br>
> > +<br>
> > +         dst_xy = this->delta_xy[bary];<br>
> >        }<br>
> ><br>
> > -      for (unsigned j = 0; j < instr->num_components; j++) {<br>
> > -         fs_reg src = interp_reg(instr->variables[0]->var->data.location,<br>
> > j);<br>
> > -         src.type = dest.type;<br>
> > +      for (unsigned int i = 0; i < instr->num_components; i++) {<br>
> > +         fs_reg interp =<br>
> > +            fs_reg(interp_reg(nir_intrinsic_base(instr),<br>
> > +                              nir_intrinsic_component(instr) + i));<br>
> > +         interp.type = BRW_REGISTER_TYPE_F;<br>
> > +         dest.type = BRW_REGISTER_TYPE_F;<br>
> ><br>
> > -         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);<br>
> > -         dest = offset(dest, bld, 1);<br>
> > +         if (devinfo->needs_unlit_centroid_workaround &&<br>
> > +             bary_intrin == nir_intrinsic_load_barycentric_centroid) {<br>
> > +<br>
> > +            /* Get the pixel/sample mask into f0 so that we know which<br>
> > +             * pixels are lit.  Then, for each channel that is unlit,<br>
> > +             * replace the centroid data with non-centroid data.<br>
> > +             */<br>
> > +            bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);<br>
> > +<br>
> > +            fs_reg dest_i = offset(dest, bld, i);<br>
> > +            fs_reg dst_xy_pixel =<br>
> > +               delta_xy[brw_barycentric_mode(interp_mode,<br>
> > +                  nir_intrinsic_load_barycentric_pixel)];<br>
> > +<br>
> > +            fs_inst *inst;<br>
> > +            inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy_pixel,<br>
> > interp);<br>
> > +            inst->predicate = BRW_PREDICATE_NORMAL;<br>
> > +            inst->predicate_inverse = true;<br>
> > +            inst->no_dd_clear = true;<br>
> > +<br>
> > +            inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy, interp);<br>
> > +            inst->predicate = BRW_PREDICATE_NORMAL;<br>
> > +            inst->predicate_inverse = false;<br>
> > +            inst->no_dd_check = true;<br>
> > +         } else if (devinfo->gen < 6 && interp_mode ==<br>
> > INTERP_MODE_SMOOTH) {<br>
> > +            fs_reg tmp = vgrf(glsl_type::float_type);<br>
> > +            bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);<br>
> > +            bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);<br>
> > +         } else {<br>
> > +            bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy,<br>
> > interp);<br>
> > +         }<br>
> ><br>
><br>
> Ugh... I think I see why you're handling the barycentric load here now.<br>
> Eventually, I think we could probably be doing this better but I'm not<br>
> seeing an easy path at the moment.<br>
<br>
</div></div>Yeah, perhaps.  I don't think it's that bad as is, though.<br>
<div><div class="h5"><br>
> >        }<br>
> >        break;<br>
> >     }<br>
> > +<br>
> >     default:<br>
> >        nir_emit_intrinsic(bld, instr);<br>
> >        break;<br>
> > @@ -3869,26 +3891,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder<br>
> > &bld, nir_intrinsic_instr *instr<br>
> >     }<br>
> ><br>
> >     case nir_intrinsic_load_input: {<br>
> > -      fs_reg src;<br>
> > +      fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);<br>
> >        unsigned num_components = instr->num_components;<br>
> >        enum brw_reg_type type = dest.type;<br>
> ><br>
> > -      if (stage == MESA_SHADER_VERTEX) {<br>
> > -         src = fs_reg(ATTR, instr->const_index[0], dest.type);<br>
> > -      } else {<br>
> > -         assert(type_sz(type) >= 4);<br>
> > -         if (type == BRW_REGISTER_TYPE_DF) {<br>
> > -            /* const_index is in 32-bit type size units that could not be<br>
> > aligned<br>
> > -             * with DF. We need to read the double vector as if it was a<br>
> > float<br>
> > -             * vector of twice the number of components to fetch the<br>
> > right data.<br>
> > -             */<br>
> > -            dest = retype(dest, BRW_REGISTER_TYPE_F);<br>
> > -            num_components *= 2;<br>
> > -         }<br>
> > -         src = offset(retype(nir_inputs, dest.type), bld,<br>
> > -                      instr->const_index[0]);<br>
> > -      }<br>
> ><br>
><br>
> This hunk seems odd... It doesn't look like it really has anything to do<br>
> with FS.  Is that else clause really FS-only or does it apply to GS and<br>
> tess stages?<br>
<br>
</div></div>Yep, this is FS specific.  TCS/TES/GS input intrinsics are all handled<br>
by nir_emit_{tcs,tes,gs}_intrinsic.  This code in the generic intrinsic<br>
handler is only for VS and FS.  And the VS path obviously never used the<br>
else clause that this hunk removes.<br>
<br>
Sorry this was immensely non-obvious.  This raises a good point, though:<br>
after this patch, I can move the remaining code to nir_emit_vs_intrinsic<br>
so it's clear it only applies to the VS.<br></blockquote><div><br></div><div>I'd like to see that patch if you don't mind.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<div class="HOEnZb"><div class="h5"><br>
> > -<br>
> >        nir_const_value *const_offset =<br>
> > nir_src_as_const_value(instr->src[0]);<br>
> >        assert(const_offset && "Indirect input loads not allowed");<br>
> >        src = offset(src, bld, const_offset->u32[0]);<br>
> > diff --git a/src/mesa/drivers/dri/i965/brw_nir.c<br>
> > b/src/mesa/drivers/dri/i965/brw_nir.c<br>
> > index caf9fe0..d1a823a 100644<br>
> > --- a/src/mesa/drivers/dri/i965/brw_nir.c<br>
> > +++ b/src/mesa/drivers/dri/i965/brw_nir.c<br>
> > @@ -30,7 +30,8 @@ static bool<br>
> >  is_input(nir_intrinsic_instr *intrin)<br>
> >  {<br>
> >     return intrin->intrinsic == nir_intrinsic_load_input ||<br>
> > -          intrin->intrinsic == nir_intrinsic_load_per_vertex_input;<br>
> > +          intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||<br>
> > +          intrin->intrinsic == nir_intrinsic_load_interpolated_input;<br>
> >  }<br>
> ><br>
> >  static bool<br>
> > @@ -282,9 +283,16 @@ brw_nir_lower_tes_inputs(nir_shader *nir, const<br>
> > struct brw_vue_map *vue_map)<br>
> >  void<br>
> >  brw_nir_lower_fs_inputs(nir_shader *nir)<br>
> >  {<br>
> > -   nir_assign_var_locations(&nir->inputs, &nir->num_inputs,<br>
> > VARYING_SLOT_VAR0,<br>
> > -                            type_size_scalar);<br>
> > -   nir_lower_io(nir, nir_var_shader_in, type_size_scalar, false);<br>
> > +   foreach_list_typed(nir_variable, var, node, &nir->inputs) {<br>
> > +      var->data.driver_location = var->data.location;<br>
> > +   }<br>
> > +<br>
> > +   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, true);<br>
> > +<br>
> > +   /* This pass needs actual constants */<br>
> > +   nir_opt_constant_folding(nir);<br>
> > +<br>
> > +   add_const_offset_to_base(nir, nir_var_shader_in);<br>
> >  }<br>
> ><br>
> >  void<br>
> > --<br>
> > 2.9.0<br>
> ><br>
> > _______________________________________________<br>
> > mesa-dev mailing list<br>
> > <a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
> > <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
> ><br>
><br>
<br>
</div></div></blockquote></div><br></div></div>