[Mesa-dev] [PATCH 6/7] i965: Rewrite FS input handling to use the new NIR intrinsics.
Jason Ekstrand
jason at jlekstrand.net
Wed Jul 20 01:39:25 UTC 2016
On Tue, Jul 19, 2016 at 5:02 PM, Kenneth Graunke <kenneth at whitecape.org>
wrote:
> On Tuesday, July 19, 2016 1:39:23 PM PDT Jason Ekstrand wrote:
> > On Mon, Jul 18, 2016 at 1:26 PM, Kenneth Graunke <kenneth at whitecape.org>
> > wrote:
> >
> > > This eliminates the need to walk the list of input variables, recurse
> > > into their types (via logic largely redundant with nir_lower_io), and
> > > interpolate all possible inputs up front. The backend no longer has
> > > to care about variables at all, which eliminates complications from
> > > trying to pack multiple variables into the same location. Instead,
> > > each intrinsic specifies exactly what's needed.
> > >
> > > This should unblock Timothy's work on GL_ARB_enhanced_layouts.
> > >
> > > Each load_interpolated_input intrinsic corresponds to PLN instructions,
> > > while load_barycentric_at_* intrinsics correspond to pixel interpolator
> > > messages. The pixel/centroid/sample barycentric intrinsics simply
> refer
> > > to payload fields (delta_xy[]), and don't actually generate any code.
> > >
> > > Because we use a single intrinsic for both centroid-qualified variables
> > > and interpolateAtCentroid(), they become indistinguishable. We stop
> > > sending pixel interpolator messages for those, and instead use the
> > > payload provided data, which should be considerably faster.
> > >
> > > On Broadwell:
> > >
> > > total instructions in shared programs: 9067751 -> 9067570 (-0.00%)
> > > instructions in affected programs: 145902 -> 145721 (-0.12%)
> > > helped: 422
> > > HURT: 209
> > >
> > > total spills in shared programs: 2849 -> 2899 (1.76%)
> > > spills in affected programs: 760 -> 810 (6.58%)
> > > helped: 0
> > > HURT: 10
> > >
> > > total fills in shared programs: 3910 -> 3950 (1.02%)
> > > fills in affected programs: 617 -> 657 (6.48%)
> > > helped: 0
> > > HURT: 10
> > >
> > > LOST: 3
> > > GAINED: 3
> > >
> > > The differences mostly appear to be slight changes in MOVs.
> > >
> > > Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> > > ---
> > > src/mesa/drivers/dri/i965/brw_fs.cpp | 175 ++++---------
> > > src/mesa/drivers/dri/i965/brw_fs.h | 9 +-
> > > src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 410
> > > ++++++++++++++++---------------
> > > src/mesa/drivers/dri/i965/brw_nir.c | 16 +-
> > > 4 files changed, 269 insertions(+), 341 deletions(-)
> > >
> > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > index 94127bc..06007fe 100644
> > > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> > > @@ -1067,21 +1067,27 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg
> > > wpos)
> > > bld.MOV(wpos, this->wpos_w);
> > > }
> > >
> > > -static enum brw_barycentric_mode
> > > -barycentric_mode(enum glsl_interp_mode mode,
> > > - bool is_centroid, bool is_sample)
> > > +enum brw_barycentric_mode
> > > +brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
> > > {
> > > - unsigned bary;
> > > -
> > > /* Barycentric modes don't make sense for flat inputs. */
> > > assert(mode != INTERP_MODE_FLAT);
> > >
> > > - if (is_sample) {
> > > - bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> > > - } else if (is_centroid) {
> > > - bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> > > - } else {
> > > + unsigned bary;
> > > + switch (op) {
> > > + case nir_intrinsic_load_barycentric_pixel:
> > > + case nir_intrinsic_load_barycentric_at_offset:
> > > bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
> > > + break;
> > > + case nir_intrinsic_load_barycentric_centroid:
> > > + bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> > > + break;
> > > + case nir_intrinsic_load_barycentric_sample:
> > > + case nir_intrinsic_load_barycentric_at_sample:
> > > + bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> > > + break;
> > > + default:
> > > + assert(!"invalid intrinsic");
> > > }
> > >
> > > if (mode == INTERP_MODE_NOPERSPECTIVE)
> > > @@ -1101,107 +1107,6 @@ centroid_to_pixel(enum brw_barycentric_mode
> bary)
> > > return (enum brw_barycentric_mode) ((unsigned) bary - 1);
> > > }
> > >
> > > -void
> > > -fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
> > > - const glsl_type *type,
> > > - glsl_interp_mode
> > > interpolation_mode,
> > > - int *location, bool
> mod_centroid,
> > > - bool mod_sample)
> > > -{
> > > - assert(stage == MESA_SHADER_FRAGMENT);
> > > - brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> > > -
> > > - if (type->is_array() || type->is_matrix()) {
> > > - const glsl_type *elem_type = glsl_get_array_element(type);
> > > - const unsigned length = glsl_get_length(type);
> > > -
> > > - for (unsigned i = 0; i < length; i++) {
> > > - emit_general_interpolation(attr, name, elem_type,
> > > interpolation_mode,
> > > - location, mod_centroid,
> mod_sample);
> > > - }
> > > - } else if (type->is_record()) {
> > > - for (unsigned i = 0; i < type->length; i++) {
> > > - const glsl_type *field_type = type->fields.structure[i].type;
> > > - emit_general_interpolation(attr, name, field_type,
> > > interpolation_mode,
> > > - location, mod_centroid,
> mod_sample);
> > > - }
> > > - } else {
> > > - assert(type->is_scalar() || type->is_vector());
> > > -
> > > - if (prog_data->urb_setup[*location] == -1) {
> > > - /* If there's no incoming setup data for this slot, don't
> > > - * emit interpolation for it.
> > > - */
> > > - *attr = offset(*attr, bld, type->vector_elements);
> > > - (*location)++;
> > > - return;
> > > - }
> > > -
> > > - attr->type = brw_type_for_base_type(type->get_scalar_type());
> > > -
> > > - if (interpolation_mode == INTERP_MODE_FLAT) {
> > > - /* Constant interpolation (flat shading) case. The SF has
> > > - * handed us defined values in only the constant offset
> > > - * field of the setup reg.
> > > - */
> > > - unsigned vector_elements = type->vector_elements;
> > > -
> > > - /* Data starts at suboffet 3 in 32-bit units (12 bytes), so
> it
> > > is not
> > > - * 64-bit aligned and the current implementation fails to
> read
> > > the
> > > - * data properly. Instead, when there is a double input
> varying,
> > > - * read it as vector of floats with twice the number of
> > > components.
> > > - */
> > > - if (attr->type == BRW_REGISTER_TYPE_DF) {
> > > - vector_elements *= 2;
> > > - attr->type = BRW_REGISTER_TYPE_F;
> > > - }
> > > - for (unsigned int i = 0; i < vector_elements; i++) {
> > > - struct brw_reg interp = interp_reg(*location, i);
> > > - interp = suboffset(interp, 3);
> > > - interp.type = attr->type;
> > > - bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
> > > - *attr = offset(*attr, bld, 1);
> > > - }
> > > - } else {
> > > - /* Smooth/noperspective interpolation case. */
> > > - enum brw_barycentric_mode bary =
> > > - barycentric_mode(interpolation_mode, mod_centroid,
> > > mod_sample);
> > > -
> > > - for (unsigned int i = 0; i < type->vector_elements; i++) {
> > > - fs_reg interp(interp_reg(*location, i));
> > > - if (devinfo->needs_unlit_centroid_workaround &&
> mod_centroid)
> > > {
> > > - /* Get the pixel/sample mask into f0 so that we know
> > > - * which pixels are lit. Then, for each channel that
> is
> > > - * unlit, replace the centroid data with non-centroid
> > > - * data.
> > > - */
> > > - bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> > > -
> > > - fs_inst *inst;
> > > - inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> > > - delta_xy[centroid_to_pixel(bary)],
> interp);
> > > - inst->predicate = BRW_PREDICATE_NORMAL;
> > > - inst->predicate_inverse = true;
> > > - inst->no_dd_clear = true;
> > > -
> > > - inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> > > - delta_xy[bary], interp);
> > > - inst->predicate = BRW_PREDICATE_NORMAL;
> > > - inst->predicate_inverse = false;
> > > - inst->no_dd_check = true;
> > > - } else {
> > > - bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary],
> interp);
> > > - }
> > > - if (devinfo->gen < 6 && interpolation_mode ==
> > > INTERP_MODE_SMOOTH) {
> > > - bld.MUL(*attr, *attr, this->pixel_w);
> > > - }
> > > - *attr = offset(*attr, bld, 1);
> > > - }
> > > - }
> > > - (*location)++;
> > > - }
> > > -}
> > > -
> > > fs_reg *
> > > fs_visitor::emit_frontfacing_interpolation()
> > > {
> > > @@ -6327,6 +6232,10 @@ fs_visitor::run_cs()
> > > /**
> > > * Return a bitfield where bit n is set if barycentric interpolation
> mode
> > > n
> > > * (see enum brw_barycentric_mode) is needed by the fragment shader.
> > > + *
> > > + * We examine the load_barycentric intrinsics rather than looking at
> input
> > > + * variables so that we catch interpolateAtCentroid() messages too,
> which
> > > + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set
> up.
> > > */
> > > static unsigned
> > > brw_compute_barycentric_interp_modes(const struct brw_device_info
> > > *devinfo,
> > > @@ -6334,29 +6243,37 @@ brw_compute_barycentric_interp_modes(const
> struct
> > > brw_device_info *devinfo,
> > > {
> > > unsigned barycentric_interp_modes = 0;
> > >
> > > - nir_foreach_variable(var, &shader->inputs) {
> > > - /* Ignore WPOS; it doesn't require interpolation. */
> > > - if (var->data.location == VARYING_SLOT_POS)
> > > + nir_foreach_function(f, shader) {
> > > + if (!f->impl)
> > > continue;
> > >
> > > - /* Flat inputs don't need barycentric modes. */
> > > - if (var->data.interpolation == INTERP_MODE_FLAT)
> > > - continue;
> > > + nir_foreach_block(block, f->impl) {
> > > + nir_foreach_instr(instr, block) {
> > > + if (instr->type != nir_instr_type_intrinsic)
> > > + continue;
> > >
> > > - /* Determine the set (or sets) of barycentric coordinates
> needed to
> > > - * interpolate this variable. Note that when
> > > - * brw->needs_unlit_centroid_workaround is set, centroid
> > > interpolation
> > > - * uses PIXEL interpolation for unlit pixels and CENTROID
> > > interpolation
> > > - * for lit pixels, so we need both sets of barycentric
> coordinates.
> > > - */
> > > - enum brw_barycentric_mode bary_mode =
> > > - barycentric_mode((glsl_interp_mode) var->data.interpolation,
> > > - var->data.centroid, var->data.sample);
> > > + nir_intrinsic_instr *intrin =
> nir_instr_as_intrinsic(instr);
> > > + if (intrin->intrinsic !=
> > > nir_intrinsic_load_interpolated_input)
> > > + continue;
> > >
> >
> > Any particular reason why you're looking at the source of the load rather
> > than just looking for the load_barycentric intrinsic directly?
> >
> >
> > > +
> > > + /* Ignore WPOS; it doesn't require interpolation. */
> > > + if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
> > > + continue;
> > >
> >
> > Ugh...
>
> This is why. gl_FragCoord still generates a load_interpolated_input
> with a load_barycentric_pixel, and I want to skip that. (It might be
> the only user of pixel coordinates, and if so, we don't want them.)
>
> You need the load_interpolated_input intrinsic to know which input it is.
>
> It's pretty clear that gl_FragCoord doesn't behave anything like a
> normal input, so it ought to be a system value. With that cleaned up,
> we could simplify this.
>
> That ended up being more work than I expected, so I was hoping to do
> it as a follow-on series, to unblock Tim sooner rather than later.
>
I'm ok with the "unblock Tim ASAP plan". I would like to see gl_FragCoord
as a system value eventually though. It just seems so much cleaner.
>
> > >
> > > - barycentric_interp_modes |= 1 << bary_mode;
> > > + intrin =
> > > nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
> > > + enum glsl_interp_mode interp = (enum glsl_interp_mode)
> > > + nir_intrinsic_interp_mode(intrin);
> > > + nir_intrinsic_op bary_op = intrin->intrinsic;
> > > + enum brw_barycentric_mode bary =
> > > + brw_barycentric_mode(interp, bary_op);
> > >
> > > - if (var->data.centroid &&
> devinfo->needs_unlit_centroid_workaround)
> > > - barycentric_interp_modes |= 1 <<
> centroid_to_pixel(bary_mode);
> > > + barycentric_interp_modes |= 1 << bary;
> > > +
> > > + if (devinfo->needs_unlit_centroid_workaround &&
> > > + bary_op == nir_intrinsic_load_barycentric_centroid)
> > > + barycentric_interp_modes |= 1 <<
> centroid_to_pixel(bary);
> > > + }
> > > + }
> > > }
> > >
> > > return barycentric_interp_modes;
> > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h
> > > b/src/mesa/drivers/dri/i965/brw_fs.h
> > > index 7998f51..574475f 100644
> > > --- a/src/mesa/drivers/dri/i965/brw_fs.h
> > > +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> > > @@ -174,11 +174,6 @@ public:
> > > fs_reg *emit_samplepos_setup();
> > > fs_reg *emit_sampleid_setup();
> > > fs_reg *emit_samplemaskin_setup();
> > > - void emit_general_interpolation(fs_reg *attr, const char *name,
> > > - const glsl_type *type,
> > > - glsl_interp_mode
> interpolation_mode,
> > > - int *location, bool mod_centroid,
> > > - bool mod_sample);
> > > fs_reg *emit_vs_system_value(int location);
> > > void emit_interpolation_setup_gen4();
> > > void emit_interpolation_setup_gen6();
> > > @@ -195,7 +190,6 @@ public:
> > > bool opt_zero_samples();
> > >
> > > void emit_nir_code();
> > > - void nir_setup_inputs();
> > > void nir_setup_single_output_varying(fs_reg *reg, const glsl_type
> > > *type,
> > > unsigned *location);
> > > void nir_setup_outputs();
> > > @@ -511,3 +505,6 @@ void shuffle_64bit_data_for_32bit_write(const
> > > brw::fs_builder &bld,
> > > uint32_t components);
> > > fs_reg setup_imm_df(const brw::fs_builder &bld,
> > > double v);
> > > +
> > > +enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode
> mode,
> > > + nir_intrinsic_op op);
> > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > > b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > > index 6265dc6..610c151 100644
> > > --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > > +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> > > @@ -36,7 +36,6 @@ fs_visitor::emit_nir_code()
> > > /* emit the arrays used for inputs and outputs - load/store
> intrinsics
> > > will
> > > * be converted to reads/writes of these arrays
> > > */
> > > - nir_setup_inputs();
> > > nir_setup_outputs();
> > > nir_setup_uniforms();
> > > nir_emit_system_values();
> > > @@ -50,38 +49,6 @@ fs_visitor::emit_nir_code()
> > > }
> > >
> > > void
> > > -fs_visitor::nir_setup_inputs()
> > > -{
> > > - if (stage != MESA_SHADER_FRAGMENT)
> > > - return;
> > > -
> > > - nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
> > > -
> > > - nir_foreach_variable(var, &nir->inputs) {
> > > - fs_reg input = offset(nir_inputs, bld,
> var->data.driver_location);
> > > -
> > > - fs_reg reg;
> > > - if (var->data.location == VARYING_SLOT_POS) {
> > > - emit_fragcoord_interpolation(input);
> > > - } else if (var->data.location == VARYING_SLOT_LAYER) {
> > > - struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER,
> > > 1), 3);
> > > - reg.type = BRW_REGISTER_TYPE_D;
> > > - bld.emit(FS_OPCODE_CINTERP, retype(input,
> BRW_REGISTER_TYPE_D),
> > > reg);
> > > - } else if (var->data.location == VARYING_SLOT_VIEWPORT) {
> > > - struct brw_reg reg =
> suboffset(interp_reg(VARYING_SLOT_VIEWPORT,
> > > 2), 3);
> > > - reg.type = BRW_REGISTER_TYPE_D;
> > > - bld.emit(FS_OPCODE_CINTERP, retype(input,
> BRW_REGISTER_TYPE_D),
> > > reg);
> > > - } else {
> > > - int location = var->data.location;
> > > - emit_general_interpolation(&input, var->name, var->type,
> > > - (glsl_interp_mode)
> > > var->data.interpolation,
> > > - &location, var->data.centroid,
> > > - var->data.sample);
> > > - }
> > > - }
> > > -}
> > > -
> > > -void
> > > fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
> > > const glsl_type *type,
> > > unsigned *location)
> > > @@ -3063,7 +3030,6 @@ fs_visitor::nir_emit_fs_intrinsic(const
> fs_builder
> > > &bld,
> > > nir_intrinsic_instr *instr)
> > > {
> > > assert(stage == MESA_SHADER_FRAGMENT);
> > > - const struct brw_wm_prog_key *wm_key = (const struct
> brw_wm_prog_key
> > > *) key;
> > >
> > > fs_reg dest;
> > > if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> > > @@ -3120,189 +3086,245 @@ fs_visitor::nir_emit_fs_intrinsic(const
> > > fs_builder &bld,
> > > break;
> > > }
> > >
> > > - case nir_intrinsic_interp_var_at_centroid:
> > > - case nir_intrinsic_interp_var_at_sample:
> > > - case nir_intrinsic_interp_var_at_offset: {
> > > - /* Handle ARB_gpu_shader5 interpolation intrinsics
> > > - *
> > > - * It's worth a quick word of explanation as to why we handle
> the
> > > full
> > > - * variable-based interpolation intrinsic rather than a lowered
> > > version
> > > - * with like we do for other inputs. We have to do that because
> > > the way
> > > - * we set up inputs doesn't allow us to use the already setup
> > > inputs for
> > > - * interpolation. At the beginning of the shader, we go through
> > > all of
> > > - * the input variables and do the initial interpolation and put
> it
> > > in
> > > - * the nir_inputs array based on its location as determined in
> > > - * nir_lower_io. If the input isn't used, dead code cleans up
> and
> > > - * everything works fine. However, when we get to the
> > > ARB_gpu_shader5
> > > - * interpolation intrinsics, we need to reinterpolate the input
> > > - * differently. If we used an intrinsic that just had an index
> it
> > > would
> > > - * only give us the offset into the nir_inputs array. However,
> > > this is
> > > - * useless because that value is post-interpolation and we need
> > > - * pre-interpolation. In order to get the actual location of
> the
> > > bits
> > > - * we get from the vertex fetching hardware, we need the
> variable.
> > > - */
> > > - fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> > > - const glsl_interp_mode interpolation =
> > > - (glsl_interp_mode)
> instr->variables[0]->var->data.interpolation;
> > > + case nir_intrinsic_load_input: {
> > > + /* load_input is only used for flat inputs */
> > > + unsigned base = nir_intrinsic_base(instr);
> > > + unsigned component = nir_intrinsic_component(instr);
> > > + unsigned num_components = instr->num_components;
> > > + enum brw_reg_type type = dest.type;
> > >
> > > - switch (instr->intrinsic) {
> > > - case nir_intrinsic_interp_var_at_centroid:
> > > - emit_pixel_interpolater_send(bld,
> > > -
> FS_OPCODE_INTERPOLATE_AT_CENTROID,
> > > - dst_xy,
> > > - fs_reg(), /* src */
> > > - brw_imm_ud(0u),
> > > - interpolation);
> > > - break;
> > > + /* Special case fields in the VUE header */
> > > + if (base == VARYING_SLOT_LAYER)
> > > + component = 1;
> > > + else if (base == VARYING_SLOT_VIEWPORT)
> > > + component = 2;
> > >
> > > - case nir_intrinsic_interp_var_at_sample: {
> > > - if (!wm_key->multisample_fbo) {
> > > - /* From the ARB_gpu_shader5 specification:
> > > - * "If multisample buffers are not available, the input
> > > varying
> > > - * will be evaluated at the center of the pixel."
> > > - */
> > > - emit_pixel_interpolater_send(bld,
> > > -
> > > FS_OPCODE_INTERPOLATE_AT_CENTROID,
> > > - dst_xy,
> > > - fs_reg(), /* src */
> > > - brw_imm_ud(0u),
> > > - interpolation);
> > > - break;
> > > - }
> > > + if (nir_dest_bit_size(instr->dest) == 64) {
> > > + /* const_index is in 32-bit type size units that could not be
> > > aligned
> > > + * with DF. We need to read the double vector as if it was a
> > > float
> > > + * vector of twice the number of components to fetch the
> right
> > > data.
> > > + */
> > > + type = BRW_REGISTER_TYPE_F;
> > > + num_components *= 2;
> > > + }
> > >
> > > - nir_const_value *const_sample =
> > > nir_src_as_const_value(instr->src[0]);
> > > + for (unsigned int i = 0; i < num_components; i++) {
> > > + struct brw_reg interp = interp_reg(base, component + i);
> > > + interp = suboffset(interp, 3);
> > > + bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld,
> i),
> > > + retype(fs_reg(interp), type));
> > > + }
> > >
> > > - if (const_sample) {
> > > - unsigned msg_data = const_sample->i32[0] << 4;
> > > + if (nir_dest_bit_size(instr->dest) == 64) {
> > > + shuffle_32bit_load_result_to_64bit_data(bld,
> > > + dest,
> > > + retype(dest, type),
> > > +
> instr->num_components);
> > > + }
> > > + break;
> > > + }
> > > +
> > > + case nir_intrinsic_load_barycentric_pixel:
> > > + case nir_intrinsic_load_barycentric_centroid:
> > > + case nir_intrinsic_load_barycentric_sample:
> > > + /* Do nothing - load_interpolated_input handling will handle it
> > > later. */
> > > + break;
> > >
> >
> > I'm very courious why you made this choice. It seems odd to me.
>
> I originally tried emitting MOVs here. The data layout is a bit funky:
>
> R3: X coordinates for Slots 0-7
> R4: Y coordinates for Slots 0-7
> R5: X coordinates for Slots 8-15
> R5: Y coordinates for Slots 8-15
>
> For SIMD8, you could do two separate MOVs (move X, then Y). You might be
> able to do a compressed mov(16)...but would need to whack the channel
> enables into 1Q for both halves (if that's possible) or use NoMask
> (which I wanted to avoid).
>
> For SIMD16...you can't move both X's with a single MOV. A <16,8,1>
> region would cover it, but sources can't span more than two adjacent
> registers. So that doesn't work. It might take four MOVs. Maybe
> only two if I could use the above trick.
>
> Generating MOVs also means that we're relying on the optimizer to
> remove them...which it didn't seem to be. It ended up being much
> easier to just point LINTERP at delta_xy[] directly.
>
Right... I forgot about the odd interleaving. That seems like a reasonable
reason for doing it this way.
>
> > > + case nir_intrinsic_load_barycentric_at_sample: {
> > > + const glsl_interp_mode interpolation =
> > > + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
> > > +
> > > + nir_const_value *const_sample =
> > > nir_src_as_const_value(instr->src[0]);
> > > +
> > > + if (const_sample) {
> > > + unsigned msg_data = const_sample->i32[0] << 4;
> > > +
> > > + emit_pixel_interpolater_send(bld,
> > > + FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > > + dest,
> > > + fs_reg(), /* src */
> > > + brw_imm_ud(msg_data),
> > > + interpolation);
> > > + } else {
> > > + const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
> > > + BRW_REGISTER_TYPE_UD);
> > > +
> > > + if (nir_src_is_dynamically_uniform(instr->src[0])) {
> > > + const fs_reg sample_id = bld.emit_uniformize(sample_src);
> > > + const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > > + bld.exec_all().group(1, 0)
> > > + .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > > emit_pixel_interpolater_send(bld,
> > >
> FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > > - dst_xy,
> > > + dest,
> > > fs_reg(), /* src */
> > > - brw_imm_ud(msg_data),
> > > + msg_data,
> > > interpolation);
> > > } else {
> > > - const fs_reg sample_src =
> retype(get_nir_src(instr->src[0]),
> > > - BRW_REGISTER_TYPE_UD);
> > > -
> > > - if (nir_src_is_dynamically_uniform(instr->src[0])) {
> > > - const fs_reg sample_id =
> bld.emit_uniformize(sample_src);
> > > - const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > > - bld.exec_all().group(1, 0)
> > > - .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > > + /* Make a loop that sends a message to the pixel
> interpolater
> > > + * for the sample number in each live channel. If there
> are
> > > + * multiple channels with the same sample number then
> these
> > > + * will be handled simultaneously with a single
> interation of
> > > + * the loop.
> > > + */
> > > + bld.emit(BRW_OPCODE_DO);
> > > +
> > > + /* Get the next live sample number into sample_id_reg */
> > > + const fs_reg sample_id = bld.emit_uniformize(sample_src);
> > > +
> > > + /* Set the flag register so that we can perform the send
> > > + * message on all channels that have the same sample
> number
> > > + */
> > > + bld.CMP(bld.null_reg_ud(),
> > > + sample_src, sample_id,
> > > + BRW_CONDITIONAL_EQ);
> > > + const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > > + bld.exec_all().group(1, 0)
> > > + .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > > + fs_inst *inst =
> > > emit_pixel_interpolater_send(bld,
> > >
> > > FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > > - dst_xy,
> > > + dest,
> > > fs_reg(), /* src */
> > > msg_data,
> > > interpolation);
> > > - } else {
> > > - /* Make a loop that sends a message to the pixel
> > > interpolater
> > > - * for the sample number in each live channel. If
> there are
> > > - * multiple channels with the same sample number then
> these
> > > - * will be handled simultaneously with a single
> interation
> > > of
> > > - * the loop.
> > > - */
> > > - bld.emit(BRW_OPCODE_DO);
> > > -
> > > - /* Get the next live sample number into sample_id_reg
> */
> > > - const fs_reg sample_id =
> bld.emit_uniformize(sample_src);
> > > + set_predicate(BRW_PREDICATE_NORMAL, inst);
> > >
> > > - /* Set the flag register so that we can perform the
> send
> > > - * message on all channels that have the same sample
> number
> > > - */
> > > - bld.CMP(bld.null_reg_ud(),
> > > - sample_src, sample_id,
> > > - BRW_CONDITIONAL_EQ);
> > > - const fs_reg msg_data = vgrf(glsl_type::uint_type);
> > > - bld.exec_all().group(1, 0)
> > > - .SHL(msg_data, sample_id, brw_imm_ud(4u));
> > > - fs_inst *inst =
> > > - emit_pixel_interpolater_send(bld,
> > > -
> > > FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> > > - dst_xy,
> > > - fs_reg(), /* src */
> > > - msg_data,
> > > - interpolation);
> > > - set_predicate(BRW_PREDICATE_NORMAL, inst);
> > > -
> > > - /* Continue the loop if there are any live channels
> left */
> > > - set_predicate_inv(BRW_PREDICATE_NORMAL,
> > > - true, /* inverse */
> > > - bld.emit(BRW_OPCODE_WHILE));
> > > - }
> > > + /* Continue the loop if there are any live channels left
> */
> > > + set_predicate_inv(BRW_PREDICATE_NORMAL,
> > > + true, /* inverse */
> > > + bld.emit(BRW_OPCODE_WHILE));
> > > }
> > > -
> > > - break;
> > > }
> > > + break;
> > > + }
> > >
> > > - case nir_intrinsic_interp_var_at_offset: {
> > > - nir_const_value *const_offset =
> > > nir_src_as_const_value(instr->src[0]);
> > > + case nir_intrinsic_load_barycentric_at_offset: {
> > > + const glsl_interp_mode interpolation =
> > > + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
> > >
> > > - if (const_offset) {
> > > - unsigned off_x = MIN2((int)(const_offset->f32[0] * 16),
> 7) &
> > > 0xf;
> > > - unsigned off_y = MIN2((int)(const_offset->f32[1] * 16),
> 7) &
> > > 0xf;
> > > + nir_const_value *const_offset =
> > > nir_src_as_const_value(instr->src[0]);
> > >
> > > - emit_pixel_interpolater_send(bld,
> > > -
> > > FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> > > - dst_xy,
> > > - fs_reg(), /* src */
> > > - brw_imm_ud(off_x | (off_y <<
> 4)),
> > > - interpolation);
> > > - } else {
> > > - fs_reg src = vgrf(glsl_type::ivec2_type);
> > > - fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> > > - BRW_REGISTER_TYPE_F);
> > > - for (int i = 0; i < 2; i++) {
> > > - fs_reg temp = vgrf(glsl_type::float_type);
> > > - bld.MUL(temp, offset(offset_src, bld, i),
> > > brw_imm_f(16.0f));
> > > - fs_reg itemp = vgrf(glsl_type::int_type);
> > > - /* float to int */
> > > - bld.MOV(itemp, temp);
> > > -
> > > - /* Clamp the upper end of the range to +7/16.
> > > - * ARB_gpu_shader5 requires that we support a maximum
> > > offset
> > > - * of +0.5, which isn't representable in a S0.4 value
> -- if
> > > - * we didn't clamp it, we'd end up with -8/16, which
> is the
> > > - * opposite of what the shader author wanted.
> > > - *
> > > - * This is legal due to ARB_gpu_shader5's quantization
> > > - * rules:
> > > - *
> > > - * "Not all values of <offset> may be supported; x and
> y
> > > - * offsets may be rounded to fixed-point values with
> the
> > > - * number of fraction bits given by the
> > > - * implementation-dependent constant
> > > - * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> > > - */
> > > - set_condmod(BRW_CONDITIONAL_L,
> > > - bld.SEL(offset(src, bld, i), itemp,
> > > brw_imm_d(7)));
> > > - }
> > > + if (const_offset) {
> > > + unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) &
> 0xf;
> > > + unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) &
> 0xf;
> > >
> > > - const enum opcode opcode =
> > > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> > > - emit_pixel_interpolater_send(bld,
> > > - opcode,
> > > - dst_xy,
> > > - src,
> > > - brw_imm_ud(0u),
> > > - interpolation);
> > > + emit_pixel_interpolater_send(bld,
> > > +
> > > FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> > > + dest,
> > > + fs_reg(), /* src */
> > > + brw_imm_ud(off_x | (off_y <<
> 4)),
> > > + interpolation);
> > > + } else {
> > > + fs_reg src = vgrf(glsl_type::ivec2_type);
> > > + fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> > > + BRW_REGISTER_TYPE_F);
> > > + for (int i = 0; i < 2; i++) {
> > > + fs_reg temp = vgrf(glsl_type::float_type);
> > > + bld.MUL(temp, offset(offset_src, bld, i),
> brw_imm_f(16.0f));
> > > + fs_reg itemp = vgrf(glsl_type::int_type);
> > > + /* float to int */
> > > + bld.MOV(itemp, temp);
> > > +
> > > + /* Clamp the upper end of the range to +7/16.
> > > + * ARB_gpu_shader5 requires that we support a maximum
> offset
> > > + * of +0.5, which isn't representable in a S0.4 value --
> if
> > > + * we didn't clamp it, we'd end up with -8/16, which is
> the
> > > + * opposite of what the shader author wanted.
> > > + *
> > > + * This is legal due to ARB_gpu_shader5's quantization
> > > + * rules:
> > > + *
> > > + * "Not all values of <offset> may be supported; x and y
> > > + * offsets may be rounded to fixed-point values with the
> > > + * number of fraction bits given by the
> > > + * implementation-dependent constant
> > > + * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> > > + */
> > > + set_condmod(BRW_CONDITIONAL_L,
> > > + bld.SEL(offset(src, bld, i), itemp,
> > > brw_imm_d(7)));
> > > }
> > > +
> > > + const enum opcode opcode =
> > > FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> > > + emit_pixel_interpolater_send(bld,
> > > + opcode,
> > > + dest,
> > > + src,
> > > + brw_imm_ud(0u),
> > > + interpolation);
> > > + }
> > > + break;
> > > + }
> > > +
> > > + case nir_intrinsic_load_interpolated_input: {
> > > + if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
> > > + emit_fragcoord_interpolation(dest);
> > > break;
> > > }
> > >
> > > - default:
> > > - unreachable("Invalid intrinsic");
> > > + assert(instr->src[0].ssa &&
> > > + instr->src[0].ssa->parent_instr->type ==
> > > nir_instr_type_intrinsic);
> > > + nir_intrinsic_instr *bary_intrinsic =
> > > + nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
> > > + nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
> > > + enum glsl_interp_mode interp_mode =
> > > + (enum glsl_interp_mode)
> > > nir_intrinsic_interp_mode(bary_intrinsic);
> > > + fs_reg dst_xy;
> > > +
> > > + if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
> > > + bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
> > > + /* Use the result of the PI message */
> > > + dst_xy = retype(get_nir_src(instr->src[0]),
> BRW_REGISTER_TYPE_F);
> > > + } else {
> > > + /* Use the delta_xy values computed from the payload */
> > > + enum brw_barycentric_mode bary =
> > > + brw_barycentric_mode(interp_mode, bary_intrin);
> > > +
> > > + dst_xy = this->delta_xy[bary];
> > > }
> > >
> > > - for (unsigned j = 0; j < instr->num_components; j++) {
> > > - fs_reg src =
> interp_reg(instr->variables[0]->var->data.location,
> > > j);
> > > - src.type = dest.type;
> > > + for (unsigned int i = 0; i < instr->num_components; i++) {
> > > + fs_reg interp =
> > > + fs_reg(interp_reg(nir_intrinsic_base(instr),
> > > + nir_intrinsic_component(instr) + i));
> > > + interp.type = BRW_REGISTER_TYPE_F;
> > > + dest.type = BRW_REGISTER_TYPE_F;
> > >
> > > - bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
> > > - dest = offset(dest, bld, 1);
> > > + if (devinfo->needs_unlit_centroid_workaround &&
> > > + bary_intrin == nir_intrinsic_load_barycentric_centroid) {
> > > +
> > > + /* Get the pixel/sample mask into f0 so that we know which
> > > + * pixels are lit. Then, for each channel that is unlit,
> > > + * replace the centroid data with non-centroid data.
> > > + */
> > > + bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> > > +
> > > + fs_reg dest_i = offset(dest, bld, i);
> > > + fs_reg dst_xy_pixel =
> > > + delta_xy[brw_barycentric_mode(interp_mode,
> > > + nir_intrinsic_load_barycentric_pixel)];
> > > +
> > > + fs_inst *inst;
> > > + inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy_pixel,
> > > interp);
> > > + inst->predicate = BRW_PREDICATE_NORMAL;
> > > + inst->predicate_inverse = true;
> > > + inst->no_dd_clear = true;
> > > +
> > > + inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy,
> interp);
> > > + inst->predicate = BRW_PREDICATE_NORMAL;
> > > + inst->predicate_inverse = false;
> > > + inst->no_dd_check = true;
> > > + } else if (devinfo->gen < 6 && interp_mode ==
> > > INTERP_MODE_SMOOTH) {
> > > + fs_reg tmp = vgrf(glsl_type::float_type);
> > > + bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
> > > + bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
> > > + } else {
> > > + bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy,
> > > interp);
> > > + }
> > >
> >
> > Ugh... I think I see why you're handling the barycentric load here now.
> > Eventually, I think we could probably be doing this better but I'm not
> > seeing an easy path at the moment.
>
> Yeah, perhaps. I don't think it's that bad as is, though.
>
> > > }
> > > break;
> > > }
> > > +
> > > default:
> > > nir_emit_intrinsic(bld, instr);
> > > break;
> > > @@ -3869,26 +3891,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder
> > > &bld, nir_intrinsic_instr *instr
> > > }
> > >
> > > case nir_intrinsic_load_input: {
> > > - fs_reg src;
> > > + fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
> > > unsigned num_components = instr->num_components;
> > > enum brw_reg_type type = dest.type;
> > >
> > > - if (stage == MESA_SHADER_VERTEX) {
> > > - src = fs_reg(ATTR, instr->const_index[0], dest.type);
> > > - } else {
> > > - assert(type_sz(type) >= 4);
> > > - if (type == BRW_REGISTER_TYPE_DF) {
> > > - /* const_index is in 32-bit type size units that could
> not be
> > > aligned
> > > - * with DF. We need to read the double vector as if it
> was a
> > > float
> > > - * vector of twice the number of components to fetch the
> > > right data.
> > > - */
> > > - dest = retype(dest, BRW_REGISTER_TYPE_F);
> > > - num_components *= 2;
> > > - }
> > > - src = offset(retype(nir_inputs, dest.type), bld,
> > > - instr->const_index[0]);
> > > - }
> > >
> >
> > This hunk seems odd... It doesn't look like it really has anything to do
> > with FS. Is that else clause really FS-only or does it apply to GS and
> > tess stages?
>
> Yep, this is FS specific. TCS/TES/GS input intrinsics are all handled
> by nir_emit_{tcs,tes,gs}_intrinsic. This code in the generic intrinsic
> handler is only for VS and FS. And the VS code obviously didn't include
> this hunk.
>
> Sorry this was immensely non-obvious. This raises a good point, though:
> after this patch, I can move the remaining code to nir_emit_vs_intrinsic
> so it's clear it only applies to the VS.
>
I'd like to see that patch if you don't mind.
>
> > > -
> > > nir_const_value *const_offset =
> > > nir_src_as_const_value(instr->src[0]);
> > > assert(const_offset && "Indirect input loads not allowed");
> > > src = offset(src, bld, const_offset->u32[0]);
> > > diff --git a/src/mesa/drivers/dri/i965/brw_nir.c
> > > b/src/mesa/drivers/dri/i965/brw_nir.c
> > > index caf9fe0..d1a823a 100644
> > > --- a/src/mesa/drivers/dri/i965/brw_nir.c
> > > +++ b/src/mesa/drivers/dri/i965/brw_nir.c
> > > @@ -30,7 +30,8 @@ static bool
> > > is_input(nir_intrinsic_instr *intrin)
> > > {
> > > return intrin->intrinsic == nir_intrinsic_load_input ||
> > > - intrin->intrinsic == nir_intrinsic_load_per_vertex_input;
> > > + intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
> > > + intrin->intrinsic == nir_intrinsic_load_interpolated_input;
> > > }
> > >
> > > static bool
> > > @@ -282,9 +283,16 @@ brw_nir_lower_tes_inputs(nir_shader *nir, const
> > > struct brw_vue_map *vue_map)
> > > void
> > > brw_nir_lower_fs_inputs(nir_shader *nir)
> > > {
> > > - nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
> > > VARYING_SLOT_VAR0,
> > > - type_size_scalar);
> > > - nir_lower_io(nir, nir_var_shader_in, type_size_scalar, false);
> > > + foreach_list_typed(nir_variable, var, node, &nir->inputs) {
> > > + var->data.driver_location = var->data.location;
> > > + }
> > > +
> > > + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, true);
> > > +
> > > + /* This pass needs actual constants */
> > > + nir_opt_constant_folding(nir);
> > > +
> > > + add_const_offset_to_base(nir, nir_var_shader_in);
> > > }
> > >
> > > void
> > > --
> > > 2.9.0
> > >
> > > _______________________________________________
> > > mesa-dev mailing list
> > > mesa-dev at lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> > >
> >
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20160719/44a93a85/attachment-0001.html>
More information about the mesa-dev
mailing list