[Mesa-dev] [PATCH 6/7] i965: Rewrite FS input handling to use the new NIR intrinsics.
Jason Ekstrand
jason at jlekstrand.net
Tue Jul 19 20:39:23 UTC 2016
On Mon, Jul 18, 2016 at 1:26 PM, Kenneth Graunke <kenneth at whitecape.org>
wrote:
> This eliminates the need to walk the list of input variables, recurse
> into their types (via logic largely redundant with nir_lower_io), and
> interpolate all possible inputs up front. The backend no longer has
> to care about variables at all, which eliminates complications from
> trying to pack multiple variables into the same location. Instead,
> each intrinsic specifies exactly what's needed.
>
> This should unblock Timothy's work on GL_ARB_enhanced_layouts.
>
> Each load_interpolated_input intrinsic corresponds to PLN instructions,
> while load_barycentric_at_* intrinsics correspond to pixel interpolator
> messages. The pixel/centroid/sample barycentric intrinsics simply refer
> to payload fields (delta_xy[]), and don't actually generate any code.
>
> Because we use a single intrinsic for both centroid-qualified variables
> and interpolateAtCentroid(), they become indistinguishable. We stop
> sending pixel interpolator messages for those, and instead use the
> payload provided data, which should be considerably faster.
>
> On Broadwell:
>
> total instructions in shared programs: 9067751 -> 9067570 (-0.00%)
> instructions in affected programs: 145902 -> 145721 (-0.12%)
> helped: 422
> HURT: 209
>
> total spills in shared programs: 2849 -> 2899 (1.76%)
> spills in affected programs: 760 -> 810 (6.58%)
> helped: 0
> HURT: 10
>
> total fills in shared programs: 3910 -> 3950 (1.02%)
> fills in affected programs: 617 -> 657 (6.48%)
> helped: 0
> HURT: 10
>
> LOST: 3
> GAINED: 3
>
> The differences mostly appear to be slight changes in MOVs.
>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/brw_fs.cpp | 175 ++++---------
> src/mesa/drivers/dri/i965/brw_fs.h | 9 +-
> src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 410
> ++++++++++++++++---------------
> src/mesa/drivers/dri/i965/brw_nir.c | 16 +-
> 4 files changed, 269 insertions(+), 341 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 94127bc..06007fe 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1067,21 +1067,27 @@ fs_visitor::emit_fragcoord_interpolation(fs_reg
> wpos)
> bld.MOV(wpos, this->wpos_w);
> }
>
> -static enum brw_barycentric_mode
> -barycentric_mode(enum glsl_interp_mode mode,
> - bool is_centroid, bool is_sample)
> +enum brw_barycentric_mode
> +brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
> {
> - unsigned bary;
> -
> /* Barycentric modes don't make sense for flat inputs. */
> assert(mode != INTERP_MODE_FLAT);
>
> - if (is_sample) {
> - bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> - } else if (is_centroid) {
> - bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> - } else {
> + unsigned bary;
> + switch (op) {
> + case nir_intrinsic_load_barycentric_pixel:
> + case nir_intrinsic_load_barycentric_at_offset:
> bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
> + break;
> + case nir_intrinsic_load_barycentric_centroid:
> + bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
> + break;
> + case nir_intrinsic_load_barycentric_sample:
> + case nir_intrinsic_load_barycentric_at_sample:
> + bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
> + break;
> + default:
> + assert(!"invalid intrinsic");
> }
>
> if (mode == INTERP_MODE_NOPERSPECTIVE)
> @@ -1101,107 +1107,6 @@ centroid_to_pixel(enum brw_barycentric_mode bary)
> return (enum brw_barycentric_mode) ((unsigned) bary - 1);
> }
>
> -void
> -fs_visitor::emit_general_interpolation(fs_reg *attr, const char *name,
> - const glsl_type *type,
> - glsl_interp_mode
> interpolation_mode,
> - int *location, bool mod_centroid,
> - bool mod_sample)
> -{
> - assert(stage == MESA_SHADER_FRAGMENT);
> - brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> -
> - if (type->is_array() || type->is_matrix()) {
> - const glsl_type *elem_type = glsl_get_array_element(type);
> - const unsigned length = glsl_get_length(type);
> -
> - for (unsigned i = 0; i < length; i++) {
> - emit_general_interpolation(attr, name, elem_type,
> interpolation_mode,
> - location, mod_centroid, mod_sample);
> - }
> - } else if (type->is_record()) {
> - for (unsigned i = 0; i < type->length; i++) {
> - const glsl_type *field_type = type->fields.structure[i].type;
> - emit_general_interpolation(attr, name, field_type,
> interpolation_mode,
> - location, mod_centroid, mod_sample);
> - }
> - } else {
> - assert(type->is_scalar() || type->is_vector());
> -
> - if (prog_data->urb_setup[*location] == -1) {
> - /* If there's no incoming setup data for this slot, don't
> - * emit interpolation for it.
> - */
> - *attr = offset(*attr, bld, type->vector_elements);
> - (*location)++;
> - return;
> - }
> -
> - attr->type = brw_type_for_base_type(type->get_scalar_type());
> -
> - if (interpolation_mode == INTERP_MODE_FLAT) {
> - /* Constant interpolation (flat shading) case. The SF has
> - * handed us defined values in only the constant offset
> - * field of the setup reg.
> - */
> - unsigned vector_elements = type->vector_elements;
> -
> - /* Data starts at suboffet 3 in 32-bit units (12 bytes), so it
> is not
> - * 64-bit aligned and the current implementation fails to read
> the
> - * data properly. Instead, when there is a double input varying,
> - * read it as vector of floats with twice the number of
> components.
> - */
> - if (attr->type == BRW_REGISTER_TYPE_DF) {
> - vector_elements *= 2;
> - attr->type = BRW_REGISTER_TYPE_F;
> - }
> - for (unsigned int i = 0; i < vector_elements; i++) {
> - struct brw_reg interp = interp_reg(*location, i);
> - interp = suboffset(interp, 3);
> - interp.type = attr->type;
> - bld.emit(FS_OPCODE_CINTERP, *attr, fs_reg(interp));
> - *attr = offset(*attr, bld, 1);
> - }
> - } else {
> - /* Smooth/noperspective interpolation case. */
> - enum brw_barycentric_mode bary =
> - barycentric_mode(interpolation_mode, mod_centroid,
> mod_sample);
> -
> - for (unsigned int i = 0; i < type->vector_elements; i++) {
> - fs_reg interp(interp_reg(*location, i));
> - if (devinfo->needs_unlit_centroid_workaround && mod_centroid)
> {
> - /* Get the pixel/sample mask into f0 so that we know
> - * which pixels are lit. Then, for each channel that is
> - * unlit, replace the centroid data with non-centroid
> - * data.
> - */
> - bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> -
> - fs_inst *inst;
> - inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> - delta_xy[centroid_to_pixel(bary)], interp);
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - inst->predicate_inverse = true;
> - inst->no_dd_clear = true;
> -
> - inst = bld.emit(FS_OPCODE_LINTERP, *attr,
> - delta_xy[bary], interp);
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - inst->predicate_inverse = false;
> - inst->no_dd_check = true;
> - } else {
> - bld.emit(FS_OPCODE_LINTERP, *attr, delta_xy[bary], interp);
> - }
> - if (devinfo->gen < 6 && interpolation_mode ==
> INTERP_MODE_SMOOTH) {
> - bld.MUL(*attr, *attr, this->pixel_w);
> - }
> - *attr = offset(*attr, bld, 1);
> - }
> - }
> - (*location)++;
> - }
> -}
> -
> fs_reg *
> fs_visitor::emit_frontfacing_interpolation()
> {
> @@ -6327,6 +6232,10 @@ fs_visitor::run_cs()
> /**
> * Return a bitfield where bit n is set if barycentric interpolation mode
> n
> * (see enum brw_barycentric_mode) is needed by the fragment shader.
> + *
> + * We examine the load_barycentric intrinsics rather than looking at input
> + * variables so that we catch interpolateAtCentroid() messages too, which
> + * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
> */
> static unsigned
> brw_compute_barycentric_interp_modes(const struct brw_device_info
> *devinfo,
> @@ -6334,29 +6243,37 @@ brw_compute_barycentric_interp_modes(const struct
> brw_device_info *devinfo,
> {
> unsigned barycentric_interp_modes = 0;
>
> - nir_foreach_variable(var, &shader->inputs) {
> - /* Ignore WPOS; it doesn't require interpolation. */
> - if (var->data.location == VARYING_SLOT_POS)
> + nir_foreach_function(f, shader) {
> + if (!f->impl)
> continue;
>
> - /* Flat inputs don't need barycentric modes. */
> - if (var->data.interpolation == INTERP_MODE_FLAT)
> - continue;
> + nir_foreach_block(block, f->impl) {
> + nir_foreach_instr(instr, block) {
> + if (instr->type != nir_instr_type_intrinsic)
> + continue;
>
> - /* Determine the set (or sets) of barycentric coordinates needed to
> - * interpolate this variable. Note that when
> - * brw->needs_unlit_centroid_workaround is set, centroid
> interpolation
> - * uses PIXEL interpolation for unlit pixels and CENTROID
> interpolation
> - * for lit pixels, so we need both sets of barycentric coordinates.
> - */
> - enum brw_barycentric_mode bary_mode =
> - barycentric_mode((glsl_interp_mode) var->data.interpolation,
> - var->data.centroid, var->data.sample);
> + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
> + if (intrin->intrinsic !=
> nir_intrinsic_load_interpolated_input)
> + continue;
>
Any particular reason why you're looking at the source of the load rather
than just looking for the load_barycentric intrinsic directly?
> +
> + /* Ignore WPOS; it doesn't require interpolation. */
> + if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
> + continue;
>
Ugh...
>
> - barycentric_interp_modes |= 1 << bary_mode;
> + intrin =
> nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
> + enum glsl_interp_mode interp = (enum glsl_interp_mode)
> + nir_intrinsic_interp_mode(intrin);
> + nir_intrinsic_op bary_op = intrin->intrinsic;
> + enum brw_barycentric_mode bary =
> + brw_barycentric_mode(interp, bary_op);
>
> - if (var->data.centroid && devinfo->needs_unlit_centroid_workaround)
> - barycentric_interp_modes |= 1 << centroid_to_pixel(bary_mode);
> + barycentric_interp_modes |= 1 << bary;
> +
> + if (devinfo->needs_unlit_centroid_workaround &&
> + bary_op == nir_intrinsic_load_barycentric_centroid)
> + barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
> + }
> + }
> }
>
> return barycentric_interp_modes;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h
> b/src/mesa/drivers/dri/i965/brw_fs.h
> index 7998f51..574475f 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -174,11 +174,6 @@ public:
> fs_reg *emit_samplepos_setup();
> fs_reg *emit_sampleid_setup();
> fs_reg *emit_samplemaskin_setup();
> - void emit_general_interpolation(fs_reg *attr, const char *name,
> - const glsl_type *type,
> - glsl_interp_mode interpolation_mode,
> - int *location, bool mod_centroid,
> - bool mod_sample);
> fs_reg *emit_vs_system_value(int location);
> void emit_interpolation_setup_gen4();
> void emit_interpolation_setup_gen6();
> @@ -195,7 +190,6 @@ public:
> bool opt_zero_samples();
>
> void emit_nir_code();
> - void nir_setup_inputs();
> void nir_setup_single_output_varying(fs_reg *reg, const glsl_type
> *type,
> unsigned *location);
> void nir_setup_outputs();
> @@ -511,3 +505,6 @@ void shuffle_64bit_data_for_32bit_write(const
> brw::fs_builder &bld,
> uint32_t components);
> fs_reg setup_imm_df(const brw::fs_builder &bld,
> double v);
> +
> +enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode,
> + nir_intrinsic_op op);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 6265dc6..610c151 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -36,7 +36,6 @@ fs_visitor::emit_nir_code()
> /* emit the arrays used for inputs and outputs - load/store intrinsics
> will
> * be converted to reads/writes of these arrays
> */
> - nir_setup_inputs();
> nir_setup_outputs();
> nir_setup_uniforms();
> nir_emit_system_values();
> @@ -50,38 +49,6 @@ fs_visitor::emit_nir_code()
> }
>
> void
> -fs_visitor::nir_setup_inputs()
> -{
> - if (stage != MESA_SHADER_FRAGMENT)
> - return;
> -
> - nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
> -
> - nir_foreach_variable(var, &nir->inputs) {
> - fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
> -
> - fs_reg reg;
> - if (var->data.location == VARYING_SLOT_POS) {
> - emit_fragcoord_interpolation(input);
> - } else if (var->data.location == VARYING_SLOT_LAYER) {
> - struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER,
> 1), 3);
> - reg.type = BRW_REGISTER_TYPE_D;
> - bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D),
> reg);
> - } else if (var->data.location == VARYING_SLOT_VIEWPORT) {
> - struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT,
> 2), 3);
> - reg.type = BRW_REGISTER_TYPE_D;
> - bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D),
> reg);
> - } else {
> - int location = var->data.location;
> - emit_general_interpolation(&input, var->name, var->type,
> - (glsl_interp_mode)
> var->data.interpolation,
> - &location, var->data.centroid,
> - var->data.sample);
> - }
> - }
> -}
> -
> -void
> fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
> const glsl_type *type,
> unsigned *location)
> @@ -3063,7 +3030,6 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder
> &bld,
> nir_intrinsic_instr *instr)
> {
> assert(stage == MESA_SHADER_FRAGMENT);
> - const struct brw_wm_prog_key *wm_key = (const struct brw_wm_prog_key
> *) key;
>
> fs_reg dest;
> if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> @@ -3120,189 +3086,245 @@ fs_visitor::nir_emit_fs_intrinsic(const
> fs_builder &bld,
> break;
> }
>
> - case nir_intrinsic_interp_var_at_centroid:
> - case nir_intrinsic_interp_var_at_sample:
> - case nir_intrinsic_interp_var_at_offset: {
> - /* Handle ARB_gpu_shader5 interpolation intrinsics
> - *
> - * It's worth a quick word of explanation as to why we handle the
> full
> - * variable-based interpolation intrinsic rather than a lowered
> version
> - * with like we do for other inputs. We have to do that because
> the way
> - * we set up inputs doesn't allow us to use the already setup
> inputs for
> - * interpolation. At the beginning of the shader, we go through
> all of
> - * the input variables and do the initial interpolation and put it
> in
> - * the nir_inputs array based on its location as determined in
> - * nir_lower_io. If the input isn't used, dead code cleans up and
> - * everything works fine. However, when we get to the
> ARB_gpu_shader5
> - * interpolation intrinsics, we need to reinterpolate the input
> - * differently. If we used an intrinsic that just had an index it
> would
> - * only give us the offset into the nir_inputs array. However,
> this is
> - * useless because that value is post-interpolation and we need
> - * pre-interpolation. In order to get the actual location of the
> bits
> - * we get from the vertex fetching hardware, we need the variable.
> - */
> - fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
> - const glsl_interp_mode interpolation =
> - (glsl_interp_mode) instr->variables[0]->var->data.interpolation;
> + case nir_intrinsic_load_input: {
> + /* load_input is only used for flat inputs */
> + unsigned base = nir_intrinsic_base(instr);
> + unsigned component = nir_intrinsic_component(instr);
> + unsigned num_components = instr->num_components;
> + enum brw_reg_type type = dest.type;
>
> - switch (instr->intrinsic) {
> - case nir_intrinsic_interp_var_at_centroid:
> - emit_pixel_interpolater_send(bld,
> - FS_OPCODE_INTERPOLATE_AT_CENTROID,
> - dst_xy,
> - fs_reg(), /* src */
> - brw_imm_ud(0u),
> - interpolation);
> - break;
> + /* Special case fields in the VUE header */
> + if (base == VARYING_SLOT_LAYER)
> + component = 1;
> + else if (base == VARYING_SLOT_VIEWPORT)
> + component = 2;
>
> - case nir_intrinsic_interp_var_at_sample: {
> - if (!wm_key->multisample_fbo) {
> - /* From the ARB_gpu_shader5 specification:
> - * "If multisample buffers are not available, the input
> varying
> - * will be evaluated at the center of the pixel."
> - */
> - emit_pixel_interpolater_send(bld,
> -
> FS_OPCODE_INTERPOLATE_AT_CENTROID,
> - dst_xy,
> - fs_reg(), /* src */
> - brw_imm_ud(0u),
> - interpolation);
> - break;
> - }
> + if (nir_dest_bit_size(instr->dest) == 64) {
> + /* const_index is in 32-bit type size units that could not be
> aligned
> + * with DF. We need to read the double vector as if it was a
> float
> + * vector of twice the number of components to fetch the right
> data.
> + */
> + type = BRW_REGISTER_TYPE_F;
> + num_components *= 2;
> + }
>
> - nir_const_value *const_sample =
> nir_src_as_const_value(instr->src[0]);
> + for (unsigned int i = 0; i < num_components; i++) {
> + struct brw_reg interp = interp_reg(base, component + i);
> + interp = suboffset(interp, 3);
> + bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
> + retype(fs_reg(interp), type));
> + }
>
> - if (const_sample) {
> - unsigned msg_data = const_sample->i32[0] << 4;
> + if (nir_dest_bit_size(instr->dest) == 64) {
> + shuffle_32bit_load_result_to_64bit_data(bld,
> + dest,
> + retype(dest, type),
> + instr->num_components);
> + }
> + break;
> + }
> +
> + case nir_intrinsic_load_barycentric_pixel:
> + case nir_intrinsic_load_barycentric_centroid:
> + case nir_intrinsic_load_barycentric_sample:
> + /* Do nothing - load_interpolated_input handling will handle it
> later. */
> + break;
>
I'm very courious why you made this choice. It seems odd to me.
>
> + case nir_intrinsic_load_barycentric_at_sample: {
> + const glsl_interp_mode interpolation =
> + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
> +
> + nir_const_value *const_sample =
> nir_src_as_const_value(instr->src[0]);
> +
> + if (const_sample) {
> + unsigned msg_data = const_sample->i32[0] << 4;
> +
> + emit_pixel_interpolater_send(bld,
> + FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> + dest,
> + fs_reg(), /* src */
> + brw_imm_ud(msg_data),
> + interpolation);
> + } else {
> + const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
> + BRW_REGISTER_TYPE_UD);
> +
> + if (nir_src_is_dynamically_uniform(instr->src[0])) {
> + const fs_reg sample_id = bld.emit_uniformize(sample_src);
> + const fs_reg msg_data = vgrf(glsl_type::uint_type);
> + bld.exec_all().group(1, 0)
> + .SHL(msg_data, sample_id, brw_imm_ud(4u));
> emit_pixel_interpolater_send(bld,
> FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> - dst_xy,
> + dest,
> fs_reg(), /* src */
> - brw_imm_ud(msg_data),
> + msg_data,
> interpolation);
> } else {
> - const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
> - BRW_REGISTER_TYPE_UD);
> -
> - if (nir_src_is_dynamically_uniform(instr->src[0])) {
> - const fs_reg sample_id = bld.emit_uniformize(sample_src);
> - const fs_reg msg_data = vgrf(glsl_type::uint_type);
> - bld.exec_all().group(1, 0)
> - .SHL(msg_data, sample_id, brw_imm_ud(4u));
> + /* Make a loop that sends a message to the pixel interpolater
> + * for the sample number in each live channel. If there are
> + * multiple channels with the same sample number then these
> + * will be handled simultaneously with a single interation of
> + * the loop.
> + */
> + bld.emit(BRW_OPCODE_DO);
> +
> + /* Get the next live sample number into sample_id_reg */
> + const fs_reg sample_id = bld.emit_uniformize(sample_src);
> +
> + /* Set the flag register so that we can perform the send
> + * message on all channels that have the same sample number
> + */
> + bld.CMP(bld.null_reg_ud(),
> + sample_src, sample_id,
> + BRW_CONDITIONAL_EQ);
> + const fs_reg msg_data = vgrf(glsl_type::uint_type);
> + bld.exec_all().group(1, 0)
> + .SHL(msg_data, sample_id, brw_imm_ud(4u));
> + fs_inst *inst =
> emit_pixel_interpolater_send(bld,
>
> FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> - dst_xy,
> + dest,
> fs_reg(), /* src */
> msg_data,
> interpolation);
> - } else {
> - /* Make a loop that sends a message to the pixel
> interpolater
> - * for the sample number in each live channel. If there are
> - * multiple channels with the same sample number then these
> - * will be handled simultaneously with a single interation
> of
> - * the loop.
> - */
> - bld.emit(BRW_OPCODE_DO);
> -
> - /* Get the next live sample number into sample_id_reg */
> - const fs_reg sample_id = bld.emit_uniformize(sample_src);
> + set_predicate(BRW_PREDICATE_NORMAL, inst);
>
> - /* Set the flag register so that we can perform the send
> - * message on all channels that have the same sample number
> - */
> - bld.CMP(bld.null_reg_ud(),
> - sample_src, sample_id,
> - BRW_CONDITIONAL_EQ);
> - const fs_reg msg_data = vgrf(glsl_type::uint_type);
> - bld.exec_all().group(1, 0)
> - .SHL(msg_data, sample_id, brw_imm_ud(4u));
> - fs_inst *inst =
> - emit_pixel_interpolater_send(bld,
> -
> FS_OPCODE_INTERPOLATE_AT_SAMPLE,
> - dst_xy,
> - fs_reg(), /* src */
> - msg_data,
> - interpolation);
> - set_predicate(BRW_PREDICATE_NORMAL, inst);
> -
> - /* Continue the loop if there are any live channels left */
> - set_predicate_inv(BRW_PREDICATE_NORMAL,
> - true, /* inverse */
> - bld.emit(BRW_OPCODE_WHILE));
> - }
> + /* Continue the loop if there are any live channels left */
> + set_predicate_inv(BRW_PREDICATE_NORMAL,
> + true, /* inverse */
> + bld.emit(BRW_OPCODE_WHILE));
> }
> -
> - break;
> }
> + break;
> + }
>
> - case nir_intrinsic_interp_var_at_offset: {
> - nir_const_value *const_offset =
> nir_src_as_const_value(instr->src[0]);
> + case nir_intrinsic_load_barycentric_at_offset: {
> + const glsl_interp_mode interpolation =
> + (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
>
> - if (const_offset) {
> - unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) &
> 0xf;
> - unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) &
> 0xf;
> + nir_const_value *const_offset =
> nir_src_as_const_value(instr->src[0]);
>
> - emit_pixel_interpolater_send(bld,
> -
> FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> - dst_xy,
> - fs_reg(), /* src */
> - brw_imm_ud(off_x | (off_y << 4)),
> - interpolation);
> - } else {
> - fs_reg src = vgrf(glsl_type::ivec2_type);
> - fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> - BRW_REGISTER_TYPE_F);
> - for (int i = 0; i < 2; i++) {
> - fs_reg temp = vgrf(glsl_type::float_type);
> - bld.MUL(temp, offset(offset_src, bld, i),
> brw_imm_f(16.0f));
> - fs_reg itemp = vgrf(glsl_type::int_type);
> - /* float to int */
> - bld.MOV(itemp, temp);
> -
> - /* Clamp the upper end of the range to +7/16.
> - * ARB_gpu_shader5 requires that we support a maximum
> offset
> - * of +0.5, which isn't representable in a S0.4 value -- if
> - * we didn't clamp it, we'd end up with -8/16, which is the
> - * opposite of what the shader author wanted.
> - *
> - * This is legal due to ARB_gpu_shader5's quantization
> - * rules:
> - *
> - * "Not all values of <offset> may be supported; x and y
> - * offsets may be rounded to fixed-point values with the
> - * number of fraction bits given by the
> - * implementation-dependent constant
> - * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> - */
> - set_condmod(BRW_CONDITIONAL_L,
> - bld.SEL(offset(src, bld, i), itemp,
> brw_imm_d(7)));
> - }
> + if (const_offset) {
> + unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
> + unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
>
> - const enum opcode opcode =
> FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> - emit_pixel_interpolater_send(bld,
> - opcode,
> - dst_xy,
> - src,
> - brw_imm_ud(0u),
> - interpolation);
> + emit_pixel_interpolater_send(bld,
> +
> FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
> + dest,
> + fs_reg(), /* src */
> + brw_imm_ud(off_x | (off_y << 4)),
> + interpolation);
> + } else {
> + fs_reg src = vgrf(glsl_type::ivec2_type);
> + fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> + BRW_REGISTER_TYPE_F);
> + for (int i = 0; i < 2; i++) {
> + fs_reg temp = vgrf(glsl_type::float_type);
> + bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
> + fs_reg itemp = vgrf(glsl_type::int_type);
> + /* float to int */
> + bld.MOV(itemp, temp);
> +
> + /* Clamp the upper end of the range to +7/16.
> + * ARB_gpu_shader5 requires that we support a maximum offset
> + * of +0.5, which isn't representable in a S0.4 value -- if
> + * we didn't clamp it, we'd end up with -8/16, which is the
> + * opposite of what the shader author wanted.
> + *
> + * This is legal due to ARB_gpu_shader5's quantization
> + * rules:
> + *
> + * "Not all values of <offset> may be supported; x and y
> + * offsets may be rounded to fixed-point values with the
> + * number of fraction bits given by the
> + * implementation-dependent constant
> + * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> + */
> + set_condmod(BRW_CONDITIONAL_L,
> + bld.SEL(offset(src, bld, i), itemp,
> brw_imm_d(7)));
> }
> +
> + const enum opcode opcode =
> FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
> + emit_pixel_interpolater_send(bld,
> + opcode,
> + dest,
> + src,
> + brw_imm_ud(0u),
> + interpolation);
> + }
> + break;
> + }
> +
> + case nir_intrinsic_load_interpolated_input: {
> + if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
> + emit_fragcoord_interpolation(dest);
> break;
> }
>
> - default:
> - unreachable("Invalid intrinsic");
> + assert(instr->src[0].ssa &&
> + instr->src[0].ssa->parent_instr->type ==
> nir_instr_type_intrinsic);
> + nir_intrinsic_instr *bary_intrinsic =
> + nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
> + nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
> + enum glsl_interp_mode interp_mode =
> + (enum glsl_interp_mode)
> nir_intrinsic_interp_mode(bary_intrinsic);
> + fs_reg dst_xy;
> +
> + if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
> + bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
> + /* Use the result of the PI message */
> + dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
> + } else {
> + /* Use the delta_xy values computed from the payload */
> + enum brw_barycentric_mode bary =
> + brw_barycentric_mode(interp_mode, bary_intrin);
> +
> + dst_xy = this->delta_xy[bary];
> }
>
> - for (unsigned j = 0; j < instr->num_components; j++) {
> - fs_reg src = interp_reg(instr->variables[0]->var->data.location,
> j);
> - src.type = dest.type;
> + for (unsigned int i = 0; i < instr->num_components; i++) {
> + fs_reg interp =
> + fs_reg(interp_reg(nir_intrinsic_base(instr),
> + nir_intrinsic_component(instr) + i));
> + interp.type = BRW_REGISTER_TYPE_F;
> + dest.type = BRW_REGISTER_TYPE_F;
>
> - bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
> - dest = offset(dest, bld, 1);
> + if (devinfo->needs_unlit_centroid_workaround &&
> + bary_intrin == nir_intrinsic_load_barycentric_centroid) {
> +
> + /* Get the pixel/sample mask into f0 so that we know which
> + * pixels are lit. Then, for each channel that is unlit,
> + * replace the centroid data with non-centroid data.
> + */
> + bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
> +
> + fs_reg dest_i = offset(dest, bld, i);
> + fs_reg dst_xy_pixel =
> + delta_xy[brw_barycentric_mode(interp_mode,
> + nir_intrinsic_load_barycentric_pixel)];
> +
> + fs_inst *inst;
> + inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy_pixel,
> interp);
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + inst->predicate_inverse = true;
> + inst->no_dd_clear = true;
> +
> + inst = bld.emit(FS_OPCODE_LINTERP, dest_i, dst_xy, interp);
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + inst->predicate_inverse = false;
> + inst->no_dd_check = true;
> + } else if (devinfo->gen < 6 && interp_mode ==
> INTERP_MODE_SMOOTH) {
> + fs_reg tmp = vgrf(glsl_type::float_type);
> + bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
> + bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
> + } else {
> + bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy,
> interp);
> + }
>
Ugh... I think I see why you're handling the barycentric load here now.
Eventually, I think we could probably be doing this better but I'm not
seeing an easy path at the moment.
> }
> break;
> }
> +
> default:
> nir_emit_intrinsic(bld, instr);
> break;
> @@ -3869,26 +3891,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder
> &bld, nir_intrinsic_instr *instr
> }
>
> case nir_intrinsic_load_input: {
> - fs_reg src;
> + fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
> unsigned num_components = instr->num_components;
> enum brw_reg_type type = dest.type;
>
> - if (stage == MESA_SHADER_VERTEX) {
> - src = fs_reg(ATTR, instr->const_index[0], dest.type);
> - } else {
> - assert(type_sz(type) >= 4);
> - if (type == BRW_REGISTER_TYPE_DF) {
> - /* const_index is in 32-bit type size units that could not be
> aligned
> - * with DF. We need to read the double vector as if it was a
> float
> - * vector of twice the number of components to fetch the
> right data.
> - */
> - dest = retype(dest, BRW_REGISTER_TYPE_F);
> - num_components *= 2;
> - }
> - src = offset(retype(nir_inputs, dest.type), bld,
> - instr->const_index[0]);
> - }
>
This hunk seems odd... It doesn't look like it really has anything to do
with FS. Is that else clause really FS-only or does it apply to GS and
tess stages?
> -
> nir_const_value *const_offset =
> nir_src_as_const_value(instr->src[0]);
> assert(const_offset && "Indirect input loads not allowed");
> src = offset(src, bld, const_offset->u32[0]);
> diff --git a/src/mesa/drivers/dri/i965/brw_nir.c
> b/src/mesa/drivers/dri/i965/brw_nir.c
> index caf9fe0..d1a823a 100644
> --- a/src/mesa/drivers/dri/i965/brw_nir.c
> +++ b/src/mesa/drivers/dri/i965/brw_nir.c
> @@ -30,7 +30,8 @@ static bool
> is_input(nir_intrinsic_instr *intrin)
> {
> return intrin->intrinsic == nir_intrinsic_load_input ||
> - intrin->intrinsic == nir_intrinsic_load_per_vertex_input;
> + intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
> + intrin->intrinsic == nir_intrinsic_load_interpolated_input;
> }
>
> static bool
> @@ -282,9 +283,16 @@ brw_nir_lower_tes_inputs(nir_shader *nir, const
> struct brw_vue_map *vue_map)
> void
> brw_nir_lower_fs_inputs(nir_shader *nir)
> {
> - nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
> VARYING_SLOT_VAR0,
> - type_size_scalar);
> - nir_lower_io(nir, nir_var_shader_in, type_size_scalar, false);
> + foreach_list_typed(nir_variable, var, node, &nir->inputs) {
> + var->data.driver_location = var->data.location;
> + }
> +
> + nir_lower_io(nir, nir_var_shader_in, type_size_vec4, true);
> +
> + /* This pass needs actual constants */
> + nir_opt_constant_folding(nir);
> +
> + add_const_offset_to_base(nir, nir_var_shader_in);
> }
>
> void
> --
> 2.9.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20160719/0f11e518/attachment-0001.html>
More information about the mesa-dev
mailing list