[Mesa-dev] [PATCH v2 2/2] i965/fs: Handle non-const sample number in interpolateAtSample

Francisco Jerez currojerez at riseup.net
Fri Oct 2 06:16:49 PDT 2015


Matt Turner <mattst88 at gmail.com> writes:

> On Fri, Aug 7, 2015 at 9:31 AM, Neil Roberts <neil at linux.intel.com> wrote:
>> If a non-const sample number is given to interpolateAtSample it will
>> now generate an indirect send message with the sample ID similar to
>> how non-const sampler array indexing works. Previously non-const
>> values were ignored and instead it ended up using a constant 0 value.
>>
>> The generator will try to determine if the sample ID is dynamically
>> uniform via nir_src_is_dynamically_uniform. If not it will query the
>> pixel interpolator in a loop, once for each possible sample number.
>> This is necessary because the indirect send message doesn't seem to
>> have a way to specify a different value for each fragment.
>
> Heh, I think this was the solution Curro had in mind a couple of years ago. :)
>
> I've Cc'd him. It'd be good for him to review this.
>
Sigh, it's really awful that our hardware only supports a single sample
index for the whole SIMD thread...  I was thinking though that there
might be a better alternative to running the sample-index interpolator
query in a loop: The "Per Slot Offset" interpolator query does allow
independent offsets for each channel, couldn't we pass the hard-coded
table of sample offsets (see brw_multisample_state.h) to the shader
(e.g. as push constants or hard-coded as vector immediates), use VxH
indirect indexing to fetch the right offset for each channel and then
lower it into a non-dynamically-uniform interpolateAtOffset, which the
hardware can handle natively?

>> The range of possible sample numbers is determined using
>> STATE_NUM_SAMPLES. When linking the shader it will now add a reference
>> to this state if any dynamically non-uniform calls to
>> interpolateAtSample are found.
>>
>> This fixes the following two Piglit tests:
>>
>> arb_gpu_shader5-interpolateAtSample-nonconst
>> arb_gpu_shader5-interpolateAtSample-dynamically-nonuniform
>>
>> v2: Handle dynamically non-uniform sample ids.
>> ---
>>  src/mesa/drivers/dri/i965/brw_eu.h             |   2 +-
>>  src/mesa/drivers/dri/i965/brw_eu_emit.c        |  34 ++++---
>>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp |   5 +-
>>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp       | 119 +++++++++++++++++++++----
>>  src/mesa/drivers/dri/i965/brw_program.c        |  54 +++++++++++
>>  src/mesa/drivers/dri/i965/brw_program.h        |   1 +
>>  src/mesa/drivers/dri/i965/brw_shader.cpp       |   2 +
>>  7 files changed, 185 insertions(+), 32 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
>> index 761aa0e..0ac1ad9 100644
>> --- a/src/mesa/drivers/dri/i965/brw_eu.h
>> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
>> @@ -461,7 +461,7 @@ brw_pixel_interpolator_query(struct brw_codegen *p,
>>                               struct brw_reg mrf,
>>                               bool noperspective,
>>                               unsigned mode,
>> -                             unsigned data,
>> +                             struct brw_reg data,
>>                               unsigned msg_length,
>>                               unsigned response_length);
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
>> index 4d39762..25524d4 100644
>> --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
>> +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
>> @@ -3192,26 +3192,38 @@ brw_pixel_interpolator_query(struct brw_codegen *p,
>>                               struct brw_reg mrf,
>>                               bool noperspective,
>>                               unsigned mode,
>> -                             unsigned data,
>> +                             struct brw_reg data,
>>                               unsigned msg_length,
>>                               unsigned response_length)
>>  {
>>     const struct brw_device_info *devinfo = p->devinfo;
>> -   struct brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
>> +   struct brw_inst *insn;
>> +   uint16_t exec_size;
>>
>> -   brw_set_dest(p, insn, dest);
>> -   brw_set_src0(p, insn, mrf);
>> -   brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR,
>> -                              msg_length, response_length,
>> -                              false /* header is never present for PI */,
>> -                              false);
>> +   if (data.file == BRW_IMMEDIATE_VALUE) {
>> +      insn = next_insn(p, BRW_OPCODE_SEND);
>> +      brw_set_dest(p, insn, dest);
>> +      brw_set_src0(p, insn, mrf);
>> +      brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR,
>> +                                 msg_length, response_length,
>> +                                 false /* header is never present for PI */,
>> +                                 false);
>> +      brw_inst_set_pi_message_data(devinfo, insn, data.dw1.ud);
>> +   } else {
>> +      insn = brw_send_indirect_message(p,
>> +                                       GEN7_SFID_PIXEL_INTERPOLATOR,
>> +                                       dest,
>> +                                       mrf,
>> +                                       vec1(data));
>> +      brw_inst_set_mlen(devinfo, insn, msg_length);
>> +      brw_inst_set_rlen(devinfo, insn, response_length);
>> +   }
>>
>> -   brw_inst_set_pi_simd_mode(
>> -         devinfo, insn, brw_inst_exec_size(devinfo, insn) == BRW_EXECUTE_16);
>> +   exec_size = brw_inst_exec_size(devinfo, p->current);
>> +   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
>>     brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
>>     brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
>>     brw_inst_set_pi_message_type(devinfo, insn, mode);
>> -   brw_inst_set_pi_message_data(devinfo, insn, data);
>>  }
>>
>>  void
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> index c86ca04..88dbc62 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> @@ -1328,15 +1328,14 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
>>                                                  struct brw_reg msg_data,
>>                                                  unsigned msg_type)
>>  {
>> -   assert(msg_data.file == BRW_IMMEDIATE_VALUE &&
>> -          msg_data.type == BRW_REGISTER_TYPE_UD);
>> +   assert(msg_data.type == BRW_REGISTER_TYPE_UD);
>>
>>     brw_pixel_interpolator_query(p,
>>           retype(dst, BRW_REGISTER_TYPE_UW),
>>           src,
>>           inst->pi_noperspective,
>>           msg_type,
>> -         msg_data.dw1.ud,
>> +         msg_data,
>>           inst->mlen,
>>           inst->regs_written);
>>  }
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> index ee964a0..0fdb9ae 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> @@ -1167,6 +1167,47 @@ fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
>>     }
>>  }
>>
>> +/* For most messages, we need one reg of ignored data; the hardware requires
>> + * mlen==1 even when there is no payload. in the per-slot offset case, we'll
>> + * replace this with the proper source data.
>> + */
>> +static void
>> +setup_pixel_interpolater_instruction(fs_visitor *v,
>> +                                     nir_intrinsic_instr *instr,
>> +                                     fs_inst *inst,
>> +                                     int mlen = 1)
>> +{
>> +   inst->mlen = mlen;
>> +   /* 2 floats per slot returned */
>> +   inst->regs_written = 2 * v->dispatch_width / 8;
>> +   inst->pi_noperspective = instr->variables[0]->var->data.interpolation ==
>> +      INTERP_QUALIFIER_NOPERSPECTIVE;
>> +}
>> +
>> +static fs_reg
>> +get_num_samples_reg(fs_visitor *v)
>> +{
>> +   struct gl_program_parameter_list *params = v->prog->Parameters;
>> +   static gl_state_index tokens[STATE_LENGTH] = {
>
> I suspect this isn't thread-safe.
>
>> +      STATE_NUM_SAMPLES
>> +   };
>> +   GLuint index = _mesa_add_state_reference(params, tokens);
>> +   unsigned i;
>> +
>> +   /* Try to find an existing copy of the uniform */
>> +   for (i = 0; i < v->uniforms; i++) {
>> +      if (v->stage_prog_data->param[i] ==
>> +          &v->prog->Parameters->ParameterValues[index][0])
>> +         goto found;
>> +   }
>> +
>> +   v->stage_prog_data->param[v->uniforms++] =
>> +      &v->prog->Parameters->ParameterValues[index][0];
>> +
>> +found:
>> +   return retype(fs_reg(UNIFORM, i), BRW_REGISTER_TYPE_UD);
>> +}
>> +
>>  void
>>  fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
>>  {
>> @@ -1438,27 +1479,73 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>>
>>        fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
>>
>> -      /* For most messages, we need one reg of ignored data; the hardware
>> -       * requires mlen==1 even when there is no payload. in the per-slot
>> -       * offset case, we'll replace this with the proper source data.
>> -       */
>>        fs_reg src = vgrf(glsl_type::float_type);
>> -      int mlen = 1;     /* one reg unless overriden */
>>        fs_inst *inst;
>>
>>        switch (instr->intrinsic) {
>>        case nir_intrinsic_interp_var_at_centroid:
>>           inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_CENTROID,
>>                           dst_xy, src, fs_reg(0u));
>> +         setup_pixel_interpolater_instruction(this, instr, inst);
>>           break;
>>
>>        case nir_intrinsic_interp_var_at_sample: {
>> -         /* XXX: We should probably handle non-constant sample id's */
>>           nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
>> -         assert(const_sample);
>> -         unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
>> -         inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
>> -                         fs_reg(msg_data));
>> +
>> +         if (const_sample) {
>> +            unsigned msg_data = const_sample->i[0] << 4;
>> +
>> +            inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
>> +                            fs_reg(msg_data));
>> +
>> +            setup_pixel_interpolater_instruction(this, instr, inst);
>> +         } else {
>> +            fs_reg sample_src = retype(get_nir_src(instr->src[0]),
>> +                                       BRW_REGISTER_TYPE_UD);
>> +            fs_reg sample_id_reg;
>> +
>> +            if (nir_src_is_dynamically_uniform(instr->src[0])) {
>> +               sample_id_reg = vgrf(glsl_type::uint_type);
>> +               bld.SHL(sample_id_reg, sample_src, fs_reg(4u));
>> +               sample_id_reg = bld.emit_uniformize(sample_id_reg);
>> +               inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
>> +                               sample_id_reg);
>> +               setup_pixel_interpolater_instruction(this, instr, inst);
>> +            } else {
>> +               /* Make a loop that sends a message to the pixel interpolator
>> +                * for each possible sample number so that each individual
>> +                * message will be dynamically uniform. The number of samples
>> +                * is determined by accessing the STATE_NUM_SAMPLES state var.
>> +                */
>> +               fs_reg i_reg = vgrf(glsl_type::uint_type);
>> +               fs_reg sample_id_reg = vgrf(glsl_type::uint_type);
>> +               fs_reg num_samples_reg = get_num_samples_reg(this);
>> +
>> +               bld.MOV(i_reg, fs_reg(0u));
>> +
>> +               bld.emit(BRW_OPCODE_DO);
>> +
>> +               bld.CMP(bld.null_reg_ud(),
>> +                       sample_src, i_reg,
>> +                       BRW_CONDITIONAL_EQ);
>
> I think you might be able to put all of this on one line.
>
>> +               bld.IF(BRW_PREDICATE_NORMAL);
>> +               bld.SHL(sample_id_reg, i_reg, fs_reg(4u));
>> +               sample_id_reg = bld.emit_uniformize(sample_id_reg);
>> +               inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
>> +                               sample_id_reg);
>> +               setup_pixel_interpolater_instruction(this, instr, inst);
>> +               bld.emit(BRW_OPCODE_ENDIF);
>> +
>> +               bld.ADD(i_reg, i_reg, fs_reg(1u));
>> +               bld.CMP(bld.null_reg_ud(),
>> +                       i_reg, num_samples_reg,
>> +                       BRW_CONDITIONAL_GE);
>
> I think you might be able to put all of this on one line.
>
> If you want, you could reverse the order of the loop (that is, iterate
> from num_samples_reg down to 0) and then save the
> opt_cmod_propagation() pass some work and just emit an ADD with a a
> conditional_mod in order to get rid of the CMP instruction.
>
>> +               inst = bld.emit(BRW_OPCODE_BREAK);
>> +               inst->predicate = BRW_PREDICATE_NORMAL;
>> +               bld.emit(BRW_OPCODE_WHILE);
>
> You can actually get rid of the BREAK by predicating the WHILE (and
> setting predicate_inverse on it). There's also a nice
> set_predicate(predicate, inst) function you should use.
>
> If you want to do that, I think it'll take a small amount of work in
> the BRW_OPCODE_WHILE case in brw_cfg.cpp to understand that a
> predicated WHILE has two successors.
>
>> +            }
>> +         }
>> +
>>           break;
>>        }
>>
>> @@ -1471,6 +1558,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>>
>>              inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
>>                              fs_reg(off_x | (off_y << 4)));
>> +            setup_pixel_interpolater_instruction(this, instr, inst);
>>           } else {
>>              src = vgrf(glsl_type::ivec2_type);
>>              fs_reg offset_src = retype(get_nir_src(instr->src[0]),
>> @@ -1500,9 +1588,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>>                             bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
>>              }
>>
>> -            mlen = 2 * dispatch_width / 8;
>>              inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
>>                              fs_reg(0u));
>> +            setup_pixel_interpolater_instruction(this,
>> +                                                 instr,
>> +                                                 inst,
>> +                                                 2 * dispatch_width / 8);
>>           }
>>           break;
>>        }
>> @@ -1511,12 +1602,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>>           unreachable("Invalid intrinsic");
>>        }
>>
>> -      inst->mlen = mlen;
>> -      /* 2 floats per slot returned */
>> -      inst->regs_written = 2 * dispatch_width / 8;
>> -      inst->pi_noperspective = instr->variables[0]->var->data.interpolation ==
>> -                               INTERP_QUALIFIER_NOPERSPECTIVE;
>> -
>>        for (unsigned j = 0; j < instr->num_components; j++) {
>>           fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
>>           src.type = dest.type;
>> diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
>> index 467a893..6430c5d 100644
>> --- a/src/mesa/drivers/dri/i965/brw_program.c
>> +++ b/src/mesa/drivers/dri/i965/brw_program.c
>> @@ -146,6 +146,8 @@ brwProgramStringNotify(struct gl_context *ctx,
>>           prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
>>        }
>>
>> +      brw_add_interpolate_at_sample_params(prog);
>> +
>>        brw_fs_precompile(ctx, NULL, prog);
>>        break;
>>     }
>> @@ -246,6 +248,58 @@ brw_add_texrect_params(struct gl_program *prog)
>>     }
>>  }
>>
>> +static bool
>> +find_interpolate_at_sample_in_block(nir_block *block,
>> +                                    void *data)
>> +{
>> +   nir_foreach_instr_safe(block, instr) {
>
> I don't think you need _safe here. At least I can't see why it's necessary.
>
>> +      if (instr->type != nir_instr_type_intrinsic)
>> +         continue;
>> +
>> +      nir_intrinsic_instr *intrinsic_instr = nir_instr_as_intrinsic(instr);
>> +
>> +      if (intrinsic_instr->intrinsic != nir_intrinsic_interp_var_at_sample)
>> +         continue;
>> +
>> +      /* If the sample number is known to be dynamically uniform then
>> +       * the generator won't need the num_samples state.
>> +       */
>> +      if (nir_src_is_dynamically_uniform(intrinsic_instr->src[0]))
>> +         continue;
>> +
>> +      return false;
>> +   }
>> +
>> +   return true;
>> +}
>> +
>> +void
>> +brw_add_interpolate_at_sample_params(struct gl_program *prog)
>> +{
>> +   static gl_state_index tokens[STATE_LENGTH] = {
>
> I suspect this isn't thread-safe.
>
>> +      STATE_NUM_SAMPLES
>> +   };
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 212 bytes
Desc: not available
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20151002/52636dfd/attachment-0001.sig>


More information about the mesa-dev mailing list