[Mesa-dev] [PATCH v2 2/2] i965/fs: Handle non-const sample number in interpolateAtSample
Matt Turner
mattst88 at gmail.com
Thu Oct 1 16:15:05 PDT 2015
On Fri, Aug 7, 2015 at 9:31 AM, Neil Roberts <neil at linux.intel.com> wrote:
> If a non-const sample number is given to interpolateAtSample it will
> now generate an indirect send message with the sample ID similar to
> how non-const sampler array indexing works. Previously non-const
> values were ignored and instead it ended up using a constant 0 value.
>
> The generator will try to determine if the sample ID is dynamically
> uniform via nir_src_is_dynamically_uniform. If not it will query the
> pixel interpolator in a loop, once for each possible sample number.
> This is necessary because the indirect send message doesn't seem to
> have a way to specify a different value for each fragment.
Heh, I think this was the solution Curro had in mind a couple of years ago. :)
I've Cc'd him. It'd be good for him to review this.
> The range of possible sample numbers is determined using
> STATE_NUM_SAMPLES. When linking the shader it will now add a reference
> to this state if any dynamically non-uniform calls to
> interpolateAtSample are found.
>
> This fixes the following two Piglit tests:
>
> arb_gpu_shader5-interpolateAtSample-nonconst
> arb_gpu_shader5-interpolateAtSample-dynamically-nonuniform
>
> v2: Handle dynamically non-uniform sample ids.
> ---
> src/mesa/drivers/dri/i965/brw_eu.h | 2 +-
> src/mesa/drivers/dri/i965/brw_eu_emit.c | 34 ++++---
> src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 5 +-
> src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 119 +++++++++++++++++++++----
> src/mesa/drivers/dri/i965/brw_program.c | 54 +++++++++++
> src/mesa/drivers/dri/i965/brw_program.h | 1 +
> src/mesa/drivers/dri/i965/brw_shader.cpp | 2 +
> 7 files changed, 185 insertions(+), 32 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
> index 761aa0e..0ac1ad9 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu.h
> +++ b/src/mesa/drivers/dri/i965/brw_eu.h
> @@ -461,7 +461,7 @@ brw_pixel_interpolator_query(struct brw_codegen *p,
> struct brw_reg mrf,
> bool noperspective,
> unsigned mode,
> - unsigned data,
> + struct brw_reg data,
> unsigned msg_length,
> unsigned response_length);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
> index 4d39762..25524d4 100644
> --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
> +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
> @@ -3192,26 +3192,38 @@ brw_pixel_interpolator_query(struct brw_codegen *p,
> struct brw_reg mrf,
> bool noperspective,
> unsigned mode,
> - unsigned data,
> + struct brw_reg data,
> unsigned msg_length,
> unsigned response_length)
> {
> const struct brw_device_info *devinfo = p->devinfo;
> - struct brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
> + struct brw_inst *insn;
> + uint16_t exec_size;
>
> - brw_set_dest(p, insn, dest);
> - brw_set_src0(p, insn, mrf);
> - brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR,
> - msg_length, response_length,
> - false /* header is never present for PI */,
> - false);
> + if (data.file == BRW_IMMEDIATE_VALUE) {
> + insn = next_insn(p, BRW_OPCODE_SEND);
> + brw_set_dest(p, insn, dest);
> + brw_set_src0(p, insn, mrf);
> + brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR,
> + msg_length, response_length,
> + false /* header is never present for PI */,
> + false);
> + brw_inst_set_pi_message_data(devinfo, insn, data.dw1.ud);
> + } else {
> + insn = brw_send_indirect_message(p,
> + GEN7_SFID_PIXEL_INTERPOLATOR,
> + dest,
> + mrf,
> + vec1(data));
> + brw_inst_set_mlen(devinfo, insn, msg_length);
> + brw_inst_set_rlen(devinfo, insn, response_length);
> + }
>
> - brw_inst_set_pi_simd_mode(
> - devinfo, insn, brw_inst_exec_size(devinfo, insn) == BRW_EXECUTE_16);
> + exec_size = brw_inst_exec_size(devinfo, p->current);
> + brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
> brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
> brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
> brw_inst_set_pi_message_type(devinfo, insn, mode);
> - brw_inst_set_pi_message_data(devinfo, insn, data);
> }
>
> void
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> index c86ca04..88dbc62 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> @@ -1328,15 +1328,14 @@ fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
> struct brw_reg msg_data,
> unsigned msg_type)
> {
> - assert(msg_data.file == BRW_IMMEDIATE_VALUE &&
> - msg_data.type == BRW_REGISTER_TYPE_UD);
> + assert(msg_data.type == BRW_REGISTER_TYPE_UD);
>
> brw_pixel_interpolator_query(p,
> retype(dst, BRW_REGISTER_TYPE_UW),
> src,
> inst->pi_noperspective,
> msg_type,
> - msg_data.dw1.ud,
> + msg_data,
> inst->mlen,
> inst->regs_written);
> }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index ee964a0..0fdb9ae 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -1167,6 +1167,47 @@ fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
> }
> }
>
> +/* For most messages, we need one reg of ignored data; the hardware requires
> + * mlen==1 even when there is no payload. in the per-slot offset case, we'll
> + * replace this with the proper source data.
> + */
> +static void
> +setup_pixel_interpolater_instruction(fs_visitor *v,
> + nir_intrinsic_instr *instr,
> + fs_inst *inst,
> + int mlen = 1)
> +{
> + inst->mlen = mlen;
> + /* 2 floats per slot returned */
> + inst->regs_written = 2 * v->dispatch_width / 8;
> + inst->pi_noperspective = instr->variables[0]->var->data.interpolation ==
> + INTERP_QUALIFIER_NOPERSPECTIVE;
> +}
> +
> +static fs_reg
> +get_num_samples_reg(fs_visitor *v)
> +{
> + struct gl_program_parameter_list *params = v->prog->Parameters;
> + static gl_state_index tokens[STATE_LENGTH] = {
I suspect this isn't thread-safe.
> + STATE_NUM_SAMPLES
> + };
> + GLuint index = _mesa_add_state_reference(params, tokens);
> + unsigned i;
> +
> + /* Try to find an existing copy of the uniform */
> + for (i = 0; i < v->uniforms; i++) {
> + if (v->stage_prog_data->param[i] ==
> + &v->prog->Parameters->ParameterValues[index][0])
> + goto found;
> + }
> +
> + v->stage_prog_data->param[v->uniforms++] =
> + &v->prog->Parameters->ParameterValues[index][0];
> +
> +found:
> + return retype(fs_reg(UNIFORM, i), BRW_REGISTER_TYPE_UD);
> +}
> +
> void
> fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
> {
> @@ -1438,27 +1479,73 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>
> fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
>
> - /* For most messages, we need one reg of ignored data; the hardware
> - * requires mlen==1 even when there is no payload. in the per-slot
> - * offset case, we'll replace this with the proper source data.
> - */
> fs_reg src = vgrf(glsl_type::float_type);
> - int mlen = 1; /* one reg unless overriden */
> fs_inst *inst;
>
> switch (instr->intrinsic) {
> case nir_intrinsic_interp_var_at_centroid:
> inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_CENTROID,
> dst_xy, src, fs_reg(0u));
> + setup_pixel_interpolater_instruction(this, instr, inst);
> break;
>
> case nir_intrinsic_interp_var_at_sample: {
> - /* XXX: We should probably handle non-constant sample id's */
> nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
> - assert(const_sample);
> - unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
> - inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
> - fs_reg(msg_data));
> +
> + if (const_sample) {
> + unsigned msg_data = const_sample->i[0] << 4;
> +
> + inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
> + fs_reg(msg_data));
> +
> + setup_pixel_interpolater_instruction(this, instr, inst);
> + } else {
> + fs_reg sample_src = retype(get_nir_src(instr->src[0]),
> + BRW_REGISTER_TYPE_UD);
> + fs_reg sample_id_reg;
> +
> + if (nir_src_is_dynamically_uniform(instr->src[0])) {
> + sample_id_reg = vgrf(glsl_type::uint_type);
> + bld.SHL(sample_id_reg, sample_src, fs_reg(4u));
> + sample_id_reg = bld.emit_uniformize(sample_id_reg);
> + inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
> + sample_id_reg);
> + setup_pixel_interpolater_instruction(this, instr, inst);
> + } else {
> + /* Make a loop that sends a message to the pixel interpolator
> + * for each possible sample number so that each individual
> + * message will be dynamically uniform. The number of samples
> + * is determined by accessing the STATE_NUM_SAMPLES state var.
> + */
> + fs_reg i_reg = vgrf(glsl_type::uint_type);
> + fs_reg sample_id_reg = vgrf(glsl_type::uint_type);
> + fs_reg num_samples_reg = get_num_samples_reg(this);
> +
> + bld.MOV(i_reg, fs_reg(0u));
> +
> + bld.emit(BRW_OPCODE_DO);
> +
> + bld.CMP(bld.null_reg_ud(),
> + sample_src, i_reg,
> + BRW_CONDITIONAL_EQ);
I think you might be able to put all of this on one line.
> + bld.IF(BRW_PREDICATE_NORMAL);
> + bld.SHL(sample_id_reg, i_reg, fs_reg(4u));
> + sample_id_reg = bld.emit_uniformize(sample_id_reg);
> + inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
> + sample_id_reg);
> + setup_pixel_interpolater_instruction(this, instr, inst);
> + bld.emit(BRW_OPCODE_ENDIF);
> +
> + bld.ADD(i_reg, i_reg, fs_reg(1u));
> + bld.CMP(bld.null_reg_ud(),
> + i_reg, num_samples_reg,
> + BRW_CONDITIONAL_GE);
I think you might be able to put all of this on one line.
If you want, you could reverse the order of the loop (that is, iterate
from num_samples_reg down to 0) and then save the
opt_cmod_propagation() pass some work and just emit an ADD with a a
conditional_mod in order to get rid of the CMP instruction.
> + inst = bld.emit(BRW_OPCODE_BREAK);
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + bld.emit(BRW_OPCODE_WHILE);
You can actually get rid of the BREAK by predicating the WHILE (and
setting predicate_inverse on it). There's also a nice
set_predicate(predicate, inst) function you should use.
If you want to do that, I think it'll take a small amount of work in
the BRW_OPCODE_WHILE case in brw_cfg.cpp to understand that a
predicated WHILE has two successors.
> + }
> + }
> +
> break;
> }
>
> @@ -1471,6 +1558,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
>
> inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
> fs_reg(off_x | (off_y << 4)));
> + setup_pixel_interpolater_instruction(this, instr, inst);
> } else {
> src = vgrf(glsl_type::ivec2_type);
> fs_reg offset_src = retype(get_nir_src(instr->src[0]),
> @@ -1500,9 +1588,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
> bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
> }
>
> - mlen = 2 * dispatch_width / 8;
> inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
> fs_reg(0u));
> + setup_pixel_interpolater_instruction(this,
> + instr,
> + inst,
> + 2 * dispatch_width / 8);
> }
> break;
> }
> @@ -1511,12 +1602,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
> unreachable("Invalid intrinsic");
> }
>
> - inst->mlen = mlen;
> - /* 2 floats per slot returned */
> - inst->regs_written = 2 * dispatch_width / 8;
> - inst->pi_noperspective = instr->variables[0]->var->data.interpolation ==
> - INTERP_QUALIFIER_NOPERSPECTIVE;
> -
> for (unsigned j = 0; j < instr->num_components; j++) {
> fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
> src.type = dest.type;
> diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
> index 467a893..6430c5d 100644
> --- a/src/mesa/drivers/dri/i965/brw_program.c
> +++ b/src/mesa/drivers/dri/i965/brw_program.c
> @@ -146,6 +146,8 @@ brwProgramStringNotify(struct gl_context *ctx,
> prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
> }
>
> + brw_add_interpolate_at_sample_params(prog);
> +
> brw_fs_precompile(ctx, NULL, prog);
> break;
> }
> @@ -246,6 +248,58 @@ brw_add_texrect_params(struct gl_program *prog)
> }
> }
>
> +static bool
> +find_interpolate_at_sample_in_block(nir_block *block,
> + void *data)
> +{
> + nir_foreach_instr_safe(block, instr) {
I don't think you need _safe here. At least I can't see why it's necessary.
> + if (instr->type != nir_instr_type_intrinsic)
> + continue;
> +
> + nir_intrinsic_instr *intrinsic_instr = nir_instr_as_intrinsic(instr);
> +
> + if (intrinsic_instr->intrinsic != nir_intrinsic_interp_var_at_sample)
> + continue;
> +
> + /* If the sample number is known to be dynamically uniform then
> + * the generator won't need the num_samples state.
> + */
> + if (nir_src_is_dynamically_uniform(intrinsic_instr->src[0]))
> + continue;
> +
> + return false;
> + }
> +
> + return true;
> +}
> +
> +void
> +brw_add_interpolate_at_sample_params(struct gl_program *prog)
> +{
> + static gl_state_index tokens[STATE_LENGTH] = {
I suspect this isn't thread-safe.
> + STATE_NUM_SAMPLES
> + };
More information about the mesa-dev
mailing list