[Mesa-dev] [PATCH 4/4] i965: Allow indirect GS input indexing in the scalar backend.

Wed Nov 18 13:51:02 PST 2015

On Sat, Nov 7, 2015 at 9:04 PM, Kenneth Graunke <kenneth at whitecape.org> wrote:
> This allows arbitrary non-constant indices on GS input arrays,
> both for the vertex index, and any array offsets beyond that.
>
> All indirects are handled via the pull model.  We could potentially
> handle indirect addressing of pushed data as well, but it would add
> additional code complexity, and we usually have to pull inputs anyway
> due to the sheer volume of input data.  Plus, marking pushed inputs
> as live due to indirect addressing could exacerbate register pressure
> problems pretty badly.  We'd need to be careful.

I like how this gets rid of the loop to rewrite
SHADER_OPCODE_URB_READ_SIMD8 insts in assign_gs_urb_setup().

This looks good - assuming rebase on master and update to use
SHADER_OPCODE_MOV_INDIRECT,

Reviewed-by: Kristian Høgsberg <krh at bitplanet.net>

> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp     |  17 ----
>  src/mesa/drivers/dri/i965/brw_fs.h       |   3 +-
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 129 ++++++++++++++++++++++++-------
>  src/mesa/drivers/dri/i965/brw_shader.cpp |   3 +
>  4 files changed, 106 insertions(+), 46 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index ee10c9d..9d00379 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1636,24 +1636,7 @@ fs_visitor::assign_gs_urb_setup()
>     first_non_payload_grf +=
>        8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
>
> -   const unsigned first_icp_handle = payload.num_regs -
> -      (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0);
> -
>     foreach_block_and_inst(block, fs_inst, inst, cfg) {
> -      /* Lower URB_READ_SIMD8 opcodes into real messages. */
> -      if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
> -         assert(inst->src[0].file == IMM);
> -         inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
> -                                            inst->src[0].fixed_hw_reg.dw1.ud,
> -                                            0), BRW_REGISTER_TYPE_UD);
> -         /* for now, assume constant - we can do per-slot offsets later */
> -         assert(inst->src[1].file == IMM);
> -         inst->offset = inst->src[1].fixed_hw_reg.dw1.ud;
> -         inst->src[1] = fs_reg();
> -         inst->mlen = 1;
> -         inst->base_mrf = -1;
> -      }
> -
>        /* Rewrite all ATTR file references to HW_REGs. */
>        convert_attr_sources_to_hw_regs(inst);
>     }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index fb70f0c..67f9f59 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -302,7 +302,8 @@ public:
>                         unsigned stream_id);
>     void emit_gs_thread_end();
>     void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
> -                           unsigned offset, unsigned num_components);
> +                           const fs_reg &indirect_offset, unsigned imm_offset,
> +                           unsigned num_components);
>     void emit_cs_terminate();
>     fs_reg *emit_cs_local_invocation_id_setup();
>     fs_reg *emit_cs_work_group_id_setup();
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 7f033f2..3bc02c5 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -1543,42 +1543,112 @@ fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
>  void
>  fs_visitor::emit_gs_input_load(const fs_reg &dst,
>                                 const nir_src &vertex_src,
> -                               unsigned input_offset,
> +                               const fs_reg &indirect_offset,
> +                               unsigned imm_offset,
>                                 unsigned num_components)
>  {
> -   const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data;
> -   const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0];
> +   struct brw_gs_prog_data *gs_prog_data = (struct brw_gs_prog_data *) prog_data;
>
> -   const unsigned array_stride = vue_prog_data->urb_read_length * 8;
> +   /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
> +    * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
> +    * gl_PointSize is available as a GS input, however, so it must be that.
> +    */
> +   const bool is_point_size =
> +      indirect_offset.file == BAD_FILE && imm_offset == 0;
> +
> +   nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
> +   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
> +
> +   if (indirect_offset.file == BAD_FILE && vertex_const != NULL &&
> +       4 * imm_offset < push_reg_count) {
> +      imm_offset = 4 * imm_offset + vertex_const->u[0] * push_reg_count;
> +      /* This input was pushed into registers. */
> +      if (is_point_size) {
> +         /* gl_PointSize comes in .w */
> +         bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
> +      } else {
> +         for (unsigned i = 0; i < num_components; i++) {
> +            bld.MOV(offset(dst, bld, i),
> +                    fs_reg(ATTR, imm_offset + i, dst.type));
> +         }
> +      }
> +   } else {
> +      /* Resort to the pull model.  Ensure the VUE handles are provided. */
> +      gs_prog_data->base.include_vue_handles = true;
>
> -   const bool pushed = 4 * input_offset < array_stride;
> +      unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
> +      fs_reg icp_handle;
>
> -   if (input_offset == 0) {
> -      /* This is the VUE header, containing VARYING_SLOT_LAYER [.y],
> -       * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].
> -       * Only gl_PointSize is available as a GS input, so they must
> -       * be asking for that input.
> -       */
> -      if (pushed) {
> -         bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type));
> +      if (vertex_const) {
> +         /* The vertex index is constant; just select the proper URB handle. */
> +         icp_handle =
> +            retype(brw_vec8_grf(first_icp_handle + vertex_const->i[0], 0),
> +                   BRW_REGISTER_TYPE_UD);
>        } else {
> -         fs_reg tmp = bld.vgrf(dst.type, 4);
> -         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
> -                                  fs_reg(vertex), fs_reg(0));
> -         inst->regs_written = 4;
> -         bld.MOV(dst, offset(tmp, bld, 3));
> +         /* The vertex index is non-constant.  We need to use indirect
> +          * addressing to fetch the proper URB handle.
> +          *
> +          * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
> +          * indicating that channel <n> should read the handle from
> +          * DWord <n>.  We convert that to bytes by multiplying by 4.
> +          *
> +          * Next, we convert the vertex index to bytes by multiplying
> +          * by 32 (shifting by 5), and add the two together.  This is
> +          * the final indirect byte offset.
> +          */
> +         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
> +         fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +
> +         /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
> +         bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
> +         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
> +         bld.SHL(channel_offsets, sequence, fs_reg(2u));
> +         /* Convert vertex_index to bytes (multiply by 32) */
> +         bld.SHL(vertex_offset_bytes,
> +                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
> +                 brw_imm_ud(5u));
> +         bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
> +
> +         /* Use first_icp_handle as the base offset.  There is one register
> +          * of URB handles per vertex, so inform the register allocator that
> +          * we might read up to nir->info.gs.vertices_in registers.
> +          */
> +         bld.emit(SHADER_OPCODE_INDIRECT_THREAD_PAYLOAD_MOV, icp_handle,
> +                  fs_reg(first_icp_handle * REG_SIZE), icp_offset_bytes,
> +                  fs_reg(nir->info.gs.vertices_in));
>        }
> -   } else {
> -      if (pushed) {
> -         int index = vertex * array_stride + 4 * input_offset;
> -         for (unsigned i = 0; i < num_components; i++) {
> -            bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type));
> -         }
> +
> +      fs_inst *inst;
> +      if (indirect_offset.file == BAD_FILE) {
> +         /* Constant indexing - use global offset. */
> +         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
> +         inst->offset = imm_offset;
> +         inst->base_mrf = -1;
> +         inst->mlen = 1;
> +         inst->regs_written = num_components;
>        } else {
> -         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
> -                                  fs_reg(vertex), fs_reg(input_offset));
> +         /* Indirect indexing - use per-slot offsets as well. */
> +         const fs_reg srcs[] = { icp_handle, indirect_offset };
> +         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
> +         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
> +
> +         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
> +         inst->offset = imm_offset;
> +         inst->base_mrf = -1;
> +         inst->mlen = 2;
>           inst->regs_written = num_components;
>        }
> +
> +      if (is_point_size) {
> +         /* Read the whole VUE header (because of alignment) and read .w. */
> +         fs_reg tmp = bld.vgrf(dst.type, 4);
> +         inst->dst = tmp;
> +         inst->regs_written = 4;
> +         bld.MOV(dst, offset(tmp, bld, 3));
> +      }
>     }
>  }
>
> @@ -1618,6 +1688,7 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
>                                    nir_intrinsic_instr *instr)
>  {
>     assert(stage == MESA_SHADER_GEOMETRY);
> +   fs_reg indirect_offset;
>
>     fs_reg dest;
>     if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> @@ -1636,9 +1707,11 @@ fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
>        unreachable("load_input intrinsics are invalid for the GS stage");
>
>     case nir_intrinsic_load_per_vertex_input_indirect:
> -      assert(!"Not allowed");
> +      indirect_offset = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_D);
> +      /* fallthrough */
>     case nir_intrinsic_load_per_vertex_input:
> -      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
> +      emit_gs_input_load(dest, instr->src[0],
> +                         indirect_offset, instr->const_index[0],
>                           instr->num_components);
>        break;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index fef5246..9ef59ec 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -150,6 +150,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
>        compiler->glsl_compiler_options[i].NirOptions = nir_options;
>     }
>
> +   if (compiler->scalar_gs)
> +      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
> +
>     return compiler;
>  }
>
> --
> 2.6.2
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev