[Mesa-dev] [PATCH] i965/gen7: Use predicated rendering for indirect compute

Ben Widawsky ben at bwidawsk.net
Tue Feb 16 20:03:10 UTC 2016


On Tue, Feb 16, 2016 at 10:09:50AM -0800, Jordan Justen wrote:
> On gen7 (Ivy Bridge, Haswell), we will get a GPU hang if an indirect
> dispatch is used, but one of the dimensions is 0.
> 
> Therefore we use predicated rendering on the GPGPU_WALKER command to
> handle this case.
> 
> Fixes piglit test: spec/arb_compute_shader/zero-dispatch-size
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94100
> Signed-off-by: Jordan Justen <jordan.l.justen at intel.com>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> Cc: Ben Widawsky <ben at bwidawsk.net>
> Cc: Ilia Mirkin <imirkin at alum.mit.edu>
> ---
>  src/mesa/drivers/dri/i965/brw_compute.c | 104 +++++++++++++++++++++++++++-----
>  src/mesa/drivers/dri/i965/brw_defines.h |   1 +
>  2 files changed, 91 insertions(+), 14 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
> index d9f181a..bbb8ce3 100644
> --- a/src/mesa/drivers/dri/i965/brw_compute.c
> +++ b/src/mesa/drivers/dri/i965/brw_compute.c
> @@ -35,6 +35,92 @@
>  
>  
>  static void
> +brw_prepare_indirect_gpgpu_walker(struct brw_context *brw)
> +{

Just FYI:
There is a blurb in the predicate text:
To ensure the memory sources of the MI_LOAD_REGISTER_MEM commands are coherent
with previous 3D_PIPECONTROL store-DWord operations, software can use the new
Pipe Control Flush Enable bit in the PIPE_CONTROL command.

I suppose it's never the case that we'll be writing these with PIPE_CONTROL, so
it's safe to ignore this.

> +   GLintptr indirect_offset = brw->compute.num_work_groups_offset;
> +   drm_intel_bo *bo = brw->compute.num_work_groups_bo;
> +
> +   brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo,
> +                         I915_GEM_DOMAIN_VERTEX, 0,
> +                         indirect_offset + 0);
> +   brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo,
> +                         I915_GEM_DOMAIN_VERTEX, 0,
> +                         indirect_offset + 4);
> +   brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo,
> +                         I915_GEM_DOMAIN_VERTEX, 0,
> +                         indirect_offset + 8);
> +
> +   if (brw->gen > 7)
> +      return;
> +
> +   /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1
> +    */
> +   BEGIN_BATCH(7);
> +   OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2));
> +   OUT_BATCH(MI_PREDICATE_SRC0 + 4);
> +   OUT_BATCH(0u);
> +   OUT_BATCH(MI_PREDICATE_SRC1 + 0);
> +   OUT_BATCH(0u);
> +   OUT_BATCH(MI_PREDICATE_SRC1 + 4);
> +   OUT_BATCH(0u);
> +   ADVANCE_BATCH();
> +
> +   /* Load compute_dispatch_indirect_x_size into SRC0
> +    */
> +   brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo,
> +                         I915_GEM_DOMAIN_INSTRUCTION, 0,
> +                         indirect_offset + 0);
> +
> +   /* predicate = (compute_dispatch_indirect_x_size == 0);
> +    */
> +   BEGIN_BATCH(1);
> +   OUT_BATCH(GEN7_MI_PREDICATE |
> +             MI_PREDICATE_LOADOP_LOAD |
> +             MI_PREDICATE_COMBINEOP_SET |
> +             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
> +   ADVANCE_BATCH();
> +
> +   /* Load compute_dispatch_indirect_y_size into SRC0
> +    */
> +   brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo,
> +                         I915_GEM_DOMAIN_INSTRUCTION, 0,
> +                         indirect_offset + 4);
> +
> +   /* predicate |= (compute_dispatch_indirect_y_size == 0);
> +    */
> +   BEGIN_BATCH(1);
> +   OUT_BATCH(GEN7_MI_PREDICATE |
> +             MI_PREDICATE_LOADOP_LOAD |
> +             MI_PREDICATE_COMBINEOP_OR |
> +             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
> +   ADVANCE_BATCH();
> +
> +   /* Load compute_dispatch_indirect_z_size into SRC0
> +    */
> +   brw_load_register_mem(brw, MI_PREDICATE_SRC0, bo,
> +                         I915_GEM_DOMAIN_INSTRUCTION, 0,
> +                         indirect_offset + 8);
> +
> +   /* predicate |= (compute_dispatch_indirect_z_size == 0);
> +    */
> +   BEGIN_BATCH(1);
> +   OUT_BATCH(GEN7_MI_PREDICATE |
> +             MI_PREDICATE_LOADOP_LOAD |
> +             MI_PREDICATE_COMBINEOP_OR |
> +             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
> +   ADVANCE_BATCH();
> +
> +   /* predicate = !predicate;
> +    */
> +   BEGIN_BATCH(1);
> +   OUT_BATCH(GEN7_MI_PREDICATE |
> +             MI_PREDICATE_LOADOP_LOADINV |
> +             MI_PREDICATE_COMBINEOP_OR |
> +             MI_PREDICATE_COMPAREOP_FALSE);
> +   ADVANCE_BATCH();
> +}

I think all of your comments would fit on one line...

Just summing up our conversation, I believe you could have slightly simpler code
using DELTAS_EQUAL, but it's not entirely clear.

> +
> +static void
>  brw_emit_gpgpu_walker(struct brw_context *brw)
>  {
>     const struct brw_cs_prog_data *prog_data = brw->cs.prog_data;
> @@ -45,20 +131,10 @@ brw_emit_gpgpu_walker(struct brw_context *brw)
>     if (brw->compute.num_work_groups_bo == NULL) {
>        indirect_flag = 0;
>     } else {
> -      GLintptr indirect_offset = brw->compute.num_work_groups_offset;
> -      drm_intel_bo *bo = brw->compute.num_work_groups_bo;
> -
> -      indirect_flag = GEN7_GPGPU_INDIRECT_PARAMETER_ENABLE;
> -
> -      brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMX, bo,
> -                            I915_GEM_DOMAIN_VERTEX, 0,
> -                            indirect_offset + 0);
> -      brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMY, bo,
> -                            I915_GEM_DOMAIN_VERTEX, 0,
> -                            indirect_offset + 4);
> -      brw_load_register_mem(brw, GEN7_GPGPU_DISPATCHDIMZ, bo,
> -                            I915_GEM_DOMAIN_VERTEX, 0,
> -                            indirect_offset + 8);
> +      indirect_flag =
> +         GEN7_GPGPU_INDIRECT_PARAMETER_ENABLE |
> +         ((brw->gen == 7) ? GEN7_GPGPU_PREDICATE_ENABLE : 0);
> +      brw_prepare_indirect_gpgpu_walker(brw);
>     }
>  
>     const unsigned simd_size = prog_data->simd_size;
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index b1fa559..60b696c 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -2938,6 +2938,7 @@ enum brw_wm_barycentric_interp_mode {
>  #define GPGPU_WALKER                            0x7105
>  /* GEN7 DW0 */
>  # define GEN7_GPGPU_INDIRECT_PARAMETER_ENABLE   (1 << 10)
> +# define GEN7_GPGPU_PREDICATE_ENABLE            (1 << 8)
>  /* GEN8+ DW2 */
>  # define GPGPU_WALKER_INDIRECT_LENGTH_SHIFT     0
>  # define GPGPU_WALKER_INDIRECT_LENGTH_MASK      INTEL_MASK(15, 0)

Just making sure, but the API expects nothing happens if ANY of the workgroup
sizes are 0? Because that's what's going to happen now, isn't it?


More information about the mesa-dev mailing list