[Mesa-dev] [PATCH 2/2] i965: Implement ARB_indirect_parameters

Tue Sep 26 00:32:19 UTC 2017

On Tuesday, August 29, 2017 9:24:01 AM PDT Plamena Manolova wrote:
> We can implement ARB_indirect_parameters for i965 by
> taking advantage of the conditional rendering mechanism.
> This works by issuing maxdrawcount draw calls and using
> conditional rendering to predicate each of them with
> "drawcount > gl_DrawID"
> 
> Signed-off-by: Plamena Manolova <plamena.manolova at intel.com>
> ---
>  src/mesa/drivers/dri/i965/brw_context.h      |  3 +
>  src/mesa/drivers/dri/i965/brw_draw.c         | 97 ++++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_draw.h         | 11 ++++
>  src/mesa/drivers/dri/i965/intel_extensions.c |  2 +
>  4 files changed, 113 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 2274fe5..4639d5b 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -831,6 +831,9 @@ struct brw_context
>        int gl_drawid;
>        struct brw_bo *draw_id_bo;
>        uint32_t draw_id_offset;
> +
> +      struct brw_bo *draw_params_count_bo;
> +      uint32_t draw_params_count_offset;
>     } draw;
>  
>     struct {
> diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
> index 7597bae..473958c 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw.c
> @@ -820,6 +820,11 @@ brw_end_draw_prims(struct gl_context *ctx,
>     brw_program_cache_check_size(brw);
>     brw_postdraw_reconcile_align_wa_slices(brw);
>     brw_postdraw_set_buffers_need_resolve(brw);
> +
> +   if (brw->draw.draw_params_count_bo) {
> +      brw_bo_unreference(brw->draw.draw_params_count_bo);
> +      brw->draw.draw_params_count_bo = NULL;
> +   }
>  }
>  
>  void
> @@ -837,6 +842,8 @@ brw_draw_prims(struct gl_context *ctx,
>     struct brw_context *brw = brw_context(ctx);
>     const struct gl_vertex_array **arrays = ctx->Array._DrawArrays;
>     int i;
> +   int predicate_state = brw->predicate.state;
> +   int combine_op = MI_PREDICATE_COMBINEOP_SET;
>     struct brw_transform_feedback_object *xfb_obj =
>        (struct brw_transform_feedback_object *) gl_xfb_obj;
>  
> @@ -890,12 +897,101 @@ brw_draw_prims(struct gl_context *ctx,
>      * to it.
>      */
>  
> +   if (brw->draw.draw_params_count_bo &&
> +       predicate_state == BRW_PREDICATE_STATE_USE_BIT) {
> +      /* We do this to empty the MI_PREDICATE_DATA register */
> +      BEGIN_BATCH(4);
> +      OUT_BATCH(MI_PREDICATE_DATA);
> +      OUT_BATCH(0u);
> +      OUT_BATCH(MI_PREDICATE_DATA + 4);
> +      OUT_BATCH(0u);
> +      ADVANCE_BATCH();
> +
> +      combine_op = MI_PREDICATE_COMBINEOP_AND;
> +   }
> +
>     for (i = 0; i < nr_prims; i++) {
> +      if (brw->draw.draw_params_count_bo) {
> +         struct brw_bo *draw_id_bo = brw_bo_alloc(brw->bufmgr, "draw_id", 4, 4);
> +
> +         brw_bo_reference(draw_id_bo);

brw_bo_alloc starts you with a reference count of 1, so you don't want
to call brw_bo_reference here - that would increase the refcount to 2,
and your brw_bo_unreference() call below would only drop it to 1, which
would leak the BO.

> +         brw_bo_subdata(draw_id_bo, 0, 4, &prims[i].draw_id);

Buffer objects are always page-aligned sizes, so we're actually
allocating a full 4 kilobytes for this 32-bit integer.  That's kind
of a bummer.  To avoid this, we have a streaming upload buffer which we
use to append random data we want to use in the batch.  I'd use that
instead:

   struct brw_bo *draw_id_bo;
   uint32_t draw_id_offset;

   intel_upload_data(brw, &prims[i].draw_id, 4, 4,
                     &draw_id_bo, &draw_id_offset);

(I'm assuming prims[i].draw_id is 4-byte aligned...it should be unless
people have explicitly marked the _mesa_prim struct PACKED...)

> +
> +         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_FLUSH_ENABLE);
> +
> +         brw_load_register_mem(brw, MI_PREDICATE_SRC0,
> +                               brw->draw.draw_params_count_bo,
> +                               brw->draw.draw_params_count_offset);
> +         brw_load_register_mem(brw, MI_PREDICATE_SRC1, draw_id_bo, 0);

then make sure to use the offset:

         brw_load_register_mem(brw, MI_PREDICATE_SRC1,
                               draw_id_bo, draw_id_offset);

Note that intel_upload_* is only meant to be used for a single batch...
if you need it to persist across multiple batches, you probably want to
allocate your own BO like you originally did here.

> +
> +         BEGIN_BATCH(1);
> +         OUT_BATCH(GEN7_MI_PREDICATE |
> +                   MI_PREDICATE_LOADOP_LOADINV | combine_op |
> +                   MI_PREDICATE_COMPAREOP_DELTAS_EQUAL);
> +         ADVANCE_BATCH();
> +
> +         brw->predicate.state = BRW_PREDICATE_STATE_USE_BIT;
> +
> +         brw_bo_unreference(draw_id_bo);
> +      }
> +
>        brw_try_draw_prim(ctx, arrays, &prims[i], ib, index_bounds_valid,
>                          min_index, max_index, xfb_obj, stream, indirect);
>     }
> +
>     brw_end_draw_prims(ctx, arrays, prims, nr_prims, ib, index_bounds_valid,
>                        min_index, max_index, xfb_obj, stream, indirect);
> +
> +   brw->predicate.state = predicate_state;
> +}
> +
> +void
> +brw_draw_indirect_prims(struct gl_context *ctx,
> +                        GLuint mode,
> +                        struct gl_buffer_object *indirect_data,
> +                        GLsizeiptr indirect_offset,
> +                        unsigned draw_count,
> +                        unsigned stride,
> +                        struct gl_buffer_object *indirect_params,
> +                        GLsizeiptr indirect_params_offset,
> +                        const struct _mesa_index_buffer *ib)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct _mesa_prim *prim;
> +   GLsizei i;
> +
> +   prim = calloc(draw_count, sizeof(*prim));
> +   if (prim == NULL) {
> +      _mesa_error(ctx, GL_OUT_OF_MEMORY, "gl%sDraw%sIndirect%s",
> +                  (draw_count > 1) ? "Multi" : "",
> +                  ib ? "Elements" : "Arrays",
> +                  indirect_params ? "CountARB" : "");
> +      return;
> +   }
> +
> +   prim[0].begin = 1;
> +   prim[draw_count - 1].end = 1;
> +   for (i = 0; i < draw_count; ++i, indirect_offset += stride) {
> +      prim[i].mode = mode;
> +      prim[i].indexed = !!ib;
> +      prim[i].indirect_offset = indirect_offset;
> +      prim[i].is_indirect = 1;
> +      prim[i].draw_id = i;
> +   }
> +
> +   if (indirect_params) {
> +      brw->draw.draw_params_count_bo =
> +         intel_buffer_object(indirect_params)->buffer;
> +      brw_bo_reference(brw->draw.draw_params_count_bo);
> +      brw->draw.draw_params_count_offset = indirect_params_offset;
> +   }
> +
> +   brw_draw_prims(ctx, prim, draw_count,
> +                  ib, false, 0, ~0,
> +                  NULL, 0,
> +                  indirect_data);

One thing I considered here was that brw_draw_prims may flush the batch,
causing this draw to span across multiple batches.  I think that's okay
here, as you're using the high level function, which takes care of
requiring space, flushing, and reflagging state as necessary.  If you
were calling brw_try_draw_prim directly, I'd be more concerned about it.

So, I think there's nothing to do here.

> +
> +   free(prim);
>  }
>  
>  void
> @@ -907,6 +1003,7 @@ brw_draw_init(struct brw_context *brw)
>     /* Register our drawing function:
>      */
>     vbo->draw_prims = brw_draw_prims;
> +   vbo->draw_indirect_prims = brw_draw_indirect_prims;
>  
>     for (int i = 0; i < VERT_ATTRIB_MAX; i++)
>        brw->vb.inputs[i].buffer = -1;
> diff --git a/src/mesa/drivers/dri/i965/brw_draw.h b/src/mesa/drivers/dri/i965/brw_draw.h
> index 3b99915..07aab1c 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw.h
> +++ b/src/mesa/drivers/dri/i965/brw_draw.h
> @@ -58,6 +58,17 @@ void brw_draw_prims(struct gl_context *ctx,
>  void brw_draw_init( struct brw_context *brw );
>  void brw_draw_destroy( struct brw_context *brw );
>  
> +void
> +brw_draw_indirect_prims(struct gl_context *ctx,
> +                        GLuint mode,
> +                        struct gl_buffer_object *indirect_data,
> +                        GLsizeiptr indirect_offset,
> +                        unsigned draw_count,
> +                        unsigned stride,
> +                        struct gl_buffer_object *indirect_params,
> +                        GLsizeiptr indirect_params_offset,
> +                        const struct _mesa_index_buffer *ib);
> +
>  void brw_prepare_shader_draw_parameters(struct brw_context *);
>  
>  /* brw_primitive_restart.c */
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
> index deacd0d..c116d43 100644
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c
> @@ -230,6 +230,8 @@ intelInitExtensions(struct gl_context *ctx)
>  
>        if (can_do_pipelined_register_writes(brw->screen)) {
>           ctx->Extensions.ARB_draw_indirect = true;
> +         if (ctx->Extensions.ARB_conditional_render_inverted)
> +            ctx->Extensions.ARB_indirect_parameters = true;

This isn't quite right - we enable ARB_conditional_render_inverted
unconditionally on Gen6+, but it might be implemented via fallbacks.

What you really want is access to the hardware predicate support.
Add it to the block below:

         if (can_do_predicate_writes(brw->screen)) {
            brw->predicate.supported = true;
            ctx->Extensions.ARB_indirect_parameters = true;
         }

>           ctx->Extensions.ARB_transform_feedback2 = true;
>           ctx->Extensions.ARB_transform_feedback3 = true;
>           ctx->Extensions.ARB_transform_feedback_instanced = true;
> 

With that fixed, this looks good to me!  Great work :)

I'll give official Reviewed-bys when I see a v2.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 833 bytes
Desc: This is a digitally signed message part.
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170925/ebcdd2db/attachment.sig>