[Mesa-dev] [PATCH] i965: Defer the throttle until we submit new commands

Wed Mar 11 09:11:41 PDT 2015

I've sent my substantive feedback in other messages in the thread.
Below is just some nitpicky formatting kinds of stuff.

On 03/11/2015 05:36 AM, Chris Wilson wrote:
> Currently, we throttle before the user begins preparing commands for the
> next frame when we acquire the draw/read buffers. However, construction
> of the command buffer can itself take significant time relative to the
> frame time. If we move the throttle from the buffer acquire to the
> command submit phase we can allow the user to improve concurrency
> between the CPU and GPU (i.e. reduce the amount of time we waste inside
> the throttle).
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> Cc: Ben Widawsky <ben at bwidawsk.net>
> Cc: Kristian Høgsberg <krh at bitplanet.net>
> Cc: Chad Versace <chad.versace at linux.intel.com>
> ---
>  src/mesa/drivers/dri/i965/brw_context.c       | 34 ----------------------
>  src/mesa/drivers/dri/i965/intel_batchbuffer.c | 41 +++++++++++++++++++++++++++
>  2 files changed, 41 insertions(+), 34 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index 8257fb6..88685cd 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -1231,40 +1231,6 @@ intel_prepare_render(struct brw_context *brw)
>      */
>     if (brw_is_front_buffer_drawing(ctx->DrawBuffer))
>        brw->front_buffer_dirty = true;
> -
> -   /* Wait for the swapbuffers before the one we just emitted, so we
> -    * don't get too many swaps outstanding for apps that are GPU-heavy
> -    * but not CPU-heavy.
> -    *
> -    * We're using intelDRI2Flush (called from the loader before
> -    * swapbuffer) and glFlush (for front buffer rendering) as the
> -    * indicator that a frame is done and then throttle when we get
> -    * here as we prepare to render the next frame.  At this point for
> -    * round trips for swap/copy and getting new buffers are done and
> -    * we'll spend less time waiting on the GPU.
> -    *
> -    * Unfortunately, we don't have a handle to the batch containing
> -    * the swap, and getting our hands on that doesn't seem worth it,
> -    * so we just us the first batch we emitted after the last swap.
> -    */
> -   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
> -      if (brw->throttle_batch[1]) {
> -         if (!brw->disable_throttling)
> -            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
> -         drm_intel_bo_unreference(brw->throttle_batch[1]);
> -      }
> -      brw->throttle_batch[1] = brw->throttle_batch[0];
> -      brw->throttle_batch[0] = NULL;
> -      brw->need_swap_throttle = false;
> -      /* Throttling here is more precise than the throttle ioctl, so skip it */
> -      brw->need_flush_throttle = false;
> -   }
> -
> -   if (brw->need_flush_throttle) {
> -      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
> -      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
> -      brw->need_flush_throttle = false;
> -   }
>  }
>  
>  /**
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> index 87862cd..8d7741b 100644
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> @@ -33,6 +33,9 @@
>  #include "intel_fbo.h"
>  #include "brw_context.h"
>  
> +#include <xf86drm.h>
> +#include <i915_drm.h>
> +
>  static void
>  intel_batchbuffer_reset(struct brw_context *brw);
>  
> @@ -226,6 +229,43 @@ brw_finish_batch(struct brw_context *brw)
>     brw->cache.bo_used_by_gpu = true;
>  }
>  
> +static void throttle(struct brw_context *brw)

static void
throttle(struct brw_context *brw)

(For other readers: This is so you can use 'git grep ^throttle' to find
a function definition.)

> +{
> +   /* Wait for the swapbuffers before the one we just emitted, so we
> +    * don't get too many swaps outstanding for apps that are GPU-heavy
> +    * but not CPU-heavy.
> +    *
> +    * We're using intelDRI2Flush (called from the loader before
> +    * swapbuffer) and glFlush (for front buffer rendering) as the
> +    * indicator that a frame is done and then throttle when we get
> +    * here as we prepare to render the next frame.  At this point for
> +    * round trips for swap/copy and getting new buffers are done and
> +    * we'll spend less time waiting on the GPU.
> +    *
> +    * Unfortunately, we don't have a handle to the batch containing
> +    * the swap, and getting our hands on that doesn't seem worth it,
> +    * so we just us the first batch we emitted after the last swap.
                    ^^ use

> +    */
> +   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
> +      if (brw->throttle_batch[1]) {
> +         if (!brw->disable_throttling)
> +            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
> +         drm_intel_bo_unreference(brw->throttle_batch[1]);
> +      }
> +      brw->throttle_batch[1] = brw->throttle_batch[0];
> +      brw->throttle_batch[0] = NULL;
> +      brw->need_swap_throttle = false;
> +      /* Throttling here is more precise than the throttle ioctl, so skip it */
> +      brw->need_flush_throttle = false;
> +   }
> +
> +   if (brw->need_flush_throttle) {
> +      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
> +      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
> +      brw->need_flush_throttle = false;
> +   }
> +}
> +
>  /* TODO: Push this whole function into bufmgr.
>   */
>  static int
> @@ -260,6 +300,7 @@ do_flush_locked(struct brw_context *brw)
>        if (ret == 0) {
>           if (unlikely(INTEL_DEBUG & DEBUG_AUB))
>              brw_annotate_aub(brw);

Usually there would be a blank line here...

> +	 throttle(brw);

...and here.

>  	 if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
>  	    ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
>  					flags);
>