[Mesa-dev] [PATCH 8/9] i965: Use 'available' fence for polling query results

Wed Jun 14 23:10:21 UTC 2017

On Friday, June 9, 2017 6:01:39 AM PDT Chris Wilson wrote:
> If we always write the 'available' flag after writing the final result
> of the query, we can probe that predicate to quickly query whether the
> result is ready from userspace. The primary advantage of checking the
> predicate is that it allows for more fine-grained queries, we do not
> have to wait for the batch to finish before the query is marked as
> ready.
> 
> We still do check the status of the batch after probing the query so
> that if the worst happens and the batch did hang without completing the
> query, we do not spin forever (although it is not as nice as completely
> eliminating the ioctl, the busy-ioctl is lightweight!).
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> Cc: Matt Turner <mattst88 at gmail.com>
> ---
>  src/mesa/drivers/dri/i965/brw_context.h   |  4 +--
>  src/mesa/drivers/dri/i965/gen6_queryobj.c | 54 +++++++++++++------------------
>  2 files changed, 25 insertions(+), 33 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 117b1ecdca..44e0d31c6d 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -428,8 +428,8 @@ struct brw_query_object {
>     bool flushed;
>  };
>  
> -#define GEN6_QUERY_PREDICATE (2)
> -#define GEN6_QUERY_RESULTS (0)
> +#define GEN6_QUERY_PREDICATE (0)
> +#define GEN6_QUERY_RESULTS (1)
>  
>  static inline unsigned gen6_query_predicate_offset(const struct brw_query_object *query)
>  {
> diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> index 5c95a4bae9..ae7fd06c1c 100644
> --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
> +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> @@ -40,8 +40,7 @@
>  #include "intel_buffer_objects.h"
>  
>  static inline void
> -set_query_availability(struct brw_context *brw, struct brw_query_object *query,
> -                       bool available)
> +set_query_available(struct brw_context *brw, struct brw_query_object *query)
>  {
>     /* For platforms that support ARB_query_buffer_object, we write the
>      * query availability for "pipelined" queries.
> @@ -58,22 +57,12 @@ set_query_availability(struct brw_context *brw, struct brw_query_object *query,
>      * PIPE_CONTROL with an immediate write will synchronize with
>      * those earlier writes, so we write 1 when the value has landed.
>      */
> -   if (brw->ctx.Extensions.ARB_query_buffer_object &&
> -       brw_is_query_pipelined(query)) {
> -      unsigned flags = PIPE_CONTROL_WRITE_IMMEDIATE;
>  
> -      if (available) {
> -         /* Order available *after* the query results. */
> -         flags |= PIPE_CONTROL_FLUSH_ENABLE;
> -      } else {
> -         /* Make it unavailable *before* any pipelined reads. */
> -         flags |= PIPE_CONTROL_CS_STALL;
> -      }
> -
> -      brw_emit_pipe_control_write(brw, flags,
> -                                  query->bo, gen6_query_predicate_offset(query),
> -                                  available, 0);
> -   }
> +   brw_emit_pipe_control_write(brw,
> +                               PIPE_CONTROL_WRITE_IMMEDIATE |
> +                               PIPE_CONTROL_FLUSH_ENABLE,
> +                               query->bo, gen6_query_predicate_offset(query),
> +                               true, 0);
>  }
>  
>  static void
> @@ -139,12 +128,12 @@ write_xfb_overflow_streams(struct gl_context *ctx,
>  }
>  
>  static bool
> -check_xfb_overflow_streams(uint64_t *results, int count)
> +check_xfb_overflow_streams(const uint64_t *results, int count)
>  {
>     bool overflow = false;
>  
>     for (int i = 0; i < count; i++) {
> -      uint64_t *result_i = &results[4 * i];
> +      const uint64_t *result_i = &results[4 * i];
>  
>        if ((result_i[3] - result_i[2]) != (result_i[1] - result_i[0])) {
>           overflow = true;
> @@ -214,16 +203,14 @@ emit_pipeline_stat(struct brw_context *brw, struct brw_bo *bo,
>   */
>  static void
>  gen6_queryobj_get_results(struct gl_context *ctx,
> -                          struct brw_query_object *query)
> +                          struct brw_query_object *query,
> +                          const uint64_t *results)
>  {
>     struct brw_context *brw = brw_context(ctx);
>  
>     if (query->bo == NULL)
>        return;
>  
> -   brw_bo_map_sync(brw, query->bo, MAP_READ | MAP_COHERENT);
> -   uint64_t *results = query->results;
> -
>     switch (query->Base.Target) {
>     case GL_TIME_ELAPSED:
>        /* The query BO contains the starting and ending timestamps.
> @@ -319,10 +306,10 @@ static int gen6_alloc_query(struct brw_context *brw,
>     brw_bo_set_cache_coherent(query->bo);
>  
>     query->results = brw_bo_map(brw, query->bo,
> -                               MAP_READ | MAP_COHERENT | MAP_ASYNC);
> +                               MAP_READ | MAP_WRITE | MAP_COHERENT | MAP_ASYNC);
>  
>     /* For ARB_query_buffer_object: The result is not available */
> -   set_query_availability(brw, query, false);
> +   query->results[GEN6_QUERY_PREDICATE] = false;

Writing this value via the coherent-mapping rather than via PIPE_CONTROL
(and the corresponding set_query_available cleanups) should probably be
a separate patch.  They look good to me.

>  
>     return 0;
>  }
> @@ -477,7 +464,7 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
>     query->flushed = false;
>  
>     /* For ARB_query_buffer_object: The result is now available */
> -   set_query_availability(brw, query, true);
> +   set_query_available(brw, query);
>  }
>  
>  /**
> @@ -513,7 +500,11 @@ static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
>      */
>     flush_batch_if_needed(brw, query);
>  
> -   gen6_queryobj_get_results(ctx, query);
> +   uint64_t *results = query->results;
> +   if (!results[GEN6_QUERY_PREDICATE]) /* not yet available, must wait */
> +       brw_bo_map_sync(brw, query->bo, MAP_READ | MAP_COHERENT);
> +
> +   gen6_queryobj_get_results(ctx, query, results + GEN6_QUERY_RESULTS);
>  }
>  
>  /**
> @@ -542,9 +533,10 @@ static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
>      */
>     flush_batch_if_needed(brw, query);
>  
> -   if (!brw_bo_busy(query->bo)) {
> -      gen6_queryobj_get_results(ctx, query);
> -   }
> +   uint64_t *results = query->results;
> +   if (results[GEN6_QUERY_PREDICATE] || /* already available, can read async */
> +       !brw_bo_write_busy(query->bo)) /* GPU hang, results incomplete? */
> +      gen6_queryobj_get_results(ctx, query, results + GEN6_QUERY_RESULTS);
>  }
>  
>  static void
> @@ -555,7 +547,7 @@ gen6_query_counter(struct gl_context *ctx, struct gl_query_object *q)
>     const int idx = gen6_alloc_query(brw, query) + GEN6_QUERY_RESULTS;
>  
>     brw_write_timestamp(brw, query->bo, idx);
> -   set_query_availability(brw, query, true);
> +   set_query_available(brw, query);
>  
>     query->flushed = false;
>  }
> 

I'm glad to see this happen, finally - Eric and I had patches to do this
for years, but never had the performance data to support it.  It's
definitely the right thing to do.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 833 bytes
Desc: This is a digitally signed message part.
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170614/8d724c7c/attachment.sig>