[Mesa-dev] [PATCH 10/10] i965: Implement transform feedback query support in hardware on Gen6+.

Mon May 20 10:57:59 PDT 2013

On 17 May 2013 10:18, Kenneth Graunke <kenneth at whitecape.org> wrote:

> Now that we have hardware contexts and can use MI_STORE_REGISTER_MEM,
> we can use the GPU's pipeline statistics counters rather than going out
> of our way to count primitives in software.
>
> Aside from being simpler, this also paves the way for Geometry Shaders,
> which can output an arbitrary number of primitives on the GPU.
>

While you're giving rationales, another justification is that this will let
us use hardware primitive restart support when transform feedback and/or
primitive counting is in use.

>
> The GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN query is easy: it
> corresponds to the SO_NUM_PRIMS_WRITTEN/SO_NUM_PRIMS_WRITTEN0_IVB
> counters.
>
> The GL_PRIMITIVES_GENERATED query is trickier.  Gen provides several
> statistics registers which /almost/ match the semantics required:
> - IA_PRIMITIVES_COUNT
>   The number of primitives fetched by the VF or IA (input assembler).
>   This undercounts when GS is enabled, as it can output many primitives.
> - GS_PRIMITIVES_COUNT
>   The number of primitives output by the GS.  Unfortunately, this
>   doesn't increment unless the GS unit is actually enabled, and it
>   usually isn't.
> - SO_PRIM_STORAGE_NEEDED*_IVB
>   The amount of space needed to write primitives output by transform
>   feedback.  These naturally only work when transform feedback is on.
>   We'd also have to add the counters for all four streams.
> - CL_INVOCATION_COUNT
>   The number of primitives processed by the clipper.  This doesn't work
>   if the GS or SOL throw away primitives for rasterizer discard.
>   However, it does increment even if the clipper is in REJECT_ALL mode.
>
> Dynamically switching between counters would be painfully complicated,
> especially since GS, rasterizer discard, and transform feedback can all
> be switched on and off repeatedly during a single query.
>
> The most usable counter is CL_INVOCATION_COUNT.  The previous two
> patches reworked rasterizer discard support so that all primitives hit
> the clipper, making this work.
>
> Cc: Eric Anholt <eric at anholt.net>
> Cc: Paul Berry <stereotype441 at gmail.com>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/gen6_queryobj.c | 105 +++++++++++++++++++-----------
>  1 file changed, 66 insertions(+), 39 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> index 28af8d7..a032227 100644
> --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
> +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> @@ -94,6 +94,57 @@ write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
>     ADVANCE_BATCH();
>  }
>
> +/*
> + * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
> + *
> + * Only TIMESTAMP and PS_DEPTH_COUNT have special PIPE_CONTROL support; other
> + * counters have to be read via the generic MI_STORE_REGISTER_MEM.  This
> + * function also performs a pipeline flush for proper synchronization.
> + */
> +static void
> +write_reg(struct intel_context *intel,
> +          drm_intel_bo *query_bo, uint32_t reg, int idx)
> +{
> +   assert(intel->gen >= 6);
> +
> +   intel_batchbuffer_emit_mi_flush(intel);
> +
> +   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
> +    * read a full 64-bit register, we need to do two of them.
> +    */
> +   BEGIN_BATCH(3);
> +   OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
> +   OUT_BATCH(reg);
> +   OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
> +             idx * sizeof(uint64_t));
> +   ADVANCE_BATCH();
> +
> +   BEGIN_BATCH(3);
> +   OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
> +   OUT_BATCH(reg + sizeof(uint32_t));
> +   OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
> +             sizeof(uint32_t) + idx * sizeof(uint64_t));
> +   ADVANCE_BATCH();
> +}
> +
> +static void
> +write_primitives_generated(struct intel_context *intel,
> +                           drm_intel_bo *query_bo, int idx)
> +{
> +   write_reg(intel, query_bo, CL_INVOCATION_COUNT, idx);
> +}
> +
> +static void
> +write_xfb_primitives_written(struct intel_context *intel,
> +                             drm_intel_bo *query_bo, int idx)
> +{
> +   if (intel->gen >= 7) {
> +      write_reg(intel, query_bo, SO_NUM_PRIMS_WRITTEN0_IVB, idx);
> +   } else {
> +      write_reg(intel, query_bo, SO_NUM_PRIMS_WRITTEN, idx);
> +   }
> +}
> +
>  /**
>   * Wait on the query object's BO and calculate the final result.
>   */
> @@ -152,21 +203,20 @@ gen6_queryobj_get_results(struct gl_context *ctx,
>        query->Base.Result &= (1ull << 36) - 1;
>        break;
>
> -   case GL_SAMPLES_PASSED_ARB:
> -      query->Base.Result += results[1] - results[0];
> -      break;
> -
>     case GL_ANY_SAMPLES_PASSED:
>     case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
> -      query->Base.Result = results[0] != results[1];
> +      if (results[0] != results[1])
> +         query->Base.Result = true;
>

This looks like it got squashed into the wrong patch of the series.

With that fixed, this patch is:

Reviewed-by: Paul Berry <stereotype441 at gmail.com>

>        break;
>
> +   case GL_SAMPLES_PASSED_ARB:
>     case GL_PRIMITIVES_GENERATED:
>     case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
> -      /* We don't actually query the hardware for this value, so query->bo
> -       * should always be NULL and execution should never reach here.
> +      /* We need to use += rather than = here since some BLT-based operations
> +       * may have added additional samples to our occlusion query value.
> +       * It shouldn't matter for geometry queries, but is harmless.
>         */
> -      assert(!"Unreachable");
> +      query->Base.Result += results[1] - results[0];
>        break;
>
>     default:
> @@ -191,10 +241,13 @@ gen6_queryobj_get_results(struct gl_context *ctx,
>  static void
>  gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
>  {
> -   struct brw_context *brw = brw_context(ctx);
>     struct intel_context *intel = intel_context(ctx);
>     struct brw_query_object *query = (struct brw_query_object *)q;
>
> +   /* Since we're starting a new query, we need to throw away old results. */
> +   drm_intel_bo_unreference(query->bo);
> +   query->bo = drm_intel_bo_alloc(intel->bufmgr, "query results", 4096, 4096);
> +
>     switch (query->Base.Target) {
>     case GL_TIME_ELAPSED:
>        /* For timestamp queries, we record the starting time right away so that
> @@ -216,36 +269,21 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
>         * obtain the time elapsed.  Notably, this includes time elapsed while
>         * the system was doing other work, such as running other applications.
>         */
> -      drm_intel_bo_unreference(query->bo);
> -      query->bo = drm_intel_bo_alloc(intel->bufmgr, "timer query", 4096, 4096);
>        write_timestamp(intel, query->bo, 0);
>        break;
>
>     case GL_ANY_SAMPLES_PASSED:
>     case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
>     case GL_SAMPLES_PASSED_ARB:
> -      /* Since we're starting a new query, we need to be sure to throw away
> -       * any previous occlusion query results.
> -       */
> -      drm_intel_bo_unreference(query->bo);
> -      query->bo = drm_intel_bo_alloc(intel->bufmgr, "occl. query", 4096, 4096);
>        write_depth_count(intel, query->bo, 0);
>        break;
>
>     case GL_PRIMITIVES_GENERATED:
> -      /* We don't actually query the hardware for this value; we keep track of
> -       * it a software counter.  So just reset the counter.
> -       */
> -      brw->sol.primitives_generated = 0;
> -      brw->sol.counting_primitives_generated = true;
> +      write_primitives_generated(intel, query->bo, 0);
>        break;
>
>     case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
> -      /* We don't actually query the hardware for this value; we keep track of
> -       * it a software counter.  So just reset the counter.
> -       */
> -      brw->sol.primitives_written = 0;
> -      brw->sol.counting_primitives_written = true;
> +      write_xfb_primitives_written(intel, query->bo, 0);
>        break;
>
>     default:
> @@ -265,7 +303,6 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
>  static void
>  gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
>  {
> -   struct brw_context *brw = brw_context(ctx);
>     struct intel_context *intel = intel_context(ctx);
>     struct brw_query_object *query = (struct brw_query_object *)q;
>
> @@ -281,21 +318,11 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
>        break;
>
>     case GL_PRIMITIVES_GENERATED:
> -      /* We don't actually query the hardware for this value; we keep track of
> -       * it in a software counter.  So just read the counter and store it in
> -       * the query object.
> -       */
> -      query->Base.Result = brw->sol.primitives_generated;
> -      brw->sol.counting_primitives_generated = false;
> +      write_primitives_generated(intel, query->bo, 1);
>        break;
>
>     case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
> -      /* We don't actually query the hardware for this value; we keep track of
> -       * it in a software counter.  So just read the counter and store it in
> -       * the query object.
> -       */
> -      query->Base.Result = brw->sol.primitives_written;
> -      brw->sol.counting_primitives_written = false;
> +      write_xfb_primitives_written(intel, query->bo, 1);
>        break;
>
>     default:
> --
> 1.8.2.3
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20130520/c53c50e4/attachment-0001.html>