[Mesa-dev] [PATCH 10/10] i965: Implement transform feedback query support in hardware on Gen6+.

Fri May 17 10:18:01 PDT 2013

Now that we have hardware contexts and can use MI_STORE_REGISTER_MEM,
we can use the GPU's pipeline statistics counters rather than going out
of our way to count primitives in software.

Aside from being simpler, this also paves the way for Geometry Shaders,
which can output an arbitrary number of primitives on the GPU.

The GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN query is easy: it
corresponds to the SO_NUM_PRIMS_WRITTEN/SO_NUM_PRIMS_WRITTEN0_IVB
counters.

The GL_PRIMITIVES_GENERATED query is trickier.  Gen provides several
statistics registers which /almost/ match the semantics required:
- IA_PRIMITIVES_COUNT
  The number of primitives fetched by the VF or IA (input assembler).
  This undercounts when the GS is enabled, as the GS can output many
  primitives.
- GS_PRIMITIVES_COUNT
  The number of primitives output by the GS.  Unfortunately, this
  doesn't increment unless the GS unit is actually enabled, and it
  usually isn't.
- SO_PRIM_STORAGE_NEEDED*_IVB
  The amount of space needed to write primitives output by transform
  feedback.  These counters naturally only work when transform feedback
  is on.  We'd also have to sum the counters for all four streams.
- CL_INVOCATION_COUNT
  The number of primitives processed by the clipper.  This doesn't work
  if the GS or SOL throws away primitives for rasterizer discard.
  However, it does increment even if the clipper is in REJECT_ALL mode.

Dynamically switching between counters would be painfully complicated,
especially since GS, rasterizer discard, and transform feedback can all
be switched on and off repeatedly during a single query.

The most usable counter is CL_INVOCATION_COUNT.  The previous two
patches reworked rasterizer discard support so that all primitives hit
the clipper, making this work.

Cc: Eric Anholt <eric at anholt.net>
Cc: Paul Berry <stereotype441 at gmail.com>
Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/gen6_queryobj.c | 105 +++++++++++++++++++-----------
 1 file changed, 66 insertions(+), 39 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index 28af8d7..a032227 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -94,6 +94,57 @@ write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
    ADVANCE_BATCH();
 }
 
+/*
+ * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
+ *
+ * Only TIMESTAMP and PS_DEPTH_COUNT have special PIPE_CONTROL support; other
+ * counters have to be read via the generic MI_STORE_REGISTER_MEM.  This
+ * function also performs a pipeline flush for proper synchronization.
+ */
+static void
+write_reg(struct intel_context *intel,
+          drm_intel_bo *query_bo, uint32_t reg, int idx)
+{
+   assert(intel->gen >= 6);
+
+   intel_batchbuffer_emit_mi_flush(intel);
+
+   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
+    * read a full 64-bit register, we need to do two of them.
+    */
+   BEGIN_BATCH(3);
+   OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
+   OUT_BATCH(reg);
+   OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+             idx * sizeof(uint64_t));
+   ADVANCE_BATCH();
+
+   BEGIN_BATCH(3);
+   OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
+   OUT_BATCH(reg + sizeof(uint32_t));
+   OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+             sizeof(uint32_t) + idx * sizeof(uint64_t));
+   ADVANCE_BATCH();
+}
+
+static void
+write_primitives_generated(struct intel_context *intel,
+                           drm_intel_bo *query_bo, int idx)
+{
+   write_reg(intel, query_bo, CL_INVOCATION_COUNT, idx);
+}
+
+static void
+write_xfb_primitives_written(struct intel_context *intel,
+                             drm_intel_bo *query_bo, int idx)
+{
+   if (intel->gen >= 7) {
+      write_reg(intel, query_bo, SO_NUM_PRIMS_WRITTEN0_IVB, idx);
+   } else {
+      write_reg(intel, query_bo, SO_NUM_PRIMS_WRITTEN, idx);
+   }
+}
+
 /**
  * Wait on the query object's BO and calculate the final result.
  */
@@ -152,21 +203,20 @@ gen6_queryobj_get_results(struct gl_context *ctx,
       query->Base.Result &= (1ull << 36) - 1;
       break;
 
-   case GL_SAMPLES_PASSED_ARB:
-      query->Base.Result += results[1] - results[0];
-      break;
-
    case GL_ANY_SAMPLES_PASSED:
    case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
-      query->Base.Result = results[0] != results[1];
+      if (results[0] != results[1])
+         query->Base.Result = true;
       break;
 
+   case GL_SAMPLES_PASSED_ARB:
    case GL_PRIMITIVES_GENERATED:
    case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-      /* We don't actually query the hardware for this value, so query->bo
-       * should always be NULL and execution should never reach here.
+      /* We need to use += rather than = here since some BLT-based operations
+       * may have added additional samples to our occlusion query value.
+       * It shouldn't matter for geometry queries, but is harmless.
        */
-      assert(!"Unreachable");
+      query->Base.Result += results[1] - results[0];
       break;
 
    default:
@@ -191,10 +241,13 @@ gen6_queryobj_get_results(struct gl_context *ctx,
 static void
 gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
 {
-   struct brw_context *brw = brw_context(ctx);
    struct intel_context *intel = intel_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
+   /* Since we're starting a new query, we need to throw away old results. */
+   drm_intel_bo_unreference(query->bo);
+   query->bo = drm_intel_bo_alloc(intel->bufmgr, "query results", 4096, 4096);
+
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED:
       /* For timestamp queries, we record the starting time right away so that
@@ -216,36 +269,21 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
        * obtain the time elapsed.  Notably, this includes time elapsed while
        * the system was doing other work, such as running other applications.
        */
-      drm_intel_bo_unreference(query->bo);
-      query->bo = drm_intel_bo_alloc(intel->bufmgr, "timer query", 4096, 4096);
       write_timestamp(intel, query->bo, 0);
       break;
 
    case GL_ANY_SAMPLES_PASSED:
    case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
    case GL_SAMPLES_PASSED_ARB:
-      /* Since we're starting a new query, we need to be sure to throw away
-       * any previous occlusion query results.
-       */
-      drm_intel_bo_unreference(query->bo);
-      query->bo = drm_intel_bo_alloc(intel->bufmgr, "occl. query", 4096, 4096);
       write_depth_count(intel, query->bo, 0);
       break;
 
    case GL_PRIMITIVES_GENERATED:
-      /* We don't actually query the hardware for this value; we keep track of
-       * it a software counter.  So just reset the counter.
-       */
-      brw->sol.primitives_generated = 0;
-      brw->sol.counting_primitives_generated = true;
+      write_primitives_generated(intel, query->bo, 0);
       break;
 
    case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-      /* We don't actually query the hardware for this value; we keep track of
-       * it a software counter.  So just reset the counter.
-       */
-      brw->sol.primitives_written = 0;
-      brw->sol.counting_primitives_written = true;
+      write_xfb_primitives_written(intel, query->bo, 0);
       break;
 
    default:
@@ -265,7 +303,6 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
 static void
 gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
 {
-   struct brw_context *brw = brw_context(ctx);
    struct intel_context *intel = intel_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
@@ -281,21 +318,11 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
       break;
 
    case GL_PRIMITIVES_GENERATED:
-      /* We don't actually query the hardware for this value; we keep track of
-       * it in a software counter.  So just read the counter and store it in
-       * the query object.
-       */
-      query->Base.Result = brw->sol.primitives_generated;
-      brw->sol.counting_primitives_generated = false;
+      write_primitives_generated(intel, query->bo, 1);
       break;
 
    case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-      /* We don't actually query the hardware for this value; we keep track of
-       * it in a software counter.  So just read the counter and store it in
-       * the query object.
-       */
-      query->Base.Result = brw->sol.primitives_written;
-      brw->sol.counting_primitives_written = false;
+      write_xfb_primitives_written(intel, query->bo, 1);
       break;
 
    default:
-- 
1.8.2.3