[Mesa-dev] [PATCH 05/10] i965: Rely on hardware contexts for query objects on Gen6+.

Kenneth Graunke kenneth at whitecape.org
Fri May 17 10:17:56 PDT 2013


Hardware contexts greatly simplify the query object code.  The pipeline
statistics counters get saved and restored with the context, which means
that we don't need to worry about other workloads polluting them.

This means that we can simply write a single pair of values (one at
BeginQuery and one at EndQuery) rather than a series of pairs.  This
also means we don't need to worry about the BO getting full.  We also
don't need to delay BO allocation and the starting snapshot until the first
draw.

The generation split here is a little off: technically, Ironlake can also
support hardware contexts.  However, the kernel currently doesn't support
them there, and even if it were to do so someday, we'd need to wait a
while before bumping the kernel requirement to take advantage of it.

Cc: Eric Anholt <eric at anholt.net>
Cc: Paul Berry <stereotype441 at gmail.com>
Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/Makefile.sources |   1 +
 src/mesa/drivers/dri/i965/brw_context.c    |   2 +
 src/mesa/drivers/dri/i965/brw_context.h    |   3 +
 src/mesa/drivers/dri/i965/brw_queryobj.c   |  83 ++-----
 src/mesa/drivers/dri/i965/gen6_queryobj.c  | 354 +++++++++++++++++++++++++++++
 5 files changed, 383 insertions(+), 60 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/gen6_queryobj.c

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index a0ffd3a..d67a5a4 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -104,6 +104,7 @@ i965_FILES = \
 	gen6_depthstencil.c \
 	gen6_gs_state.c \
         gen6_multisample_state.c \
+	gen6_queryobj.c \
 	gen6_sampler_state.c \
 	gen6_scissor_state.c \
 	gen6_sf_state.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 2f5fedb..beade5c 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -88,6 +88,8 @@ static void brwInitDriverFunctions(struct intel_screen *screen,
 
    brwInitFragProgFuncs( functions );
    brw_init_queryobj_functions(functions);
+   if (screen->gen >= 6)
+      gen6_reinit_queryobj_functions(functions);
 
    functions->QuerySamplesForFormat = brw_query_samples_for_format;
    functions->BeginTransformFeedback = brw_begin_transform_feedback;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 9baf57b..9ef6aca 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1164,6 +1164,9 @@ void brw_init_queryobj_functions(struct dd_function_table *functions);
 void brw_emit_query_begin(struct brw_context *brw);
 void brw_emit_query_end(struct brw_context *brw);
 
+/** gen6_queryobj.c */
+void gen6_reinit_queryobj_functions(struct dd_function_table *functions);
+
 /*======================================================================
  * brw_state_dump.c
  */
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index 40f926b..1c1e0b4 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -94,40 +94,21 @@ write_timestamp(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
 static void
 write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
 {
-   if (intel->gen >= 6) {
-      /* Emit Sandybridge workaround flush: */
-      if (intel->gen == 6)
-         intel_emit_post_sync_nonzero_flush(intel);
-
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-      OUT_BATCH(PIPE_CONTROL_DEPTH_STALL |
-                PIPE_CONTROL_WRITE_DEPTH_COUNT);
-      OUT_RELOC(query_bo,
-                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                PIPE_CONTROL_GLOBAL_GTT_WRITE |
-                (idx * sizeof(uint64_t)));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2) |
-                PIPE_CONTROL_DEPTH_STALL |
-                PIPE_CONTROL_WRITE_DEPTH_COUNT);
-      /* This object could be mapped cacheable, but we don't have an exposed
-       * mechanism to support that.  Since it's going uncached, tell GEM that
-       * we're writing to it.  The usual clflush should be all that's required
-       * to pick up the results.
-       */
-      OUT_RELOC(query_bo,
-                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                PIPE_CONTROL_GLOBAL_GTT_WRITE |
-                (idx * sizeof(uint64_t)));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
+   BEGIN_BATCH(4);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2) |
+             PIPE_CONTROL_DEPTH_STALL | PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   /* This object could be mapped cacheable, but we don't have an exposed
+    * mechanism to support that.  Since it's going uncached, tell GEM that
+    * we're writing to it.  The usual clflush should be all that's required
+    * to pick up the results.
+    */
+   OUT_RELOC(query_bo,
+             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             PIPE_CONTROL_GLOBAL_GTT_WRITE |
+             (idx * sizeof(uint64_t)));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
 }
 
 /**
@@ -165,36 +146,12 @@ brw_queryobj_get_results(struct gl_context *ctx,
       /* The query BO contains the starting and ending timestamps.
        * Subtract the two and convert to nanoseconds.
        */
-      if (intel->gen >= 6)
-	 query->Base.Result += 80 * (results[1] - results[0]);
-      else
-	 query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32));
+      query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32));
       break;
 
    case GL_TIMESTAMP:
       /* The query BO contains a single timestamp value in results[0]. */
-      if (intel->gen >= 6) {
-         /* Our timer is a clock that increments every 80ns (regardless of
-          * other clock scaling in the system).  The timestamp register we can
-          * read for glGetTimestamp() masks out the top 32 bits, so we do that
-          * here too to let the two counters be compared against each other.
-          *
-          * If we just multiplied that 32 bits of data by 80, it would roll
-          * over at a non-power-of-two, so an application couldn't use
-          * GL_QUERY_COUNTER_BITS to handle rollover correctly.  Instead, we
-          * report 36 bits and truncate at that (rolling over 5 times as often
-          * as the HW counter), and when the 32-bit counter rolls over, it
-          * happens to also be at a rollover in the reported value from near
-          * (1<<36) to 0.
-          *
-          * The low 32 bits rolls over in ~343 seconds.  Our 36-bit result
-          * rolls over every ~69 seconds.
-          */
-	 query->Base.Result = 80 * (results[0] & 0xffffffff);
-         query->Base.Result &= (1ull << 36) - 1;
-      } else {
-	 query->Base.Result = 1000 * (results[0] >> 32);
-      }
+      query->Base.Result = 1000 * (results[0] >> 32);
       break;
 
    case GL_SAMPLES_PASSED_ARB:
@@ -545,6 +502,9 @@ brw_emit_query_begin(struct brw_context *brw)
    struct gl_context *ctx = &intel->ctx;
    struct brw_query_object *query = brw->query.obj;
 
+   if (intel->hw_ctx)
+      return;
+
    /* Skip if we're not doing any queries, or we've already recorded the
     * initial query value for this batchbuffer.
     */
@@ -569,6 +529,9 @@ brw_emit_query_end(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
    struct brw_query_object *query = brw->query.obj;
 
+   if (intel->hw_ctx)
+      return;
+
    if (!brw->query.begin_emitted)
       return;
 
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
new file mode 100644
index 0000000..28af8d7
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric at anholt.net>
+ *    Kenneth Graunke <kenneth at whitecape.org>
+ */
+
+/** @file gen6_queryobj.c
+ *
+ * Support for query objects (GL_ARB_occlusion_query, GL_ARB_timer_query,
+ * GL_EXT_transform_feedback, and friends) on platforms that support
+ * hardware contexts (Gen6+).
+ */
+#include "main/imports.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_state.h"
+#include "intel_batchbuffer.h"
+#include "intel_reg.h"
+
+/**
+ * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer.
+ */
+static void
+write_timestamp(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
+{
+   /* Emit workaround flushes: */
+   if (intel->gen == 6) {
+      /* The timestamp write below is a non-zero post-sync op, which on
+       * Gen6 necessitates a CS stall.  CS stalls need stall at scoreboard
+       * set.  See the comments for intel_emit_post_sync_nonzero_flush().
+       */
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
+      OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   BEGIN_BATCH(5);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+   OUT_BATCH(PIPE_CONTROL_WRITE_TIMESTAMP);
+   OUT_RELOC(query_bo,
+             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             PIPE_CONTROL_GLOBAL_GTT_WRITE |
+             idx * sizeof(uint64_t));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+}
+
+/**
+ * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer.
+ */
+static void
+write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
+{
+   /* Emit Sandybridge workaround flush: */
+   if (intel->gen == 6)
+      intel_emit_post_sync_nonzero_flush(intel);
+
+   BEGIN_BATCH(5);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+   OUT_BATCH(PIPE_CONTROL_DEPTH_STALL |
+             PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   OUT_RELOC(query_bo,
+             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             PIPE_CONTROL_GLOBAL_GTT_WRITE |
+             (idx * sizeof(uint64_t)));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+}
+
+/**
+ * Wait on the query object's BO and calculate the final result.
+ */
+static void
+gen6_queryobj_get_results(struct gl_context *ctx,
+                          struct brw_query_object *query)
+{
+   struct intel_context *intel = intel_context(ctx);
+
+   if (query->bo == NULL)
+      return;
+
+   /* If the application has requested the query result, but this batch is
+    * still contributing to it, flush it now so the results will be present
+    * when mapped.
+    */
+   if (drm_intel_bo_references(intel->batch.bo, query->bo))
+      intel_batchbuffer_flush(intel);
+
+   if (unlikely(intel->perf_debug)) {
+      if (drm_intel_bo_busy(query->bo)) {
+         perf_debug("Stalling on the GPU waiting for a query object.\n");
+      }
+   }
+
+   drm_intel_bo_map(query->bo, false);
+   uint64_t *results = query->bo->virtual;
+   switch (query->Base.Target) {
+   case GL_TIME_ELAPSED:
+      /* The query BO contains the starting and ending timestamps.
+       * Subtract the two and convert to nanoseconds.
+       */
+      query->Base.Result += 80 * (results[1] - results[0]);
+      break;
+
+   case GL_TIMESTAMP:
+      /* Our timer is a clock that increments every 80ns (regardless of
+       * other clock scaling in the system).  The timestamp register we can
+       * read for glGetTimestamp() masks out the top 32 bits, so we do that
+       * here too to let the two counters be compared against each other.
+       *
+       * If we just multiplied that 32 bits of data by 80, it would roll
+       * over at a non-power-of-two, so an application couldn't use
+       * GL_QUERY_COUNTER_BITS to handle rollover correctly.  Instead, we
+       * report 36 bits and truncate at that (rolling over 5 times as often
+       * as the HW counter), and when the 32-bit counter rolls over, it
+       * happens to also be at a rollover in the reported value from near
+       * (1<<36) to 0.
+       *
+       * The low 32 bits roll over in ~343 seconds.  Our 36-bit result
+       * rolls over every ~69 seconds.
+       *
+       * The query BO contains a single timestamp value in results[0].
+       */
+      query->Base.Result = 80 * (results[0] & 0xffffffff);
+      query->Base.Result &= (1ull << 36) - 1;
+      break;
+
+   case GL_SAMPLES_PASSED_ARB:
+      query->Base.Result += results[1] - results[0];
+      break;
+
+   case GL_ANY_SAMPLES_PASSED:
+   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
+      query->Base.Result = results[0] != results[1];
+      break;
+
+   case GL_PRIMITIVES_GENERATED:
+   case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
+      /* We don't actually query the hardware for this value, so query->bo
+       * should always be NULL and execution should never reach here.
+       */
+      assert(!"Unreachable");
+      break;
+
+   default:
+      assert(!"Unrecognized query target in brw_queryobj_get_results()");
+      break;
+   }
+   drm_intel_bo_unmap(query->bo);
+
+   /* Now that we've processed the data stored in the query's buffer object,
+    * we can release it.
+    */
+   drm_intel_bo_unreference(query->bo);
+   query->bo = NULL;
+}
+
+/**
+ * Driver hook for glBeginQuery().
+ *
+ * Initializes driver structures and emits any GPU commands required to begin
+ * recording data for the query.
+ */
+static void
+gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   switch (query->Base.Target) {
+   case GL_TIME_ELAPSED:
+      /* For timestamp queries, we record the starting time right away so that
+       * we measure the full time between BeginQuery and EndQuery.  There's
+       * some debate about whether this is the right thing to do.  Our decision
+       * is based on the following text from the ARB_timer_query extension:
+       *
+       * "(5) Should the extension measure total time elapsed between the full
+       *      completion of the BeginQuery and EndQuery commands, or just time
+       *      spent in the graphics library?
+       *
+       *  RESOLVED:  This extension will measure the total time elapsed
+       *  between the full completion of these commands.  Future extensions
+       *  may implement a query to determine time elapsed at different stages
+       *  of the graphics pipeline."
+       *
+       * We write a starting timestamp now (at index 0).  At EndQuery() time,
+       * we'll write a second timestamp (at index 1), and subtract the two to
+       * obtain the time elapsed.  Notably, this includes time elapsed while
+       * the system was doing other work, such as running other applications.
+       */
+      drm_intel_bo_unreference(query->bo);
+      query->bo = drm_intel_bo_alloc(intel->bufmgr, "timer query", 4096, 4096);
+      write_timestamp(intel, query->bo, 0);
+      break;
+
+   case GL_ANY_SAMPLES_PASSED:
+   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
+   case GL_SAMPLES_PASSED_ARB:
+      /* Since we're starting a new query, we need to be sure to throw away
+       * any previous occlusion query results.
+       */
+      drm_intel_bo_unreference(query->bo);
+      query->bo = drm_intel_bo_alloc(intel->bufmgr, "occl. query", 4096, 4096);
+      write_depth_count(intel, query->bo, 0);
+      break;
+
+   case GL_PRIMITIVES_GENERATED:
+      /* We don't actually query the hardware for this value; we keep track of
+       * it in a software counter.  So just reset the counter.
+       */
+      brw->sol.primitives_generated = 0;
+      brw->sol.counting_primitives_generated = true;
+      break;
+
+   case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
+      /* We don't actually query the hardware for this value; we keep track of
+       * it in a software counter.  So just reset the counter.
+       */
+      brw->sol.primitives_written = 0;
+      brw->sol.counting_primitives_written = true;
+      break;
+
+   default:
+      assert(!"Unrecognized query target in brw_begin_query()");
+      break;
+   }
+}
+
+/**
+ * Driver hook for glEndQuery().
+ *
+ * Emits GPU commands to record a final query value, ending any data capturing.
+ * However, the final result isn't necessarily available until the GPU processes
+ * those commands.  gen6_queryobj_get_results() processes the captured data to
+ * produce the final result.
+ */
+static void
+gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   switch (query->Base.Target) {
+   case GL_TIME_ELAPSED:
+      write_timestamp(intel, query->bo, 1);
+      break;
+
+   case GL_ANY_SAMPLES_PASSED:
+   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
+   case GL_SAMPLES_PASSED_ARB:
+      write_depth_count(intel, query->bo, 1);
+      break;
+
+   case GL_PRIMITIVES_GENERATED:
+      /* We don't actually query the hardware for this value; we keep track of
+       * it in a software counter.  So just read the counter and store it in
+       * the query object.
+       */
+      query->Base.Result = brw->sol.primitives_generated;
+      brw->sol.counting_primitives_generated = false;
+      break;
+
+   case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
+      /* We don't actually query the hardware for this value; we keep track of
+       * it in a software counter.  So just read the counter and store it in
+       * the query object.
+       */
+      query->Base.Result = brw->sol.primitives_written;
+      brw->sol.counting_primitives_written = false;
+      break;
+
+   default:
+      assert(!"Unrecognized query target in brw_end_query()");
+      break;
+   }
+}
+
+/**
+ * The WaitQuery() driver hook.
+ *
+ * Wait for a query result to become available and return it.  This is the
+ * backing for glGetQueryObjectiv() with the GL_QUERY_RESULT pname.
+ */
+static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
+{
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   gen6_queryobj_get_results(ctx, query);
+   query->Base.Ready = true;
+}
+
+/**
+ * The CheckQuery() driver hook.
+ *
+ * Checks whether a query result is ready yet.  If not, flushes.
+ * This is the backing for glGetQueryObjectiv()'s QUERY_RESULT_AVAILABLE pname.
+ */
+static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   /* From the GL_ARB_occlusion_query spec:
+    *
+    *     "Instead of allowing for an infinite loop, performing a
+    *      QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is
+    *      not ready yet on the first time it is queried.  This ensures that
+    *      the async query will return true in finite time."
+    */
+   if (query->bo && drm_intel_bo_references(intel->batch.bo, query->bo))
+      intel_batchbuffer_flush(intel);
+
+   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
+      gen6_queryobj_get_results(ctx, query);
+      query->Base.Ready = true;
+   }
+}
+
+void gen6_reinit_queryobj_functions(struct dd_function_table *functions)
+{
+   functions->BeginQuery = gen6_begin_query;
+   functions->EndQuery = gen6_end_query;
+   functions->CheckQuery = gen6_check_query;
+   functions->WaitQuery = gen6_wait_query;
+}
-- 
1.8.2.3



More information about the mesa-dev mailing list