Mesa (master): i965: Rely on hardware contexts for query objects on Gen6+.

Mon May 20 20:03:43 UTC 2013

Module: Mesa
Branch: master
Commit: e32cd5ffbb7231f8d4bb44189492c89c9a4fbfa9
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e32cd5ffbb7231f8d4bb44189492c89c9a4fbfa9

Author: Kenneth Graunke <kenneth at whitecape.org>
Date:   Wed May 15 21:33:01 2013 -0700

i965: Rely on hardware contexts for query objects on Gen6+.

Hardware contexts greatly simplify the query object code.  The pipeline
statistics counters get saved and restored with the context, which means
that we don't need to worry about other workloads polluting them.

This means that we can simply write a single pair of values (one at
BeginQuery and one at EndQuery) rather than a series of pairs.  This
also means we don't need to worry about the BO getting full.  We also
don't need to delay BO allocation and starting snapshot until the first
draw.

The generation split here is a little off: technically, Ironlake can also
support hardware contexts.  However, the kernel currently doesn't, and
even if it were to do so someday, we'd need to wait a while before
bumping the kernel requirement to take advantage of it.

v2: Incorporate Paul's feedback.
- Clarify which functions are Gen4/5-only via assertions and comments.
- Change how driver hook initialization happens.
- Update comments.
- Squash a bug fix from a later commit here where it belongs.

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
Reviewed-by: Eric Anholt <eric at anholt.net> [v1]
Acked-by: Paul Berry <stereotype441 at gmail.com>

---

 src/mesa/drivers/dri/i965/Makefile.sources |    1 +
 src/mesa/drivers/dri/i965/brw_context.c    |    6 +-
 src/mesa/drivers/dri/i965/brw_context.h    |    6 +-
 src/mesa/drivers/dri/i965/brw_queryobj.c   |  124 ++++------
 src/mesa/drivers/dri/i965/gen6_queryobj.c  |  359 ++++++++++++++++++++++++++++
 5 files changed, 423 insertions(+), 73 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index a0ffd3a..d67a5a4 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -104,6 +104,7 @@ i965_FILES = \
 	gen6_depthstencil.c \
 	gen6_gs_state.c \
         gen6_multisample_state.c \
+	gen6_queryobj.c \
 	gen6_sampler_state.c \
 	gen6_scissor_state.c \
 	gen6_sf_state.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 2f5fedb..405580f 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -87,7 +87,11 @@ static void brwInitDriverFunctions(struct intel_screen *screen,
    intelInitDriverFunctions( functions );
 
    brwInitFragProgFuncs( functions );
-   brw_init_queryobj_functions(functions);
+   brw_init_common_queryobj_functions(functions);
+   if (screen->gen >= 6)
+      gen6_init_queryobj_functions(functions);
+   else
+      gen4_init_queryobj_functions(functions);
 
    functions->QuerySamplesForFormat = brw_query_samples_for_format;
    functions->BeginTransformFeedback = brw_begin_transform_feedback;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 9baf57b..4b43878 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1160,10 +1160,14 @@ void brw_workaround_depthstencil_alignment(struct brw_context *brw,
 /*======================================================================
  * brw_queryobj.c
  */
-void brw_init_queryobj_functions(struct dd_function_table *functions);
+void brw_init_common_queryobj_functions(struct dd_function_table *functions);
+void gen4_init_queryobj_functions(struct dd_function_table *functions);
 void brw_emit_query_begin(struct brw_context *brw);
 void brw_emit_query_end(struct brw_context *brw);
 
+/** gen6_queryobj.c */
+void gen6_init_queryobj_functions(struct dd_function_table *functions);
+
 /*======================================================================
  * brw_state_dump.c
  */
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index 40f926b..8579993 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -94,40 +94,23 @@ write_timestamp(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
 static void
 write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
 {
-   if (intel->gen >= 6) {
-      /* Emit Sandybridge workaround flush: */
-      if (intel->gen == 6)
-         intel_emit_post_sync_nonzero_flush(intel);
-
-      BEGIN_BATCH(5);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
-      OUT_BATCH(PIPE_CONTROL_DEPTH_STALL |
-                PIPE_CONTROL_WRITE_DEPTH_COUNT);
-      OUT_RELOC(query_bo,
-                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                PIPE_CONTROL_GLOBAL_GTT_WRITE |
-                (idx * sizeof(uint64_t)));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2) |
-                PIPE_CONTROL_DEPTH_STALL |
-                PIPE_CONTROL_WRITE_DEPTH_COUNT);
-      /* This object could be mapped cacheable, but we don't have an exposed
-       * mechanism to support that.  Since it's going uncached, tell GEM that
-       * we're writing to it.  The usual clflush should be all that's required
-       * to pick up the results.
-       */
-      OUT_RELOC(query_bo,
-                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                PIPE_CONTROL_GLOBAL_GTT_WRITE |
-                (idx * sizeof(uint64_t)));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   }
+   assert(intel->gen < 6);
+
+   BEGIN_BATCH(4);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2) |
+             PIPE_CONTROL_DEPTH_STALL | PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   /* This object could be mapped cacheable, but we don't have an exposed
+    * mechanism to support that.  Since it's going uncached, tell GEM that
+    * we're writing to it.  The usual clflush should be all that's required
+    * to pick up the results.
+    */
+   OUT_RELOC(query_bo,
+             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             PIPE_CONTROL_GLOBAL_GTT_WRITE |
+             (idx * sizeof(uint64_t)));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
 }
 
 /**
@@ -142,6 +125,8 @@ brw_queryobj_get_results(struct gl_context *ctx,
    int i;
    uint64_t *results;
 
+   assert(intel->gen < 6);
+
    if (query->bo == NULL)
       return;
 
@@ -165,36 +150,12 @@ brw_queryobj_get_results(struct gl_context *ctx,
       /* The query BO contains the starting and ending timestamps.
        * Subtract the two and convert to nanoseconds.
        */
-      if (intel->gen >= 6)
-	 query->Base.Result += 80 * (results[1] - results[0]);
-      else
-	 query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32));
+      query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32));
       break;
 
    case GL_TIMESTAMP:
       /* The query BO contains a single timestamp value in results[0]. */
-      if (intel->gen >= 6) {
-         /* Our timer is a clock that increments every 80ns (regardless of
-          * other clock scaling in the system).  The timestamp register we can
-          * read for glGetTimestamp() masks out the top 32 bits, so we do that
-          * here too to let the two counters be compared against each other.
-          *
-          * If we just multiplied that 32 bits of data by 80, it would roll
-          * over at a non-power-of-two, so an application couldn't use
-          * GL_QUERY_COUNTER_BITS to handle rollover correctly.  Instead, we
-          * report 36 bits and truncate at that (rolling over 5 times as often
-          * as the HW counter), and when the 32-bit counter rolls over, it
-          * happens to also be at a rollover in the reported value from near
-          * (1<<36) to 0.
-          *
-          * The low 32 bits rolls over in ~343 seconds.  Our 36-bit result
-          * rolls over every ~69 seconds.
-          */
-	 query->Base.Result = 80 * (results[0] & 0xffffffff);
-         query->Base.Result &= (1ull << 36) - 1;
-      } else {
-	 query->Base.Result = 1000 * (results[0] >> 32);
-      }
+      query->Base.Result = 1000 * (results[0] >> 32);
       break;
 
    case GL_SAMPLES_PASSED_ARB:
@@ -280,7 +241,7 @@ brw_delete_query(struct gl_context *ctx, struct gl_query_object *q)
 }
 
 /**
- * Driver hook for glBeginQuery().
+ * Gen4-5 driver hook for glBeginQuery().
  *
  * Initializes driver structures and emits any GPU commands required to begin
  * recording data for the query.
@@ -292,6 +253,8 @@ brw_begin_query(struct gl_context *ctx, struct gl_query_object *q)
    struct intel_context *intel = intel_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
+   assert(intel->gen < 6);
+
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED_EXT:
       /* For timestamp queries, we record the starting time right away so that
@@ -365,7 +328,7 @@ brw_begin_query(struct gl_context *ctx, struct gl_query_object *q)
 }
 
 /**
- * Driver hook for glEndQuery().
+ * Gen4-5 driver hook for glEndQuery().
  *
  * Emits GPU commands to record a final query value, ending any data capturing.
  * However, the final result isn't necessarily available until the GPU processes
@@ -379,6 +342,8 @@ brw_end_query(struct gl_context *ctx, struct gl_query_object *q)
    struct intel_context *intel = intel_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
+   assert(intel->gen < 6);
+
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED_EXT:
       /* Write the final timestamp. */
@@ -450,7 +415,7 @@ brw_end_query(struct gl_context *ctx, struct gl_query_object *q)
 }
 
 /**
- * The WaitQuery() driver hook.
+ * The Gen4-5 WaitQuery() driver hook.
  *
  * Wait for a query result to become available and return it.  This is the
  * backing for glGetQueryObjectiv() with the GL_QUERY_RESULT pname.
@@ -459,12 +424,14 @@ static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q)
 {
    struct brw_query_object *query = (struct brw_query_object *)q;
 
+   assert(intel_context(ctx)->gen < 6);
+
    brw_queryobj_get_results(ctx, query);
    query->Base.Ready = true;
 }
 
 /**
- * The CheckQuery() driver hook.
+ * The Gen4-5 CheckQuery() driver hook.
  *
  * Checks whether a query result is ready yet.  If not, flushes.
  * This is the backing for glGetQueryObjectiv()'s QUERY_RESULT_AVAILABLE pname.
@@ -474,6 +441,8 @@ static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q)
    struct intel_context *intel = intel_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
+   assert(intel->gen < 6);
+
    /* From the GL_ARB_occlusion_query spec:
     *
     *     "Instead of allowing for an infinite loop, performing a
@@ -501,6 +470,8 @@ ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query)
 {
    struct intel_context *intel = intel_context(ctx);
 
+   assert(intel->gen < 6);
+
    if (!query->bo || query->last_index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
 
       if (query->bo != NULL) {
@@ -534,9 +505,7 @@ ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query)
  * produces the final expected value.
  *
  * In a world with hardware contexts, PS_DEPTH_COUNT is saved and restored
- * as part of the context state, so this is unnecessary.  We could simply
- * read two values and subtract them.  However, it's safe to continue using
- * the old approach.
+ * as part of the context state, so this is unnecessary, and skipped.
  */
 void
 brw_emit_query_begin(struct brw_context *brw)
@@ -545,6 +514,9 @@ brw_emit_query_begin(struct brw_context *brw)
    struct gl_context *ctx = &intel->ctx;
    struct brw_query_object *query = brw->query.obj;
 
+   if (intel->hw_ctx)
+      return;
+
    /* Skip if we're not doing any queries, or we've already recorded the
     * initial query value for this batchbuffer.
     */
@@ -559,7 +531,8 @@ brw_emit_query_begin(struct brw_context *brw)
 }
 
 /**
- * Called at batchbuffer flush to get an ending PS_DEPTH_COUNT.
+ * Called at batchbuffer flush to get an ending PS_DEPTH_COUNT
+ * (for non-hardware context platforms).
  *
  * See the explanation in brw_emit_query_begin().
  */
@@ -569,6 +542,9 @@ brw_emit_query_end(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
    struct brw_query_object *query = brw->query.obj;
 
+   if (intel->hw_ctx)
+      return;
+
    if (!brw->query.begin_emitted)
       return;
 
@@ -619,14 +595,20 @@ brw_get_timestamp(struct gl_context *ctx)
    return result;
 }
 
-void brw_init_queryobj_functions(struct dd_function_table *functions)
+/* Initialize query object functions used on all generations. */
+void brw_init_common_queryobj_functions(struct dd_function_table *functions)
 {
    functions->NewQueryObject = brw_new_query_object;
    functions->DeleteQuery = brw_delete_query;
+   functions->QueryCounter = brw_query_counter;
+   functions->GetTimestamp = brw_get_timestamp;
+}
+
+/* Initialize Gen4/5-specific query object functions. */
+void gen4_init_queryobj_functions(struct dd_function_table *functions)
+{
    functions->BeginQuery = brw_begin_query;
    functions->EndQuery = brw_end_query;
-   functions->QueryCounter = brw_query_counter;
    functions->CheckQuery = brw_check_query;
    functions->WaitQuery = brw_wait_query;
-   functions->GetTimestamp = brw_get_timestamp;
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
new file mode 100644
index 0000000..3f2ed00
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric at anholt.net>
+ *    Kenneth Graunke <kenneth at whitecape.org>
+ */
+
+/** @file gen6_queryobj.c
+ *
+ * Support for query objects (GL_ARB_occlusion_query, GL_ARB_timer_query,
+ * GL_EXT_transform_feedback, and friends) on platforms that support
+ * hardware contexts (Gen6+).
+ */
+#include "main/imports.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_state.h"
+#include "intel_batchbuffer.h"
+#include "intel_reg.h"
+
+/**
+ * Emit PIPE_CONTROLs writing the GPU timestamp to 64-bit slot idx of query_bo.
+ */
+static void
+write_timestamp(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
+{
+   /* Emit workaround flushes: */
+   if (intel->gen == 6) {
+      /* The timestamp write below is a non-zero post-sync op, which on
+       * Gen6 necessitates a CS stall.  A CS stall in turn needs the stall
+       * at scoreboard bit set.  See intel_emit_post_sync_nonzero_flush().
+       */
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
+      OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      ADVANCE_BATCH();
+   }
+
+   BEGIN_BATCH(5);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+   OUT_BATCH(PIPE_CONTROL_WRITE_TIMESTAMP);
+   OUT_RELOC(query_bo,
+             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             PIPE_CONTROL_GLOBAL_GTT_WRITE |
+             idx * sizeof(uint64_t));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+}
+
+/**
+ * Emit PIPE_CONTROLs writing PS_DEPTH_COUNT to 64-bit slot idx of query_bo.
+ */
+static void
+write_depth_count(struct intel_context *intel, drm_intel_bo *query_bo, int idx)
+{
+   /* Emit Sandybridge (Gen6) workaround flush: */
+   if (intel->gen == 6)
+      intel_emit_post_sync_nonzero_flush(intel);
+
+   BEGIN_BATCH(5);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
+   OUT_BATCH(PIPE_CONTROL_DEPTH_STALL |
+             PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   OUT_RELOC(query_bo,
+             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+             PIPE_CONTROL_GLOBAL_GTT_WRITE |
+             (idx * sizeof(uint64_t)));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+}
+
+/**
+ * Map the query BO (flushing/stalling as needed) and compute the final result.
+ */
+static void
+gen6_queryobj_get_results(struct gl_context *ctx,
+                          struct brw_query_object *query)
+{
+   struct intel_context *intel = intel_context(ctx);
+
+   if (query->bo == NULL)
+      return;
+
+   /* If the application has requested the query result, but this batch is
+    * still contributing to it, flush it now so the results will be present
+    * when mapped.
+    */
+   if (drm_intel_bo_references(intel->batch.bo, query->bo))
+      intel_batchbuffer_flush(intel);
+
+   if (unlikely(intel->perf_debug)) {
+      if (drm_intel_bo_busy(query->bo)) {
+         perf_debug("Stalling on the GPU waiting for a query object.\n");
+      }
+   }
+
+   drm_intel_bo_map(query->bo, false);
+   uint64_t *results = query->bo->virtual;
+   switch (query->Base.Target) {
+   case GL_TIME_ELAPSED:
+      /* The query BO contains the starting and ending timestamps.
+       * Subtract the two and convert to nanoseconds.
+       */
+      query->Base.Result += 80 * (results[1] - results[0]);
+      break;
+
+   case GL_TIMESTAMP:
+      /* Our timer is a clock that increments every 80ns (regardless of
+       * other clock scaling in the system).  The timestamp register we can
+       * read for glGetTimestamp() masks out the top 32 bits, so we do that
+       * here too to let the two counters be compared against each other.
+       *
+       * If we just multiplied that 32 bits of data by 80, it would roll
+       * over at a non-power-of-two, so an application couldn't use
+       * GL_QUERY_COUNTER_BITS to handle rollover correctly.  Instead, we
+       * report 36 bits and truncate at that (rolling over 5 times as often
+       * as the HW counter), and when the 32-bit counter rolls over, it
+       * happens to also be at a rollover in the reported value from near
+       * (1<<36) to 0.
+       *
+       * The low 32 bits rolls over in ~343 seconds.  Our 36-bit result
+       * rolls over every ~69 seconds.
+       *
+       * The query BO contains a single timestamp value in results[0].
+       */
+      query->Base.Result = 80 * (results[0] & 0xffffffff);
+      query->Base.Result &= (1ull << 36) - 1;
+      break;
+
+   case GL_SAMPLES_PASSED_ARB:
+      /* We need to use += rather than = here since some BLT-based operations
+       * may have added additional samples to our occlusion query value.
+       */
+      query->Base.Result += results[1] - results[0];
+      break;
+
+   case GL_ANY_SAMPLES_PASSED:
+   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
+      if (results[0] != results[1])
+         query->Base.Result = true;
+      break;
+
+   case GL_PRIMITIVES_GENERATED:
+   case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
+      /* We don't actually query the hardware for this value, so query->bo
+       * should always be NULL and execution should never reach here.
+       */
+      assert(!"Unreachable");
+      break;
+
+   default:
+      assert(!"Unrecognized query target in gen6_queryobj_get_results()");
+      break;
+   }
+   drm_intel_bo_unmap(query->bo);
+
+   /* Now that we've processed the data stored in the query's buffer object,
+    * we can release it.
+    */
+   drm_intel_bo_unreference(query->bo);
+   query->bo = NULL;
+}
+
+/**
+ * Gen6+ driver hook for glBeginQuery().
+ *
+ * Initializes driver structures and emits any GPU commands required to begin
+ * recording data for the query.
+ */
+static void
+gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   switch (query->Base.Target) {
+   case GL_TIME_ELAPSED:
+      /* For timestamp queries, we record the starting time right away so that
+       * we measure the full time between BeginQuery and EndQuery.  There's
+       * some debate about whether this is the right thing to do.  Our decision
+       * is based on the following text from the ARB_timer_query extension:
+       *
+       * "(5) Should the extension measure total time elapsed between the full
+       *      completion of the BeginQuery and EndQuery commands, or just time
+       *      spent in the graphics library?
+       *
+       *  RESOLVED:  This extension will measure the total time elapsed
+       *  between the full completion of these commands.  Future extensions
+       *  may implement a query to determine time elapsed at different stages
+       *  of the graphics pipeline."
+       *
+       * We write a starting timestamp now (at index 0).  At EndQuery() time,
+       * we'll write a second timestamp (at index 1), and subtract the two to
+       * obtain the time elapsed.  Notably, this includes time elapsed while
+       * the system was doing other work, such as running other applications.
+       */
+      drm_intel_bo_unreference(query->bo);
+      query->bo = drm_intel_bo_alloc(intel->bufmgr, "timer query", 4096, 4096);
+      write_timestamp(intel, query->bo, 0);
+      break;
+
+   case GL_ANY_SAMPLES_PASSED:
+   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
+   case GL_SAMPLES_PASSED_ARB:
+      /* Since we're starting a new query, we need to be sure to throw away
+       * any previous occlusion query results.
+       */
+      drm_intel_bo_unreference(query->bo);
+      query->bo = drm_intel_bo_alloc(intel->bufmgr, "occl. query", 4096, 4096);
+      write_depth_count(intel, query->bo, 0);
+      break;
+
+   case GL_PRIMITIVES_GENERATED:
+      /* We don't actually query the hardware for this value; we keep track of
+       * it in a software counter.  So just reset the counter.
+       */
+      brw->sol.primitives_generated = 0;
+      brw->sol.counting_primitives_generated = true;
+      break;
+
+   case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
+      /* We don't actually query the hardware for this value; we keep track of
+       * it in a software counter.  So just reset the counter.
+       */
+      brw->sol.primitives_written = 0;
+      brw->sol.counting_primitives_written = true;
+      break;
+
+   default:
+      assert(!"Unrecognized query target in gen6_begin_query()");
+      break;
+   }
+}
+
+/**
+ * Gen6+ driver hook for glEndQuery().
+ *
+ * Emits GPU commands to record a final query value, ending any data capturing.
+ * However, the final result isn't necessarily available until the GPU processes
+ * those commands.  gen6_queryobj_get_results() processes the captured data to
+ * produce the final result.
+ */
+static void
+gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   switch (query->Base.Target) {
+   case GL_TIME_ELAPSED:
+      write_timestamp(intel, query->bo, 1);
+      break;
+
+   case GL_ANY_SAMPLES_PASSED:
+   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
+   case GL_SAMPLES_PASSED_ARB:
+      write_depth_count(intel, query->bo, 1);
+      break;
+
+   case GL_PRIMITIVES_GENERATED:
+      /* We don't actually query the hardware for this value; we keep track of
+       * it in a software counter.  So just read the counter and store it in
+       * the query object.
+       */
+      query->Base.Result = brw->sol.primitives_generated;
+      brw->sol.counting_primitives_generated = false;
+      break;
+
+   case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
+      /* We don't actually query the hardware for this value; we keep track of
+       * it in a software counter.  So just read the counter and store it in
+       * the query object.
+       */
+      query->Base.Result = brw->sol.primitives_written;
+      brw->sol.counting_primitives_written = false;
+      break;
+
+   default:
+      assert(!"Unrecognized query target in gen6_end_query()");
+      break;
+   }
+}
+
+/**
+ * The Gen6+ WaitQuery() driver hook.
+ *
+ * Wait for a query result to become available and return it.  This is the
+ * backing for glGetQueryObjectiv() with the GL_QUERY_RESULT pname.
+ */
+static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
+{
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   gen6_queryobj_get_results(ctx, query);
+   query->Base.Ready = true;
+}
+
+/**
+ * The Gen6+ CheckQuery() driver hook.
+ *
+ * Checks whether a query result is ready yet.  If not, flushes.
+ * This is the backing for glGetQueryObjectiv()'s QUERY_RESULT_AVAILABLE pname.
+ */
+static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   /* From the GL_ARB_occlusion_query spec:
+    *
+    *     "Instead of allowing for an infinite loop, performing a
+    *      QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is
+    *      not ready yet on the first time it is queried.  This ensures that
+    *      the async query will return true in finite time."
+    */
+   if (query->bo && drm_intel_bo_references(intel->batch.bo, query->bo))
+      intel_batchbuffer_flush(intel);
+
+   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
+      gen6_queryobj_get_results(ctx, query);
+      query->Base.Ready = true;
+   }
+}
+
+/* Install the Gen6+ BeginQuery/EndQuery/CheckQuery/WaitQuery driver hooks. */
+void gen6_init_queryobj_functions(struct dd_function_table *functions)
+{
+   functions->BeginQuery = gen6_begin_query;
+   functions->EndQuery = gen6_end_query;
+   functions->CheckQuery = gen6_check_query;
+   functions->WaitQuery = gen6_wait_query;
+}