[Mesa-dev] [PATCH] intel: Refine the swapbuffers throttling by emitting a small no-op batch.

Thu Feb 16 23:46:38 PST 2012

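Improves citybench performance by 39.8638% +/- 2.26632% (n=10) at the
default resolution, and by about the same amount at 320x240.

Instead of recording the first batch flushed after the last swap, emit
and flush a tiny batch containing just an MI_NOOP whenever we throttle,
and wait on that batch at the next throttle point.  This brings the
point we wait on closer to the swapbuffers, which helps apps that only
emit one or a few batchbuffers per frame.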
---
 src/mesa/drivers/dri/intel/intel_batchbuffer.c |    5 -----
 src/mesa/drivers/dri/intel/intel_context.c     |   21 +++++++++++++++++----
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index d10e008..5631c19 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -211,11 +211,6 @@ _intel_batchbuffer_flush(struct intel_context *intel,
    if (intel->batch.used == 0)
       return 0;
 
-   if (intel->first_post_swapbuffers_batch == NULL) {
-      intel->first_post_swapbuffers_batch = intel->batch.bo;
-      drm_intel_bo_reference(intel->first_post_swapbuffers_batch);
-   }
-
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
       fprintf(stderr, "%s:%d: Batchbuffer flush with %db used\n", file, line,
 	      4*intel->batch.used);
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 377bcbc..3d3feb6 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -419,18 +419,31 @@ intel_prepare_render(struct intel_context *intel)
     * We're using intelDRI2Flush (called from the loader before
     * swapbuffer) and glFlush (for front buffer rendering) as the
     * indicator that a frame is done and then throttle when we get
-    * here as we prepare to render the next frame.  At this point for
+    * here as we prepare to render the next frame.  At this point our
     * round trips for swap/copy and getting new buffers are done and
-    * we'll spend less time waiting on the GPU.
+    * we'll spend less time waiting on the GPU when we throttle here.
     *
     * Unfortunately, we don't have a handle to the batch containing
     * the swap, and getting our hands on that doesn't seem worth it,
-    * so we just us the first batch we emitted after the last swap.
+    * so we just use the first batch we emitted after the last swap.
+    * To bring the point we wait on closer to the swapbuffers, we make
+    * a tiny batchbuffer containing just a noop and flush it out.  The
+    * overhead of it is worth the savings on apps that only emit one or
+    * a few batchbuffers per frame.
     */
    if (intel->need_throttle && intel->first_post_swapbuffers_batch) {
       drm_intel_bo_wait_rendering(intel->first_post_swapbuffers_batch);
       drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
-      intel->first_post_swapbuffers_batch = NULL;
+
+      BEGIN_BATCH(1);
+      OUT_BATCH(MI_NOOP);
+      ADVANCE_BATCH();
+
+      intel->first_post_swapbuffers_batch = intel->batch.bo;
+      drm_intel_bo_reference(intel->batch.bo);
+
+      intel_batchbuffer_flush(intel);
+
       intel->need_throttle = false;
    }
 }
-- 
1.7.9