[Mesa-dev] [PATCH 2/2] i965: Optimize intel_batchbuffer_emit_dword().

Matt Turner mattst88 at gmail.com
Wed Jul 8 14:00:02 PDT 2015


By keeping a pointer to the next available location, we reduce the
number of memory accesses needed to write to the batchbuffer.

A net ~7k reduction of .text size, 7.5k of which is from the change to
intel_batchbuffer_emit_dword().

   text     data      bss      dec      hex  filename
4943740   195152    26192  5165084   4ed01c  i965_dri.so before
4936804   195152    26192  5158148   4eb504  i965_dri.so after

Combined with the previous patch, improves performance of Synmark
OglBatch7 by 4.05914% +/- 1.49373% (n=270) on Haswell.
---
Full disclosure: when testing on an IVB desktop, I measured a
regression in the same benchmark of -4.19005% +/- 1.15188% (n=30).
I don't have any explanation.

Testing on other benchmarks and especially on Atom systems would
be very welcome.

 src/mesa/drivers/dri/i965/brw_blorp.cpp            |  4 +--
 src/mesa/drivers/dri/i965/brw_context.h            |  5 ++--
 .../drivers/dri/i965/brw_performance_monitor.c     |  6 ++--
 src/mesa/drivers/dri/i965/brw_state_batch.c        |  4 +--
 src/mesa/drivers/dri/i965/brw_urb.c                |  6 ++--
 src/mesa/drivers/dri/i965/intel_batchbuffer.c      | 35 +++++++++++-----------
 src/mesa/drivers/dri/i965/intel_batchbuffer.h      | 11 ++++---
 7 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index 2ccfae1..eac1f00 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -226,7 +226,7 @@ retry:
    intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
    intel_batchbuffer_save_state(brw);
    drm_intel_bo *saved_bo = brw->batch.bo;
-   uint32_t saved_used = brw->batch.used;
+   uint32_t saved_used = USED_BATCH(brw->batch);
    uint32_t saved_state_batch_offset = brw->batch.state_batch_offset;
 
    switch (brw->gen) {
@@ -245,7 +245,7 @@ retry:
     * reserved enough space that a wrap will never happen.
     */
    assert(brw->batch.bo == saved_bo);
-   assert((brw->batch.used - saved_used) * 4 +
+   assert((USED_BATCH(brw->batch) - saved_used) * 4 +
           (saved_state_batch_offset - brw->batch.state_batch_offset) <
           estimated_max_batch_usage);
    /* Shut up compiler warnings on release build */
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index afb714b..c7ad35e 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -875,7 +875,8 @@ struct intel_batchbuffer {
 #ifdef DEBUG
    uint16_t emit, total;
 #endif
-   uint16_t used, reserved_space;
+   uint16_t reserved_space;
+   uint32_t *map_next;
    uint32_t *map;
    uint32_t *cpu_map;
 #define BATCH_SZ (8192*sizeof(uint32_t))
@@ -887,7 +888,7 @@ struct intel_batchbuffer {
    uint8_t pipe_controls_since_last_cs_stall;
 
    struct {
-      uint16_t used;
+      uint32_t *map_next;
       int reloc_count;
    } saved;
 };
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
index 0a12375..7e90e8a 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -710,7 +710,7 @@ emit_mi_report_perf_count(struct brw_context *brw,
    /* Make sure the commands to take a snapshot fits in a single batch. */
    intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
                                    RENDER_RING);
-   int batch_used = brw->batch.used;
+   int batch_used = USED_BATCH(brw->batch);
 
    /* Reports apparently don't always get written unless we flush first. */
    brw_emit_mi_flush(brw);
@@ -754,7 +754,7 @@ emit_mi_report_perf_count(struct brw_context *brw,
    brw_emit_mi_flush(brw);
 
    (void) batch_used;
-   assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
+   assert(USED_BATCH(brw->batch) - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
 }
 
 /**
@@ -1386,7 +1386,7 @@ void
 brw_perf_monitor_new_batch(struct brw_context *brw)
 {
    assert(brw->batch.ring == RENDER_RING);
-   assert(brw->gen < 6 || brw->batch.used == 0);
+   assert(brw->gen < 6 || USED_BATCH(brw->batch) == 0);
 
    if (brw->perfmon.oa_users == 0)
       return;
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index a405a80..d79e0ea 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -87,7 +87,7 @@ brw_annotate_aub(struct brw_context *brw)
    drm_intel_aub_annotation annotations[annotation_count];
    int a = 0;
    make_annotation(&annotations[a++], AUB_TRACE_TYPE_BATCH, 0,
-                   4*brw->batch.used);
+                   4 * USED_BATCH(brw->batch));
    for (int i = brw->state_batch_count; i-- > 0; ) {
       uint32_t type = brw->state_batch_list[i].type;
       uint32_t start_offset = brw->state_batch_list[i].offset;
@@ -136,7 +136,7 @@ __brw_state_batch(struct brw_context *brw,
     * space, then flush and try again.
     */
    if (batch->state_batch_offset < size ||
-       offset < 4*batch->used + batch->reserved_space) {
+       offset < 4 * USED_BATCH(*batch) + batch->reserved_space) {
       intel_batchbuffer_flush(brw);
       offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 6fcf1b0..6078c38 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -249,10 +249,10 @@ void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits1.cs_fence  = brw->urb.size;
 
    /* erratum: URB_FENCE must not cross a 64byte cacheline */
-   if ((brw->batch.used & 15) > 12) {
-      int pad = 16 - (brw->batch.used & 15);
+   if ((USED_BATCH(brw->batch) & 15) > 12) {
+      int pad = 16 - (USED_BATCH(brw->batch) & 15);
       do
-	 brw->batch.map[brw->batch.used++] = MI_NOOP;
+         *brw->batch.map_next++ = MI_NOOP;
       while (--pad);
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 54081a1..16876ed 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -57,6 +57,7 @@ intel_batchbuffer_init(struct brw_context *brw)
    if (!brw->has_llc) {
       brw->batch.cpu_map = malloc(BATCH_SZ);
       brw->batch.map = brw->batch.cpu_map;
+      brw->batch.map_next = brw->batch.cpu_map;
    }
 }
 
@@ -77,10 +78,10 @@ intel_batchbuffer_reset(struct brw_context *brw)
       drm_intel_bo_map(brw->batch.bo, true);
       brw->batch.map = brw->batch.bo->virtual;
    }
+   brw->batch.map_next = brw->batch.map;
 
    brw->batch.reserved_space = BATCH_RESERVED;
    brw->batch.state_batch_offset = brw->batch.bo->size;
-   brw->batch.used = 0;
    brw->batch.needs_sol_reset = false;
    brw->batch.pipe_controls_since_last_cs_stall = 0;
 
@@ -93,7 +94,7 @@ intel_batchbuffer_reset(struct brw_context *brw)
 void
 intel_batchbuffer_save_state(struct brw_context *brw)
 {
-   brw->batch.saved.used = brw->batch.used;
+   brw->batch.saved.map_next = brw->batch.map_next;
    brw->batch.saved.reloc_count =
       drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
 }
@@ -103,8 +104,8 @@ intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 {
    drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
 
-   brw->batch.used = brw->batch.saved.used;
-   if (brw->batch.used == 0)
+   brw->batch.map_next = brw->batch.saved.map_next;
+   if (USED_BATCH(brw->batch) == 0)
       brw->batch.ring = UNKNOWN_RING;
 }
 
@@ -133,7 +134,7 @@ do_batch_dump(struct brw_context *brw)
       drm_intel_decode_set_batch_pointer(decode,
 					 batch->bo->virtual,
 					 batch->bo->offset64,
-					 batch->used);
+                                         USED_BATCH(*batch));
    } else {
       fprintf(stderr,
 	      "WARNING: failed to map batchbuffer (%s), "
@@ -142,7 +143,7 @@ do_batch_dump(struct brw_context *brw)
       drm_intel_decode_set_batch_pointer(decode,
 					 batch->map,
 					 batch->bo->offset64,
-					 batch->used);
+                                         USED_BATCH(*batch));
    }
 
    drm_intel_decode_set_output_file(decode, stderr);
@@ -278,7 +279,7 @@ do_flush_locked(struct brw_context *brw)
    if (brw->has_llc) {
       drm_intel_bo_unmap(batch->bo);
    } else {
-      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
+      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
       if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
 	 ret = drm_intel_bo_subdata(batch->bo,
 				    batch->state_batch_offset,
@@ -303,11 +304,11 @@ do_flush_locked(struct brw_context *brw)
             brw_annotate_aub(brw);
 
 	 if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
-	    ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
-					flags);
+            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
+                                        NULL, 0, 0, flags);
 	 } else {
 	    ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
-						4 * batch->used, flags);
+                                                4 * USED_BATCH(*batch), flags);
 	 }
       }
 
@@ -331,7 +332,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 {
    int ret;
 
-   if (brw->batch.used == 0)
+   if (USED_BATCH(brw->batch) == 0)
       return 0;
 
    if (brw->throttle_batch[0] == NULL) {
@@ -340,7 +341,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
-      int bytes_for_commands = 4 * brw->batch.used;
+      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
       int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
       int total_bytes = bytes_for_commands + bytes_for_state;
       fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
@@ -356,7 +357,7 @@ _intel_batchbuffer_flush(struct brw_context *brw,
 
    /* Mark the end of the buffer. */
    intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
-   if (brw->batch.used & 1) {
+   if (USED_BATCH(brw->batch) & 1) {
       /* Round batchbuffer usage to 2 DWORDs. */
       intel_batchbuffer_emit_dword(brw, MI_NOOP);
    }
@@ -390,7 +391,7 @@ intel_batchbuffer_emit_reloc(struct brw_context *brw,
 {
    int ret;
 
-   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4 * USED_BATCH(brw->batch),
 				 buffer, delta,
 				 read_domains, write_domain);
    assert(ret == 0);
@@ -411,7 +412,7 @@ intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                                uint32_t read_domains, uint32_t write_domain,
 			       uint32_t delta)
 {
-   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
+   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4 * USED_BATCH(brw->batch),
                                      buffer, delta,
                                      read_domains, write_domain);
    assert(ret == 0);
@@ -435,8 +436,8 @@ intel_batchbuffer_data(struct brw_context *brw,
 {
    assert((bytes & 3) == 0);
    intel_batchbuffer_require_space(brw, bytes, ring);
-   memcpy(brw->batch.map + brw->batch.used, data, bytes);
-   brw->batch.used += bytes >> 2;
+   memcpy(brw->batch.map_next, data, bytes);
+   brw->batch.map_next += bytes >> 2;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index f0971e9..a12387b 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -63,6 +63,9 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                                     uint32_t read_domains,
                                     uint32_t write_domain,
                                     uint32_t offset);
+
+#define USED_BATCH(batch) ((uintptr_t)((batch).map_next - (batch).map))
+
 static inline uint32_t float_as_int(float f)
 {
    union {
@@ -83,7 +86,7 @@ static inline unsigned
 intel_batchbuffer_space(struct brw_context *brw)
 {
    return (brw->batch.state_batch_offset - brw->batch.reserved_space)
-      - brw->batch.used*4;
+      - USED_BATCH(brw->batch) * 4;
 }
 
 
@@ -93,7 +96,7 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
 #ifdef DEBUG
    assert(intel_batchbuffer_space(brw) >= 4);
 #endif
-   brw->batch.map[brw->batch.used++] = dword;
+   *brw->batch.map_next++ = dword;
    assert(brw->batch.ring != UNKNOWN_RING);
 }
 
@@ -135,7 +138,7 @@ intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
    intel_batchbuffer_require_space(brw, n * 4, ring);
 
 #ifdef DEBUG
-   brw->batch.emit = brw->batch.used;
+   brw->batch.emit = USED_BATCH(brw->batch);
    brw->batch.total = n;
 #endif
 }
@@ -145,7 +148,7 @@ intel_batchbuffer_advance(struct brw_context *brw)
 {
 #ifdef DEBUG
    struct intel_batchbuffer *batch = &brw->batch;
-   unsigned int _n = batch->used - batch->emit;
+   unsigned int _n = USED_BATCH(*batch) - batch->emit;
    assert(batch->total != 0);
    if (_n != batch->total) {
       fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",
-- 
2.3.6



More information about the mesa-dev mailing list