[Mesa-stable] [PATCH 5/6] i965: Avoid problems from referencing orphaned BOs after growing.

Wed Nov 29 00:13:20 UTC 2017

Growing the batch/state buffer is a lot more dangerous than I thought.

A number of places emit multiple state buffer sections, and then write
data to the returned pointer, or save a pointer to brw->batch.state.bo
and then use it in relocations.  If each call can grow, this can result
in stale map references or stale BO pointers.  Furthermore, fences refer
to the old batch BO, and that reference needs to continue working.

To avoid these woes, we avoid ever swapping the brw->batch.*.bo pointer,
instead exchanging the brw_bo structures in place.  That way, stale BO
references are fine - the GEM handle changes, but the brw_bo pointer
doesn't.  We also defer the memcpy until a quiescent point, so callers
can write to the returned pointer - which may be in either BO - and
we'll sort it out and combine the two properly in the end.

Fixes GPU hangs in DiRT Rally.

Fixes: 2dfc119f22f257082ab0 "i965: Grow the batch/state buffers if we need space and can't flush."
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103101
---
 src/mesa/drivers/dri/i965/brw_context.h       |   2 +
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 138 ++++++++++++++++++++------
 2 files changed, 107 insertions(+), 33 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 06704838061..09a03f4886b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -473,6 +473,8 @@ struct brw_growing_bo {
    struct brw_bo *bo;
    uint32_t *map;
    uint32_t *cpu_map;
+   struct brw_bo *partial_bo;
+   unsigned partial_bytes;
 };
 
 struct intel_batchbuffer {
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index b30ffe5d7f6..12d165d7236 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -183,6 +183,8 @@ intel_batchbuffer_reset(struct brw_context *brw)
    batch->last_bo = batch->batch.bo;
 
    batch->batch.bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
+   batch->batch.partial_bo = 0;
+   batch->batch.partial_bytes = 0;
    if (!batch->batch.cpu_map) {
       batch->batch.map =
          brw_bo_map(brw, batch->batch.bo, MAP_READ | MAP_WRITE);
@@ -192,6 +194,8 @@ intel_batchbuffer_reset(struct brw_context *brw)
    batch->state.bo = brw_bo_alloc(bufmgr, "statebuffer", STATE_SZ, 4096);
    batch->state.bo->kflags =
       can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
+   batch->state.partial_bo = NULL;
+   batch->state.partial_bytes = 0;
    if (!batch->state.cpu_map) {
       batch->state.map =
          brw_bo_map(brw, batch->state.bo, MAP_READ | MAP_WRITE);
@@ -270,6 +274,29 @@ intel_batchbuffer_free(struct intel_batchbuffer *batch)
       _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
 }
 
+/**
+ * Finish copying the old batch/state buffer's contents to the new one
+ * after we tried to "grow" the buffer in an earlier operation.
+ */
+static void
+finish_growing_bos(struct brw_growing_bo *grow)
+{
+   struct brw_bo *old_bo = grow->partial_bo;
+   if (!old_bo)
+      return;
+
+   if (!grow->cpu_map) {
+      void *old_map = old_bo->map_cpu ? old_bo->map_cpu :
+                      (old_bo->map_wc ? old_bo->map_wc : old_bo->map_gtt);
+      memcpy(grow->map, old_map, grow->partial_bytes);
+   }
+
+   grow->partial_bo = NULL;
+   grow->partial_bytes = 0;
+
+   brw_bo_unreference(old_bo);
+}
+
 static void
 replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
                          uint32_t old_handle, uint32_t new_handle)
@@ -291,30 +318,31 @@ replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
  */
 static void
 grow_buffer(struct brw_context *brw,
-            struct brw_bo **bo_ptr,
-            uint32_t **map_ptr,
-            uint32_t **cpu_map_ptr,
+            struct brw_growing_bo *grow,
             unsigned existing_bytes,
             unsigned new_size)
 {
    struct intel_batchbuffer *batch = &brw->batch;
    struct brw_bufmgr *bufmgr = brw->bufmgr;
+   struct brw_bo *bo = grow->bo;
 
-   uint32_t *old_map = *map_ptr;
-   struct brw_bo *old_bo = *bo_ptr;
+   perf_debug("Growing %s - ran out of space\n", bo->name);
 
-   struct brw_bo *new_bo =
-      brw_bo_alloc(bufmgr, old_bo->name, new_size, old_bo->align);
-   uint32_t *new_map;
+   if (grow->partial_bo) {
+      /* We've already grown once, and now we need to do it again.
+       * Finish our last grow operation so we can start a new one.
+       * This should basically never happen.
+       */
+      perf_debug("Had to grow multiple times");
+      finish_growing_bos(grow);
+   }
 
-   perf_debug("Growing %s - ran out of space\n", old_bo->name);
+   struct brw_bo *new_bo = brw_bo_alloc(bufmgr, bo->name, new_size, bo->align);
 
-   /* Copy existing data to the new larger buffer */
-   if (*cpu_map_ptr) {
-      *cpu_map_ptr = new_map = realloc(*cpu_map_ptr, new_size);
+   if (grow->cpu_map) {
+      grow->cpu_map = realloc(grow->cpu_map, new_size);
    } else {
-      new_map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
-      memcpy(new_map, old_map, existing_bytes);
+      grow->map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
    }
 
    /* Try to put the new BO at the same GTT offset as the old BO (which
@@ -326,21 +354,18 @@ grow_buffer(struct brw_context *brw,
     *
     * Also preserve kflags for EXEC_OBJECT_CAPTURE.
     */
-   new_bo->gtt_offset = old_bo->gtt_offset;
-   new_bo->index = old_bo->index;
-   new_bo->kflags = old_bo->kflags;
+   new_bo->gtt_offset = bo->gtt_offset;
+   new_bo->index = bo->index;
+   new_bo->kflags = bo->kflags;
 
    /* Batch/state buffers are per-context, and if we've run out of space,
     * we must have actually used them before, so...they will be in the list.
     */
-   assert(old_bo->index < batch->exec_count);
-   assert(batch->exec_bos[old_bo->index] == old_bo);
+   assert(bo->index < batch->exec_count);
+   assert(batch->exec_bos[bo->index] == bo);
 
    /* Update the validation list to use the new BO. */
-   batch->exec_bos[old_bo->index] = new_bo;
-   batch->validation_list[old_bo->index].handle = new_bo->gem_handle;
-   brw_bo_reference(new_bo);
-   brw_bo_unreference(old_bo);
+   batch->validation_list[bo->index].handle = new_bo->gem_handle;
 
    if (!batch->use_batch_first) {
       /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
@@ -349,16 +374,62 @@ grow_buffer(struct brw_context *brw,
        * list, which remains unchanged, so we can skip this.)
        */
       replace_bo_in_reloc_list(&batch->batch_relocs,
-                               old_bo->gem_handle, new_bo->gem_handle);
+                               bo->gem_handle, new_bo->gem_handle);
       replace_bo_in_reloc_list(&batch->state_relocs,
-                               old_bo->gem_handle, new_bo->gem_handle);
+                               bo->gem_handle, new_bo->gem_handle);
    }
 
-   /* Drop the *bo_ptr reference.  This should free the old BO. */
-   brw_bo_unreference(old_bo);
+   /* Exchange the two BOs...without breaking pointers to the old BO.
+    *
+    * Consider this scenario:
+    *
+    * 1. Somebody calls brw_state_batch() to get a region of memory, and
+    *    and then creates a brw_address pointing to brw->batch.state.bo.
+    * 2. They then call brw_state_batch() a second time, which happens to
+    *    grow and replace the state buffer.  They then try to emit a
+    *    relocation to their first section of memory.
+    *
+    * If we replace the brw->batch.state.bo pointer at step 2, we would
+    * break the address created in step 1.  They'd have a pointer to the
+    * old destroyed BO.  Emitting a relocation would add this dead BO to
+    * the validation list...causing /both/ statebuffers to be in the list,
+    * and all kinds of disasters.
+    *
+    * This is not a contrived case - BLORP vertex data upload hits this.
+    *
+    * There are worse scenarios too.  Fences for GL sync objects reference
+    * brw->batch.batch.bo.  If we replaced the batch pointer when growing,
+    * we'd need to chase down every fence and update it to point to the
+    * new BO.  Otherwise, it would refer to a "batch" that never actually
+    * gets submitted, and would fail to trigger.
+    *
+    * To work around both of these issues, we transmutate the buffers in
+    * place, making the existing struct brw_bo represent the new buffer,
+    * and "new_bo" represent the old BO.  This is highly unusual, but it
+    * seems like a necessary evil.
+    *
+    * We also defer the memcpy of the existing batch's contents.  Callers
+    * may make multiple brw_state_batch calls, and retain pointers to the
+    * old BO's map.  We'll perform the memcpy in finish_growing_bo() when
+    * we finally submit the batch, at which point we've finished uploading
+    * state, and nobody should have any old references anymore.
+    *
+    * To do that, we keep a reference to the old BO in grow->partial_bo,
+    * and store the number of bytes to copy in grow->partial_bytes.  We
+    * can monkey with the refcounts directly without atomics because these
+    * are per-context BOs and they can only be touched by this thread.
+    */
+   assert(new_bo->refcount == 1);
+   new_bo->refcount = bo->refcount;
+   bo->refcount = 1;
+
+   struct brw_bo tmp;
+   memcpy(&tmp, bo, sizeof(struct brw_bo));
+   memcpy(bo, new_bo, sizeof(struct brw_bo));
+   memcpy(new_bo, &tmp, sizeof(struct brw_bo));
 
-   *bo_ptr = new_bo;
-   *map_ptr = new_map;
+   grow->partial_bo = new_bo; /* the one reference of the OLD bo */
+   grow->partial_bytes = existing_bytes;
 }
 
 void
@@ -382,8 +453,7 @@ intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
          const unsigned new_size =
             MIN2(batch->batch.bo->size + batch->batch.bo->size / 2,
                  MAX_BATCH_SIZE);
-         grow_buffer(brw, &batch->batch.bo, &batch->batch.map,
-                     &batch->batch.cpu_map, batch_used, new_size);
+         grow_buffer(brw, &batch->batch, batch_used, new_size);
          batch->map_next = (void *) batch->batch.map + batch_used;
          assert(batch_used + sz < batch->batch.bo->size);
       }
@@ -926,6 +996,9 @@ _intel_batchbuffer_flush_fence(struct brw_context *brw,
    brw_finish_batch(brw);
    intel_upload_finish(brw);
 
+   finish_growing_bos(&brw->batch.batch);
+   finish_growing_bos(&brw->batch.state);
+
    if (brw->throttle_batch[0] == NULL) {
       brw->throttle_batch[0] = brw->batch.batch.bo;
       brw_bo_reference(brw->throttle_batch[0]);
@@ -1085,8 +1158,7 @@ brw_state_batch(struct brw_context *brw,
          const unsigned new_size =
             MIN2(batch->state.bo->size + batch->state.bo->size / 2,
                  MAX_STATE_SIZE);
-         grow_buffer(brw, &batch->state.bo, &batch->state.map,
-                     &batch->state.cpu_map, batch->state_used, new_size);
+         grow_buffer(brw, &batch->state, batch->state_used, new_size);
          assert(offset + size < batch->state.bo->size);
       }
    }
-- 
2.15.0