[Mesa-dev] [PATCH 12/13] i965: Per-context bo tracking for relocations

Chris Wilson chris at chris-wilson.co.uk
Wed Jul 19 10:09:20 UTC 2017


The kernel creates a unique binding for each instance of a GEM handle in
the per-process GTT. Keeping a single bo->offset64 shared by multiple
contexts will therefore cause a lot of migration and relocation stalls
when bos are reused between contexts. Not a common problem, but when it
does occur it can be devastating to performance (and prevents a single
client from scaling over multiple contexts).

The solution is simply to keep the offset generated by the execbuf local
to each context. Since we only add relocations into the batch, they are
transient and there is no advantage to sharing relocation offsets
between contexts (the batches are not reused and are rewritten from
scratch with fresh relocations each time).
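
A minimal sketch of the per-context bookkeeping this amounts to
(mirroring struct brw_exec_bo as introduced in intel_batchbuffer.c
below):

   struct brw_exec_bo {
      struct brw_bo *bo;  /* the shared, bufmgr-global object */
      struct drm_i915_gem_exec_object2 *exec; /* slot in this batch */
      uint64_t offset;    /* presumed GTT address, local to this context */
   };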

The kernel uses a dense idr to allocate its handles, so we can
reasonably expect a dense array of GEM handles used by a client within a
context. For multi-context clients, or if the kernel should ever
randomize its handles, a hashtable lookup would make more sense for the
sparser array. At present, however, we can use a direct mapping from the
GEM handle to the exec_object used by the batch and the global brw_bo.
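
As a sketch of the resulting O(1) lookup (lookup_exec_bo is a
hypothetical helper for illustration; brw_batch_references and
add_exec_bo below open-code the same bounds test):

   static struct brw_exec_bo *
   lookup_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
   {
      if (bo->gem_handle >= batch->max_exec_handle)
         return NULL; /* handle past the array; never seen here */
      /* slot->exec == NULL means not in the current batch */
      return &batch->exec_bos[bo->gem_handle];
   }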

Cc: Kenneth Graunke <kenneth at whitecape.org>
Cc: Matt Turner <mattst88 at gmail.com>
Cc: Jason Ekstrand <jason.ekstrand at intel.com>
Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c        |   2 -
 src/mesa/drivers/dri/i965/brw_bufmgr.h        |  14 ---
 src/mesa/drivers/dri/i965/brw_context.h       |   5 +-
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 160 +++++++++++++++-----------
 4 files changed, 95 insertions(+), 86 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 9723124e8d..8dd63f234a 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -388,7 +388,6 @@ retry:
    p_atomic_set(&bo->refcount, 1);
    bo->reusable = true;
    bo->cache_coherent = bufmgr->has_llc;
-   bo->index = -1;
 
    pthread_mutex_unlock(&bufmgr->lock);
 
@@ -513,7 +512,6 @@ brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr,
    p_atomic_set(&bo->refcount, 1);
 
    bo->size = open_arg.size;
-   bo->offset64 = 0;
    bo->bufmgr = bufmgr;
    bo->gem_handle = open_arg.handle;
    bo->name = name;
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 5310e32ac3..5f17023639 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -69,20 +69,6 @@ struct brw_bo {
    uint32_t gem_handle;
 
    /**
-    * Last seen card virtual address (offset from the beginning of the
-    * aperture) for the object.  This should be used to fill relocation
-    * entries when calling brw_bo_emit_reloc()
-    */
-   uint64_t offset64;
-
-   /**
-    * Index of this buffer inside the batch, -1 when not in a batch. Note
-    * that a single buffer may be in multiple batches (contexts), the index
-    * only refers to its last use and should not be trusted!
-    */
-   unsigned int index;
-
-   /**
     * Boolean of whether the GPU is definitely not accessing the buffer.
     *
     * This is only valid when reusable, since non-reusable
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 62ce5e472c..327202edad 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -435,6 +435,8 @@ enum brw_gpu_ring {
    BLT_RING,
 };
 
+struct brw_exec_bo;
+
 struct intel_batchbuffer {
    /** Current batchbuffer being queued up. */
    struct brw_bo *bo;
@@ -461,7 +463,8 @@ struct intel_batchbuffer {
    int reloc_array_size;
    /** The validation list */
    struct drm_i915_gem_exec_object2 *exec_objects;
-   struct brw_bo **exec_bos;
+   struct brw_exec_bo *exec_bos;
+   int max_exec_handle;
    int exec_count;
    int exec_array_size;
    /** The amount of aperture space (in bytes) used by all exec_bos */
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 933812810a..89a2bd16f1 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -40,6 +40,18 @@
 
 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
 
+struct brw_exec_bo {
+   struct brw_bo *bo;
+   struct drm_i915_gem_exec_object2 *exec;
+
+   /**
+    * Last seen card virtual address (offset from the beginning of the
+    * aperture) for the object in this context/batch.  This should be used to
+    * fill relocation entries when calling brw_bo_emit_reloc()
+    */
+   uint64_t offset;
+};
+
 static void
 intel_batchbuffer_reset(struct intel_batchbuffer *batch,
                         struct brw_bufmgr *bufmgr,
@@ -74,10 +86,9 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
                           sizeof(struct drm_i915_gem_relocation_entry));
    batch->exec_count = 0;
    batch->exec_array_size = 100;
-   batch->exec_bos =
-      malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
    batch->exec_objects =
       malloc(batch->exec_array_size * sizeof(batch->exec_objects[0]));
+   batch->max_exec_handle = 0;
 
    if (INTEL_DEBUG & DEBUG_BATCH) {
       batch->state_batch_sizes =
@@ -95,48 +106,56 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
    intel_batchbuffer_reset(batch, bufmgr, has_llc);
 }
 
-#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-
-static unsigned int
+static struct drm_i915_gem_exec_object2 *
 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
 {
-   if (bo != batch->bo) {
-      unsigned int index = READ_ONCE(bo->index);
+   if (unlikely(bo->gem_handle >= batch->max_exec_handle)) {
+      unsigned int max = ALIGN(bo->gem_handle + 1, 64);
+      struct brw_exec_bo *bos;
 
-      if (index < batch->exec_count && batch->exec_bos[index] == bo)
-         return index;
+      bos = realloc(batch->exec_bos, max * sizeof(batch->exec_bos[0]));
+      assert(bos);
+      memset(bos + batch->max_exec_handle, 0,
+             (max - batch->max_exec_handle) * sizeof(batch->exec_bos[0]));
 
-      /* May have been shared between multiple active batches */
-      for (index = 0; index < batch->exec_count; index++) {
-         if (batch->exec_bos[index] == bo)
-            return index;
-      }
+      batch->exec_bos = bos;
+      batch->max_exec_handle = max;
+   }
 
-      brw_bo_reference(bo);
+   struct brw_exec_bo *exec_bo = &batch->exec_bos[bo->gem_handle];
+   if (exec_bo->exec) {
+      assert(exec_bo->bo == bo);
+      return exec_bo->exec;
    }
 
-   if (batch->exec_count == batch->exec_array_size) {
+   if (unlikely(batch->exec_count == batch->exec_array_size)) {
       batch->exec_array_size *= 2;
-      batch->exec_bos =
-         realloc(batch->exec_bos,
-                 batch->exec_array_size * sizeof(batch->exec_bos[0]));
       batch->exec_objects =
          realloc(batch->exec_objects,
                  batch->exec_array_size * sizeof(batch->exec_objects[0]));
+
+      for (int i = 0; i < batch->exec_count; i++) {
+         struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[i];
+         assert(batch->exec_bos[exec->handle].exec);
+         batch->exec_bos[exec->handle].exec = exec;
+      }
    }
 
    struct drm_i915_gem_exec_object2 *exec =
-      memset(&batch->exec_objects[batch->exec_count], 0, sizeof(*exec));
+      memset(&batch->exec_objects[batch->exec_count++], 0, sizeof(*exec));
    exec->handle = bo->gem_handle;
    exec->alignment = bo->align;
-   exec->offset = bo->offset64;
+   exec->offset = exec_bo->offset;
    exec->flags = bo->kflags;
 
-   bo->index = batch->exec_count;
-   batch->exec_bos[batch->exec_count] = bo;
    batch->aperture_space += bo->size;
+   exec_bo->exec = exec;
+   exec_bo->bo = bo;
+
+   if (bo != batch->bo)
+      brw_bo_reference(bo);
 
-   return batch->exec_count++;
+   return exec;
 }
 
 static void
@@ -157,7 +176,6 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
    batch->map_next = batch->map;
 
    add_exec_bo(batch, batch->bo);
-   assert(batch->bo->index == 0);
 
    batch->reserved_space = BATCH_RESERVED;
    batch->state_batch_offset = batch->bo->size;
@@ -191,18 +209,24 @@ intel_batchbuffer_save_state(struct brw_context *brw)
 void
 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 {
+   struct intel_batchbuffer *batch = &brw->batch;
+
    for (int i = brw->batch.saved.exec_count;
         i < brw->batch.exec_count; i++) {
-      if (brw->batch.exec_bos[i] != brw->batch.bo) {
-         brw_bo_unreference(brw->batch.exec_bos[i]);
-      }
+      struct drm_i915_gem_exec_object2 *exec_object = &batch->exec_objects[i];
+      struct brw_exec_bo *exec_bo = &batch->exec_bos[exec_object->handle];
+
+      assert(exec_bo->exec == exec_object);
+      exec_bo->exec = NULL;
+      if (exec_bo->bo != batch->bo)
+         brw_bo_unreference(exec_bo->bo);
    }
-   brw->batch.reloc_count = brw->batch.saved.reloc_count;
-   brw->batch.exec_count = brw->batch.saved.exec_count;
+   batch->reloc_count = batch->saved.reloc_count;
+   batch->exec_count = batch->saved.exec_count;
 
-   brw->batch.map_next = brw->batch.saved.map_next;
-   if (USED_BATCH(brw->batch) == 0)
-      brw->batch.ring = UNKNOWN_RING;
+   batch->map_next = batch->saved.map_next;
+   if (USED_BATCH(*batch) == 0)
+      batch->ring = UNKNOWN_RING;
 }
 
 void
@@ -287,7 +311,7 @@ decode_structs(struct brw_context *brw, struct gen_spec *spec,
 }
 
 static void
-do_batch_dump(struct brw_context *brw)
+do_batch_dump(struct brw_context *brw, uint64_t gtt_offset)
 {
    struct intel_batchbuffer *batch = &brw->batch;
    struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);
@@ -304,7 +328,6 @@ do_batch_dump(struct brw_context *brw)
 
    uint32_t *data = map ? map : batch->map;
    uint32_t *end = data + USED_BATCH(*batch);
-   uint32_t gtt_offset = map ? batch->bo->offset64 : 0;
    int length;
 
    bool color = INTEL_DEBUG & DEBUG_COLOR;
@@ -430,7 +453,7 @@ do_batch_dump(struct brw_context *brw)
    }
 }
 #else
-static void do_batch_dump(struct brw_context *brw) { }
+static void do_batch_dump(struct brw_context *brw, uint64_t offset) { }
 #endif
 
 /**
@@ -605,22 +628,24 @@ execbuffer(struct brw_context *brw,
    }
 
    for (int i = 0; i < batch->exec_count; i++) {
-      struct brw_bo *bo = batch->exec_bos[i];
+      struct drm_i915_gem_exec_object2 *exec_object = &batch->exec_objects[i];
+      struct brw_exec_bo *exec_bo = &batch->exec_bos[exec_object->handle];
 
-      bo->idle = false;
-      bo->index = -1;
+      assert(exec_bo->exec == exec_object);
+      exec_bo->exec = NULL;
 
-      /* Update brw_bo::offset64 */
-      if (batch->exec_objects[i].offset != bo->offset64) {
+      if (exec_object->offset != exec_bo->offset) {
          DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
-             bo->gem_handle, bo->offset64, batch->exec_objects[i].offset);
-         bo->offset64 = batch->exec_objects[i].offset;
+             exec_object->handle, exec_bo->offset, exec_object->offset);
+         exec_bo->offset = exec_object->offset;
       }
 
-      if (batch->exec_bos[i] != batch->bo) {
-         brw_bo_unreference(batch->exec_bos[i]);
-      }
-      batch->exec_bos[i] = NULL;
+      struct brw_bo *bo = exec_bo->bo;
+      assert(bo->gem_handle == exec_object->handle);
+
+      bo->idle = false;
+      if (bo != batch->bo)
+         brw_bo_unreference(bo);
    }
 
    if (ret == 0 && out_fence != NULL)
@@ -680,8 +705,12 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
    } else {
       struct drm_i915_gem_exec_object2 tmp = *exec;
       unsigned int index = batch->exec_count - 1;
+
       *exec = batch->exec_objects[index];
       batch->exec_objects[index] = tmp;
+
+      batch->exec_bos[tmp.handle].exec = &batch->exec_objects[index];
+      batch->exec_bos[exec->handle].exec = &batch->exec_objects[0];
    }
 
    if (ret == 0) {
@@ -691,7 +720,7 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
    throttle(brw);
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
-      do_batch_dump(brw);
+      do_batch_dump(brw, exec->offset);
 
    if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
       brw_check_for_reset(brw);
@@ -776,15 +805,10 @@ brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
 bool
 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
 {
-   unsigned int index = READ_ONCE(bo->index);
-   if (index < batch->exec_count && batch->exec_bos[index] == bo)
-      return true;
+   if (bo->gem_handle >= batch->max_exec_handle)
+      return false;
 
-   for (int i = 0; i < batch->exec_count; i++) {
-      if (batch->exec_bos[i] == bo)
-         return true;
-   }
-   return false;
+   return batch->exec_bos[bo->gem_handle].exec;
 }
 
 /*  This is the only way buffers get added to the validate list.
@@ -807,16 +831,7 @@ __brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
    assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
    assert(_mesa_bitcount(write_domain) <= 1);
 
-   struct drm_i915_gem_relocation_entry *reloc =
-      &batch->relocs[batch->reloc_count];
-
-   reloc->offset = batch_offset;
-   reloc->delta = target_offset;
-   reloc->read_domains = read_domains;
-   reloc->write_domain = write_domain;
-
-   unsigned int index = add_exec_bo(batch, target);
-   struct drm_i915_gem_exec_object2 *exec = &batch->exec_objects[index];
+   struct drm_i915_gem_exec_object2 *exec = add_exec_bo(batch, target);
 
    if (write_domain) {
       exec->flags |= EXEC_OBJECT_WRITE;
@@ -829,12 +844,19 @@ __brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
       }
    }
 
-   reloc->target_handle = batch->use_exec_lut ? index : target->gem_handle;
+   struct drm_i915_gem_relocation_entry *reloc =
+      &batch->relocs[batch->reloc_count++];
+
+   reloc->offset = batch_offset;
+   reloc->delta = target_offset;
+   reloc->read_domains = read_domains;
+   reloc->write_domain = write_domain;
+
+   reloc->target_handle =
+      batch->use_exec_lut ? exec - batch->exec_objects : target->gem_handle;
    reloc->presumed_offset = exec->offset;
    target_offset += exec->offset;
 
-   batch->reloc_count++;
-
    /* Using the old buffer offset, write in what the right data would be, in
     * case the buffer doesn't move and we can short-circuit the relocation
     * processing in the kernel
-- 
2.13.3
