[Mesa-dev] [PATCH 09/10] i965: Per-context bo tracking for relocations
Chris Wilson
chris at chris-wilson.co.uk
Fri Jul 21 15:36:51 UTC 2017
The kernel creates a unique binding for each instance of a GEM handle in
the per-process GTT. Keeping a single bo->offset64 used by multiple
contexts will therefore cause a lot of migration and relocation stalls
when the bo are reused between contexts. Not a common problem, but when
it does occur it can be devastating to performance (and prevents
scaling of a single client over multiple contexts).
The solution is simply to keep the offset generated by the execbuf local to
each context. Since we only add relocations into the batch, they are
transient and there is no advantage to any sharing of relocation offsets
between contexts (as the batches are not reused and are rewritten
from scratch with fresh relocations each time).
The kernel uses a dense idr to allocate its handles, and so we can
reasonably expect a dense array of GEM handles used by a client within a
context. Though for multi-context clients, and if the kernel should
randomize the handles, then using a hashtable for the lookup would make
more sense for the sparser array. However, at present, we can use a direct
array to store the mapping between the exec_object used by the batch and
the global brw_bo.
Cc: Kenneth Graunke <kenneth at whitecape.org>
Cc: Matt Turner <mattst88 at gmail.com>
Cc: Jason Ekstrand <jason.ekstrand at intel.com>
Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
---
src/mesa/drivers/dri/i965/brw_bufmgr.c | 2 -
src/mesa/drivers/dri/i965/brw_bufmgr.h | 17 ---
src/mesa/drivers/dri/i965/brw_context.h | 5 +-
src/mesa/drivers/dri/i965/intel_batchbuffer.c | 150 +++++++++++++++-----------
4 files changed, 93 insertions(+), 81 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 9723124e8d..8dd63f234a 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -388,7 +388,6 @@ retry:
p_atomic_set(&bo->refcount, 1);
bo->reusable = true;
bo->cache_coherent = bufmgr->has_llc;
- bo->index = -1;
pthread_mutex_unlock(&bufmgr->lock);
@@ -513,7 +512,6 @@ brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr,
p_atomic_set(&bo->refcount, 1);
bo->size = open_arg.size;
- bo->offset64 = 0;
bo->bufmgr = bufmgr;
bo->gem_handle = open_arg.handle;
bo->name = name;
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 083009b52e..5f17023639 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -69,23 +69,6 @@ struct brw_bo {
uint32_t gem_handle;
/**
- * Last seen card virtual address (offset from the beginning of the
- * aperture) for the object. This should be used to fill relocation
- * entries when calling brw_bo_emit_reloc()
- */
- uint64_t offset64;
-
- /**
- * The validation list index for this buffer, or -1 when not in a batch.
- * Note that a single buffer may be in multiple batches (contexts), and
- * this is a global field, which refers to the last batch using the BO.
- * It should not be considered authoritative, but can be used to avoid a
- * linear walk of the validation list in the common case by guessing that
- * exec_bos[bo->index] == bo and confirming whether that's the case.
- */
- unsigned int index;
-
- /**
* Boolean of whether the GPU is definitely not accessing the buffer.
*
* This is only valid when reusable, since non-reusable
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 57081fb434..b9cf6c01a7 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -435,6 +435,8 @@ enum brw_gpu_ring {
BLT_RING,
};
+struct brw_exec_bo;
+
struct intel_batchbuffer {
/** Current batchbuffer being queued up. */
struct brw_bo *bo;
@@ -462,7 +464,8 @@ struct intel_batchbuffer {
/** The validation list */
struct drm_i915_gem_exec_object2 *validation_list;
- struct brw_bo **exec_bos;
+ struct brw_exec_bo *exec_bos;
+ int max_exec_handle;
int exec_count;
int exec_array_size;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 16791de3de..b85038759d 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -40,6 +40,18 @@
#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+struct brw_exec_bo {
+ struct brw_bo *bo;
+ struct drm_i915_gem_exec_object2 *entry;
+
+ /**
+ * Last seen card virtual address (offset from the beginning of the
+ * aperture) for the object in this context/batch. This should be used to
+ * fill relocation entries when calling brw_bo_emit_reloc()
+ */
+ uint64_t offset;
+};
+
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
struct brw_bufmgr *bufmgr,
@@ -72,10 +84,10 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
batch->reloc_array_size = 250;
batch->relocs = malloc(batch->reloc_array_size *
sizeof(struct drm_i915_gem_relocation_entry));
+
+ batch->max_exec_handle = 0;
batch->exec_count = 0;
batch->exec_array_size = 100;
- batch->exec_bos =
- malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
batch->validation_list =
malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
@@ -96,48 +108,59 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
intel_batchbuffer_reset(batch, bufmgr, has_llc);
}
-#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-
-static unsigned int
+static struct drm_i915_gem_exec_object2 *
add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
- unsigned int index = READ_ONCE(bo->index);
+ if (unlikely(bo->gem_handle >= batch->max_exec_handle)) {
+ unsigned int max = ALIGN(bo->gem_handle + 1, 64);
+ struct brw_exec_bo *bos;
+
+ bos = realloc(batch->exec_bos, max * sizeof(batch->exec_bos[0]));
+ assert(bos);
+ memset(bos + batch->max_exec_handle, 0,
+ (max - batch->max_exec_handle) * sizeof(batch->exec_bos[0]));
- if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return index;
+ batch->exec_bos = bos;
+ batch->max_exec_handle = max;
+ }
- /* May have been shared between multiple active batches */
- for (index = 0; index < batch->exec_count; index++) {
- if (batch->exec_bos[index] == bo)
- return index;
+ struct brw_exec_bo *exec_bo = &batch->exec_bos[bo->gem_handle];
+ if (exec_bo->entry) {
+ assert(exec_bo->bo == bo);
+ return exec_bo->entry;
}
- if (batch->exec_count == batch->exec_array_size) {
+ if (unlikely(batch->exec_count == batch->exec_array_size)) {
batch->exec_array_size *= 2;
- batch->exec_bos =
- realloc(batch->exec_bos,
- batch->exec_array_size * sizeof(batch->exec_bos[0]));
batch->validation_list =
realloc(batch->validation_list,
batch->exec_array_size * sizeof(batch->validation_list[0]));
+
+ for (int i = 0; i < batch->exec_count; i++) {
+ struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[i];
+ assert(batch->exec_bos[entry->handle].entry);
+ batch->exec_bos[entry->handle].entry = entry;
+ }
}
- batch->validation_list[batch->exec_count] =
- (struct drm_i915_gem_exec_object2) {
- .handle = bo->gem_handle,
- .alignment = bo->align,
- .offset = bo->offset64,
- .flags = bo->kflags,
- };
+ struct drm_i915_gem_exec_object2 *entry =
+ &batch->validation_list[batch->exec_count++];
+
+ *entry = (struct drm_i915_gem_exec_object2) {
+ .handle = bo->gem_handle,
+ .alignment = bo->align,
+ .offset = exec_bo->offset,
+ .flags = bo->kflags,
+ };
- bo->index = batch->exec_count;
- batch->exec_bos[batch->exec_count] = bo;
batch->aperture_space += bo->size;
+ exec_bo->entry = entry;
+ exec_bo->bo = bo;
if (bo != batch->bo)
brw_bo_reference(bo);
- return batch->exec_count++;
+ return entry;
}
static void
@@ -158,7 +181,6 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
batch->map_next = batch->map;
add_exec_bo(batch, batch->bo);
- assert(batch->bo->index == 0);
batch->reserved_space = BATCH_RESERVED;
batch->state_batch_offset = batch->bo->size;
@@ -192,18 +214,24 @@ intel_batchbuffer_save_state(struct brw_context *brw)
void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
+ struct intel_batchbuffer *batch = &brw->batch;
+
for (int i = brw->batch.saved.exec_count;
i < brw->batch.exec_count; i++) {
- if (brw->batch.exec_bos[i] != brw->batch.bo) {
- brw_bo_unreference(brw->batch.exec_bos[i]);
- }
+ struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[i];
+ struct brw_exec_bo *exec_bo = &batch->exec_bos[entry->handle];
+
+ assert(exec_bo->entry == entry);
+ exec_bo->entry = NULL;
+ if (exec_bo->bo != batch->bo)
+ brw_bo_unreference(exec_bo->bo);
}
- brw->batch.reloc_count = brw->batch.saved.reloc_count;
- brw->batch.exec_count = brw->batch.saved.exec_count;
+ batch->reloc_count = batch->saved.reloc_count;
+ batch->exec_count = batch->saved.exec_count;
- brw->batch.map_next = brw->batch.saved.map_next;
- if (USED_BATCH(brw->batch) == 0)
- brw->batch.ring = UNKNOWN_RING;
+ batch->map_next = batch->saved.map_next;
+ if (USED_BATCH(*batch) == 0)
+ batch->ring = UNKNOWN_RING;
}
void
@@ -288,7 +316,7 @@ decode_structs(struct brw_context *brw, struct gen_spec *spec,
}
static void
-do_batch_dump(struct brw_context *brw)
+do_batch_dump(struct brw_context *brw, uint64_t gtt_offset)
{
struct intel_batchbuffer *batch = &brw->batch;
struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);
@@ -305,7 +333,6 @@ do_batch_dump(struct brw_context *brw)
uint32_t *data = map ? map : batch->map;
uint32_t *end = data + USED_BATCH(*batch);
- uint32_t gtt_offset = map ? batch->bo->offset64 : 0;
int length;
bool color = INTEL_DEBUG & DEBUG_COLOR;
@@ -431,7 +458,7 @@ do_batch_dump(struct brw_context *brw)
}
}
#else
-static void do_batch_dump(struct brw_context *brw) { }
+static void do_batch_dump(struct brw_context *brw, uint64_t offset) { }
#endif
/**
@@ -628,6 +655,9 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
tmp = *entry;
*entry = batch->validation_list[index];
batch->validation_list[index] = tmp;
+
+ batch->exec_bos[tmp.handle].entry = &batch->validation_list[index];
+ batch->exec_bos[entry->handle].entry = &batch->validation_list[0];
}
unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
@@ -652,22 +682,24 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
}
for (int i = 0; i < batch->exec_count; i++) {
- struct brw_bo *bo = batch->exec_bos[i];
+ struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[i];
+ struct brw_exec_bo *exec_bo = &batch->exec_bos[entry->handle];
- bo->idle = false;
- bo->index = -1;
+ assert(exec_bo->entry == entry);
+ exec_bo->entry = NULL;
- /* Update brw_bo::offset64 */
- if (batch->validation_list[i].offset != bo->offset64) {
+ if (entry->offset != exec_bo->offset) {
DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
- bo->gem_handle, bo->offset64, batch->validation_list[i].offset);
- bo->offset64 = batch->validation_list[i].offset;
+ entry->handle, exec_bo->offset, entry->offset);
+ exec_bo->offset = entry->offset;
}
- if (batch->exec_bos[i] != batch->bo) {
- brw_bo_unreference(batch->exec_bos[i]);
- }
- batch->exec_bos[i] = NULL;
+ struct brw_bo *bo = exec_bo->bo;
+ assert(bo->gem_handle == entry->handle);
+
+ bo->idle = false;
+ if (bo != batch->bo)
+ brw_bo_unreference(bo);
}
if (ret == 0 && out_fence_fd != NULL)
@@ -676,7 +708,7 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
throttle(brw);
if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
- do_batch_dump(brw);
+ do_batch_dump(brw, entry->offset);
}
if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
@@ -762,15 +794,10 @@ brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
bool
brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
{
- unsigned int index = READ_ONCE(bo->index);
- if (index < batch->exec_count && batch->exec_bos[index] == bo)
- return true;
+ if (bo->gem_handle >= batch->max_exec_handle)
+ return false;
- for (int i = 0; i < batch->exec_count; i++) {
- if (batch->exec_bos[i] == bo)
- return true;
- }
- return false;
+ return batch->exec_bos[bo->gem_handle].entry;
}
/* This is the only way buffers get added to the validate list.
@@ -793,8 +820,7 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
assert(_mesa_bitcount(write_domain) <= 1);
- unsigned int index = add_exec_bo(batch, target);
- struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
+ struct drm_i915_gem_exec_object2 *entry = add_exec_bo(batch, target);
if (write_domain) {
entry->flags |= EXEC_OBJECT_WRITE;
@@ -811,7 +837,9 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
(struct drm_i915_gem_relocation_entry) {
.offset = batch_offset,
.delta = target_offset,
- .target_handle = batch->use_exec_lut ? index : target->gem_handle,
+ .target_handle = (batch->use_exec_lut ?
+ entry - batch->validation_list :
+ target->gem_handle),
.presumed_offset = entry->offset,
};
--
2.13.3
More information about the mesa-dev
mailing list