[Mesa-dev] [PATCH 08/10] i965: Convert reloc.target_handle into an index for I915_EXEC_HANDLE_LUT

Chris Wilson chris at chris-wilson.co.uk
Fri Jul 21 15:36:50 UTC 2017


Passing the index of the target buffer via reloc.target_handle is
marginally more efficient for the kernel: it can avoid some allocations
and use a direct lookup rather than a hash or search. It is also
useful for us, as we can reuse the index into our exec_bos array for
other tasks.
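
For reference, the core of the scheme condenses into a standalone sketch
(not part of the patch; the helper name and parameters are invented for
illustration, and it assumes an i915_drm.h new enough to define
I915_EXEC_BATCH_FIRST and I915_EXEC_HANDLE_LUT):

#include <stdbool.h>
#include <stdint.h>
#include <i915_drm.h>   /* libdrm uAPI header (assumed recent enough) */

/* Hypothetical helper: fill one relocation plus the execbuf request,
 * using the LUT index when available and the GEM handle otherwise.
 * Field and variable names follow the patch below.
 */
static void
fill_reloc_and_execbuf(struct drm_i915_gem_relocation_entry *reloc,
                       struct drm_i915_gem_execbuffer2 *execbuf,
                       struct drm_i915_gem_exec_object2 *validation_list,
                       unsigned int exec_count,
                       unsigned int index,       /* slot of the target bo */
                       uint32_t gem_handle,      /* its GEM handle */
                       uint32_t batch_offset,
                       uint32_t target_offset,
                       bool use_exec_lut)
{
   *reloc = (struct drm_i915_gem_relocation_entry) {
      .offset = batch_offset,     /* where the pointer lives in the batch */
      .delta = target_offset,     /* offset inside the target bo */
      /* LUT mode: index into the validation list; legacy mode: GEM handle */
      .target_handle = use_exec_lut ? index : gem_handle,
      .presumed_offset = validation_list[index].offset,
   };

   *execbuf = (struct drm_i915_gem_execbuffer2) {
      .buffers_ptr = (uintptr_t) validation_list,
      .buffer_count = exec_count,
      /* BATCH_FIRST keeps the batch at index 0, so its relocations need
       * no post-processing fixup loop once LUT indices are in use.
       */
      .flags = use_exec_lut ? (I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT)
                            : 0,
   };
}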

v2: Only enable HANDLE_LUT if we can use BATCH_FIRST and thereby avoid
a post-processing loop to fixup the relocations.
v3: Move kernel probing from context creation to screen init.
Use batch->use_exec_lut as it is more descriptive of what's going on (Daniel)
v4: Kernel features already exists, use it for BATCH_FIRST
Rename locals to preserve current flavouring
v5: Squash in "always insert batch bo first"

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Kenneth Graunke <kenneth at whitecape.org>
Cc: Matt Turner <mattst88 at gmail.com>
Cc: Jason Ekstrand <jason.ekstrand at intel.com>
Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org> #v4
---
 src/mesa/drivers/dri/i965/brw_context.h       |   1 +
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 109 ++++++++++++++------------
 src/mesa/drivers/dri/i965/intel_screen.c      |   4 +
 src/mesa/drivers/dri/i965/intel_screen.h      |   1 +
 4 files changed, 65 insertions(+), 50 deletions(-)
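
As background (again, not part of the patch): the new intel_get_boolean()
check in intel_screen.c boils down to the usual GETPARAM probe, roughly as
sketched below; kernels predating the parameter reject it, so the feature
bit simply stays clear there. The sketch assumes libdrm's xf86drm.h and an
i915_drm.h that defines I915_PARAM_HAS_EXEC_BATCH_FIRST.

#include <stdbool.h>
#include <i915_drm.h>   /* I915_PARAM_*, DRM_IOCTL_I915_GETPARAM */
#include <xf86drm.h>    /* drmIoctl() */

/* Rough standalone equivalent of intel_get_boolean(screen, param). */
static bool
kernel_has_exec_batch_first(int fd)
{
   int value = 0;
   struct drm_i915_getparam gp = {
      .param = I915_PARAM_HAS_EXEC_BATCH_FIRST,
      .value = &value,
   };

   /* An error from old kernels reads as "not supported". */
   if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) != 0)
      return false;

   return value != 0;
}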

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2acebaa820..57081fb434 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -452,6 +452,7 @@ struct intel_batchbuffer {
 
    uint32_t state_batch_offset;
    enum brw_gpu_ring ring;
+   bool use_exec_lut;
    bool needs_sol_reset;
    bool state_base_address_emitted;
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index 5e861a555a..16791de3de 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -62,8 +62,6 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
                        struct brw_bufmgr *bufmgr,
                        bool has_llc)
 {
-   intel_batchbuffer_reset(batch, bufmgr, has_llc);
-
    if (!has_llc) {
       batch->cpu_map = malloc(BATCH_SZ);
       batch->map = batch->cpu_map;
@@ -85,6 +83,17 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
       batch->state_batch_sizes =
          _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
    }
+
+   struct brw_context *brw = container_of(batch, brw, batch);
+   /* To use the LUT method for execbuf, we also require placing the batch
+    * first (to simplify our implementation). We require a kernel recent
+    * enough to always support EXEC_HANDLE_LUT, but we must check that
+    * the kernel supports EXEC_BATCH_FIRST.
+    */
+   batch->use_exec_lut =
+      brw->screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;
+
+   intel_batchbuffer_reset(batch, bufmgr, has_llc);
 }
 
 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
@@ -92,19 +101,15 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch,
 static unsigned int
 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
 {
-   if (bo != batch->bo) {
-      unsigned int index = READ_ONCE(bo->index);
-
-      if (index < batch->exec_count && batch->exec_bos[index] == bo)
-         return index;
+   unsigned int index = READ_ONCE(bo->index);
 
-      /* May have been shared between multiple active batches */
-      for (index = 0; index < batch->exec_count; index++) {
-         if (batch->exec_bos[index] == bo)
-            return index;
-      }
+   if (index < batch->exec_count && batch->exec_bos[index] == bo)
+      return index;
 
-      brw_bo_reference(bo);
+   /* May have been shared between multiple active batches */
+   for (index = 0; index < batch->exec_count; index++) {
+      if (batch->exec_bos[index] == bo)
+         return index;
    }
 
    if (batch->exec_count == batch->exec_array_size) {
@@ -117,26 +122,21 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
                  batch->exec_array_size * sizeof(batch->validation_list[0]));
    }
 
-   struct drm_i915_gem_exec_object2 *validation_entry =
-      &batch->validation_list[batch->exec_count];
-   validation_entry->handle = bo->gem_handle;
-   if (bo == batch->bo) {
-      validation_entry->relocation_count = batch->reloc_count;
-      validation_entry->relocs_ptr = (uintptr_t) batch->relocs;
-   } else {
-      validation_entry->relocation_count = 0;
-      validation_entry->relocs_ptr = 0;
-   }
-   validation_entry->alignment = bo->align;
-   validation_entry->offset = bo->offset64;
-   validation_entry->flags = bo->kflags;
-   validation_entry->rsvd1 = 0;
-   validation_entry->rsvd2 = 0;
+   batch->validation_list[batch->exec_count] =
+      (struct drm_i915_gem_exec_object2) {
+         .handle = bo->gem_handle,
+         .alignment = bo->align,
+         .offset = bo->offset64,
+         .flags = bo->kflags,
+      };
 
    bo->index = batch->exec_count;
    batch->exec_bos[batch->exec_count] = bo;
    batch->aperture_space += bo->size;
 
+   if (bo != batch->bo)
+      brw_bo_reference(bo);
+
    return batch->exec_count++;
 }
 
@@ -157,6 +157,9 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
    }
    batch->map_next = batch->map;
 
+   add_exec_bo(batch, batch->bo);
+   assert(batch->bo->index == 0);
+
    batch->reserved_space = BATCH_RESERVED;
    batch->state_batch_offset = batch->bo->size;
    batch->needs_sol_reset = false;
@@ -581,9 +584,6 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
    }
 
    if (ret == 0) {
-      /* Add the batch itself to the end of the validation list */
-      add_exec_bo(batch, batch->bo);
-
       struct drm_i915_gem_execbuffer2 execbuf = {
          .buffers_ptr = (uintptr_t) batch->validation_list,
          .buffer_count = batch->exec_count,
@@ -614,6 +614,22 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
       if (batch->needs_sol_reset)
          execbuf.flags |= I915_EXEC_GEN7_SOL_RESET;
 
+      struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
+      assert(entry->handle == batch->bo->gem_handle);
+      entry->relocation_count = batch->reloc_count;
+      entry->relocs_ptr = (uintptr_t) batch->relocs;
+
+      if (batch->use_exec_lut) {
+         execbuf.flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
+      } else {
+         struct drm_i915_gem_exec_object2 tmp;
+         unsigned int index = batch->exec_count - 1;
+
+         tmp = *entry;
+         *entry = batch->validation_list[index];
+         batch->validation_list[index] = tmp;
+      }
+
       unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
 
       if (in_fence_fd != -1) {
@@ -777,40 +793,33 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
    assert(batch_offset <= BATCH_SZ - sizeof(uint32_t));
    assert(_mesa_bitcount(write_domain) <= 1);
 
-   uint64_t offset64;
-   if (target != batch->bo) {
-      unsigned int index = add_exec_bo(batch, target);
-      struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
+   unsigned int index = add_exec_bo(batch, target);
+   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
 
-      if (write_domain) {
-         entry->flags |= EXEC_OBJECT_WRITE;
+   if (write_domain) {
+      entry->flags |= EXEC_OBJECT_WRITE;
 
-         /* PIPECONTROL needs a w/a on gen6 */
-         if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
-            struct brw_context *brw = container_of(batch, brw, batch);
-            if (brw->gen == 6)
-               entry->flags |= EXEC_OBJECT_NEEDS_GTT;
-         }
+      /* PIPECONTROL needs a w/a on gen6 */
+      if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) {
+         struct brw_context *brw = container_of(batch, brw, batch);
+         if (brw->gen == 6)
+            entry->flags |= EXEC_OBJECT_NEEDS_GTT;
       }
-
-      offset64 = entry->offset;
-   } else {
-      offset64 = target->offset64;
    }
 
    batch->relocs[batch->reloc_count++] =
       (struct drm_i915_gem_relocation_entry) {
          .offset = batch_offset,
          .delta = target_offset,
-         .target_handle = target->gem_handle,
-         .presumed_offset = offset64,
+         .target_handle = batch->use_exec_lut ? index : target->gem_handle,
+         .presumed_offset = entry->offset,
       };
 
    /* Using the old buffer offset, write in what the right data would be, in
     * case the buffer doesn't move and we can short-circuit the relocation
     * processing in the kernel
     */
-   return offset64 + target_offset;
+   return target_offset + entry->offset;
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 44ea6a4562..109ba79b67 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -2278,6 +2278,10 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen)
       screen->kernel_features |= KERNEL_ALLOWS_EXEC_CAPTURE;
    }
 
+   if (intel_get_boolean(screen, I915_PARAM_HAS_EXEC_BATCH_FIRST)) {
+      screen->kernel_features |= KERNEL_ALLOWS_EXEC_BATCH_FIRST;
+   }
+
    if (!intel_detect_pipelined_so(screen)) {
       /* We can't do anything, so the effective version is 0. */
       screen->cmd_parser_version = 0;
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index 0980c8f561..577058dc15 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -80,6 +80,7 @@ struct intel_screen
 #define KERNEL_ALLOWS_HSW_SCRATCH1_AND_ROW_CHICKEN3 (1<<3)
 #define KERNEL_ALLOWS_COMPUTE_DISPATCH              (1<<4)
 #define KERNEL_ALLOWS_EXEC_CAPTURE                  (1<<5)
+#define KERNEL_ALLOWS_EXEC_BATCH_FIRST              (1<<6)
 
    struct brw_bufmgr *bufmgr;
 
-- 
2.13.3


