[Mesa-dev] [PATCH v2 3/3] anv: Do relocations in userspace before execbuf ioctl

Tue Nov 1 16:28:59 UTC 2016

From: Kristian Høgsberg Kristensen <kristian.h.kristensen at intel.com>

This reduces the amount of stalling that the kernel does between batches
and improves the performance of Dota 2 on a Sky Lake GT2 desktop by around
30%.

v2 (Jason Ekstrand):
 - Use canonical form addresses on gen8+ (Chris Wilson)
 - Provide a better correctness proof (Chris Wilson)

Signed-off-by: Jason Ekstrand <jason at jlekstrand.net>
---
 src/intel/vulkan/anv_device.c | 112 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 109 insertions(+), 3 deletions(-)

diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index baa767e..71ba8d8 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -1068,6 +1068,105 @@ void anv_GetDeviceQueue(
    *pQueue = anv_queue_to_handle(&device->queue);
 }
 
+/* Store address v at p: canonical 64-bit on gen8+, 32-bit on older gens. */
+static void
+write_reloc(const struct anv_device *device, void *p, uint64_t v)
+{
+   unsigned reloc_size = 0;
+   if (device->info.gen >= 8) {
+      /* From the Broadwell PRM Vol. 2a, MI_LOAD_REGISTER_MEM::MemoryAddress:
+       *
+       *    "This field specifies the address of the memory location where the
+       *    register value specified in the DWord above will read from. The
+       *    address specifies the DWord location of the data. Range =
+       *    GraphicsVirtualAddress[63:2] for a DWord register GraphicsAddress
+       *    [63:48] are ignored by the HW and assumed to be in correct
+       *    canonical form [63:48] == [47]."
+       */
+      reloc_size = sizeof(uint64_t);
+      *(uint64_t *)p = (int64_t)(v << 16) >> 16; /* sign-extend bit 47 */
+   } else {
+      reloc_size = sizeof(uint32_t);
+      *(uint32_t *)p = v;
+   }
+   if (!device->info.has_llc)
+      anv_clflush_range(p, reloc_size);
+}
+
+/* Write each reloc in list into bo's map and record the target's offset. */
+static void
+anv_reloc_list_apply(struct anv_reloc_list *list,
+                     struct anv_device *device, struct anv_bo *bo)
+{
+   for (size_t i = 0; i < list->num_relocs; i++) {
+      struct anv_bo *target_bo = list->reloc_bos[i];
+      void *p = bo->map + list->relocs[i].offset;
+      write_reloc(device, p, target_bo->offset + list->relocs[i].delta);
+      list->relocs[i].presumed_offset = target_bo->offset;
+   }
+}
+
+/**
+ * This function applies the relocations for a command buffer and writes the
+ * actual addresses into the buffers as per what we were told by the kernel on
+ * the previous execbuf2 call.  This should be safe to do because, for each
+ * relocated address, we have two cases:
+ *
+ *  1) The target BO is inactive (as seen by the kernel).  In this case, it is
+ *     not in use by the GPU so updating the address is 100% ok.  It won't be
+ *     in-use by the GPU (from our context) again until the next execbuf2
+ *     happens.  If the kernel decides to move it in the next execbuf2, it
+ *     will have to do the relocations itself, but that's ok because it should
+ *     have all of the information needed to do so.
+ *
+ *  2) The target BO is active (as seen by the kernel).  In this case, it
+ *     hasn't moved since the last execbuffer2 call because GTT shuffling
+ *     *only* happens inside the execbuffer2 ioctl.  Since the target BO
+ *     hasn't moved, our anv_bo::offset exactly matches the BO's GTT address
+ *     and the relocated value we are writing into the BO will be the same as
+ *     the value that is already there.
+ *
+ *     There is also a possibility that the target BO is active but the exact
+ *     RENDER_SURFACE_STATE object we are writing the relocation into isn't in
+ *     use.  In this case, the address currently in the RENDER_SURFACE_STATE
+ *     may be stale but it's still safe to write the relocation because that
+ *     particular RENDER_SURFACE_STATE object isn't in-use by the GPU and
+ *     won't be until the next execbuf2 call.
+ *
+ * By doing relocations on the CPU, we can tell the kernel that it doesn't
+ * need to bother.  We want to do this because the surface state buffer is
+ * used by every command buffer so, if the kernel does the relocations, it
+ * will always be busy and the kernel will always stall.  This is also
+ * probably the fastest mechanism for doing relocations since the kernel would
+ * have to make a full copy of all the relocation lists.
+ */
+static void
+relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer)
+{
+   struct anv_device *device = cmd_buffer->device;
+
+   /* Do nothing at all if any BO still has the (uint64_t)-1 offset value. */
+   for (uint32_t idx = 0; idx < cmd_buffer->execbuf2.bo_count; idx++) {
+      if (cmd_buffer->execbuf2.bos[idx]->offset == (uint64_t)-1)
+         return;
+   }
+
+   anv_reloc_list_apply(&cmd_buffer->surface_relocs, device,
+                        &device->surface_state_block_pool.bo);
+
+   struct anv_batch_bo **bbo_iter;
+   u_vector_foreach(bbo_iter, &cmd_buffer->seen_bbos) {
+      anv_reloc_list_apply(&(*bbo_iter)->relocs, device, &(*bbo_iter)->bo);
+   }
+
+   for (uint32_t idx = 0; idx < cmd_buffer->execbuf2.bo_count; idx++) {
+      const struct anv_bo *bo = cmd_buffer->execbuf2.bos[idx];
+      cmd_buffer->execbuf2.objects[idx].offset = bo->offset;
+   }
+
+   cmd_buffer->execbuf2.execbuf.flags |= I915_EXEC_NO_RELOC;
+}
+
 VkResult
 anv_device_execbuf(struct anv_device *device,
                    struct drm_i915_gem_execbuffer2 *execbuf,
@@ -1097,16 +1196,20 @@ VkResult anv_QueueSubmit(
    struct anv_device *device = queue->device;
    VkResult result = VK_SUCCESS;
 
+   pthread_mutex_lock(&device->mutex);
+
    for (uint32_t i = 0; i < submitCount; i++) {
       for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
          ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
                          pSubmits[i].pCommandBuffers[j]);
          assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
+         relocate_cmd_buffer(cmd_buffer);
+
          result = anv_device_execbuf(device, &cmd_buffer->execbuf2.execbuf,
                                      cmd_buffer->execbuf2.bos);
          if (result != VK_SUCCESS)
-            return result;
+            goto out;
       }
    }
 
@@ -1114,10 +1217,13 @@ VkResult anv_QueueSubmit(
       struct anv_bo *fence_bo = &fence->bo;
       result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
       if (result != VK_SUCCESS)
-         return result;
+         goto out;
    }
 
-   return VK_SUCCESS;
+out:
+   pthread_mutex_unlock(&device->mutex);
+
+   return result;
 }
 
 VkResult anv_QueueWaitIdle(
-- 
2.5.0.400.gff86faf