Mesa (master): anv: Move relocation handling from EndCommandBuffer to QueueSubmit

Jason Ekstrand jekstrand at kemper.freedesktop.org
Wed Nov 9 19:32:24 UTC 2016


Module: Mesa
Branch: master
Commit: 8b61c57049ff75766715ad4f7b1ad2d3657b9b4d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8b61c57049ff75766715ad4f7b1ad2d3657b9b4d

Author: Jason Ekstrand <jason.ekstrand at intel.com>
Date:   Sat Nov  5 19:47:33 2016 -0700

anv: Move relocation handling from EndCommandBuffer to QueueSubmit

Ever since the early days of the Vulkan driver, we've been setting up the
lists of relocations at EndCommandBuffer time.  The idea behind this was to
move some of the CPU load out of QueueSubmit, which the client is required
to lock around, and into command buffer building, which can be done in
parallel.  QueueSubmit then basically becomes a bunch of execbuf2 calls.

Technically, this works.  However, as soon as QueueSubmit starts to do more
than just execbuf2, you run into problems.  In particular, if a block pool
is resized between EndCommandBuffer and QueueSubmit, the list of anv_bo
structs and the execbuf2 object list can get out of sync.  That is a
problem if, for instance, you want to do relocations in userspace.
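
To make the failure mode concrete, here is a minimal standalone sketch
of the stale-handle problem.  The types and names below (fake_bo,
fake_exec_object, resize_block_pool) are invented for illustration and
stand in for anv_bo, drm_i915_gem_exec_object2, and a block pool
resize; they are not the driver's real code.

    /* cc -o stale stale.c && ./stale */
    #include <stdint.h>
    #include <stdio.h>

    struct fake_bo {
       uint32_t gem_handle;   /* handle the kernel knows this BO by */
    };

    struct fake_exec_object {
       uint32_t handle;       /* snapshot taken when the list is built */
    };

    /* Resizing a block pool allocates a fresh GEM BO but re-uses the
     * same struct, so the handle changes in place. */
    static void resize_block_pool(struct fake_bo *bo)
    {
       bo->gem_handle++;      /* stand-in for a new kernel handle */
    }

    int main(void)
    {
       struct fake_bo pool_bo = { .gem_handle = 42 };

       /* EndCommandBuffer: snapshot the handle into the exec list. */
       struct fake_exec_object obj = { .handle = pool_bo.gem_handle };

       /* The pool grows before the client calls QueueSubmit. */
       resize_block_pool(&pool_bo);

       /* QueueSubmit would now hand the kernel a stale handle. */
       printf("exec list has handle %u but the live BO is %u\n",
              (unsigned)obj.handle, (unsigned)pool_bo.gem_handle);
       return 0;
    }

Building the object list at QueueSubmit time instead, under the device
mutex, closes this window.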

Signed-off-by: Jason Ekstrand <jason at jlekstrand.net>
Reviewed-by: Kristian H. Kristensen <hoegsberg at google.com>
Cc: "13.0" <mesa-stable at lists.freedesktop.org>

---

 src/intel/vulkan/anv_batch_chain.c | 94 ++++++++++++++++++++------------------
 src/intel/vulkan/anv_device.c      | 30 ++++++++++--
 src/intel/vulkan/anv_private.h     | 13 ------
 src/intel/vulkan/genX_cmd_buffer.c | 11 -----
 4 files changed, 76 insertions(+), 72 deletions(-)

diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 29c7995..e03b585 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -645,20 +645,6 @@ anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
    return VK_SUCCESS;
 }
 
-static void
-anv_execbuf_init(struct anv_execbuf *exec)
-{
-   memset(exec, 0, sizeof(*exec));
-}
-
-static void
-anv_execbuf_finish(struct anv_execbuf *exec,
-                   const VkAllocationCallbacks *alloc)
-{
-   vk_free(alloc, exec->objects);
-   vk_free(alloc, exec->bos);
-}
-
 VkResult
 anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
 {
@@ -706,8 +692,6 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
 
    anv_cmd_buffer_new_binding_table_block(cmd_buffer);
 
-   anv_execbuf_init(&cmd_buffer->execbuf2);
-
    return VK_SUCCESS;
 
  fail_bt_blocks:
@@ -739,8 +723,6 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
                             &cmd_buffer->batch_bos, link) {
       anv_batch_bo_destroy(bbo, cmd_buffer);
    }
-
-   anv_execbuf_finish(&cmd_buffer->execbuf2, &cmd_buffer->pool->alloc);
 }
 
 void
@@ -938,6 +920,31 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
                          &secondary->surface_relocs, 0);
 }
 
+struct anv_execbuf {
+   struct drm_i915_gem_execbuffer2           execbuf;
+
+   struct drm_i915_gem_exec_object2 *        objects;
+   uint32_t                                  bo_count;
+   struct anv_bo **                          bos;
+
+   /* Allocated length of the 'objects' and 'bos' arrays */
+   uint32_t                                  array_length;
+};
+
+static void
+anv_execbuf_init(struct anv_execbuf *exec)
+{
+   memset(exec, 0, sizeof(*exec));
+}
+
+static void
+anv_execbuf_finish(struct anv_execbuf *exec,
+                   const VkAllocationCallbacks *alloc)
+{
+   vk_free(alloc, exec->objects);
+   vk_free(alloc, exec->bos);
+}
+
 static VkResult
 anv_execbuf_add_bo(struct anv_execbuf *exec,
                    struct anv_bo *bo,
@@ -1108,20 +1115,20 @@ adjust_relocations_to_state_pool(struct anv_block_pool *pool,
    }
 }
 
-void
-anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
+VkResult
+anv_cmd_buffer_execbuf(struct anv_device *device,
+                       struct anv_cmd_buffer *cmd_buffer)
 {
    struct anv_batch *batch = &cmd_buffer->batch;
    struct anv_block_pool *ss_pool =
       &cmd_buffer->device->surface_state_block_pool;
 
-   cmd_buffer->execbuf2.bo_count = 0;
+   struct anv_execbuf execbuf;
+   anv_execbuf_init(&execbuf);
 
    adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs,
                                       cmd_buffer->last_ss_pool_center);
-
-   anv_execbuf_add_bo(&cmd_buffer->execbuf2, &ss_pool->bo,
-                      &cmd_buffer->surface_relocs,
+   anv_execbuf_add_bo(&execbuf, &ss_pool->bo, &cmd_buffer->surface_relocs,
                       &cmd_buffer->pool->alloc);
 
    /* First, we walk over all of the bos we've seen and add them and their
@@ -1132,7 +1139,7 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
       adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs,
                                        cmd_buffer->last_ss_pool_center);
 
-      anv_execbuf_add_bo(&cmd_buffer->execbuf2, &(*bbo)->bo, &(*bbo)->relocs,
+      anv_execbuf_add_bo(&execbuf, &(*bbo)->bo, &(*bbo)->relocs,
                          &cmd_buffer->pool->alloc);
    }
 
@@ -1150,20 +1157,19 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
     * corresponding to the first batch_bo in the chain with the last
     * element in the list.
     */
-   if (first_batch_bo->bo.index != cmd_buffer->execbuf2.bo_count - 1) {
+   if (first_batch_bo->bo.index != execbuf.bo_count - 1) {
       uint32_t idx = first_batch_bo->bo.index;
-      uint32_t last_idx = cmd_buffer->execbuf2.bo_count - 1;
+      uint32_t last_idx = execbuf.bo_count - 1;
 
-      struct drm_i915_gem_exec_object2 tmp_obj =
-         cmd_buffer->execbuf2.objects[idx];
-      assert(cmd_buffer->execbuf2.bos[idx] == &first_batch_bo->bo);
+      struct drm_i915_gem_exec_object2 tmp_obj = execbuf.objects[idx];
+      assert(execbuf.bos[idx] == &first_batch_bo->bo);
 
-      cmd_buffer->execbuf2.objects[idx] = cmd_buffer->execbuf2.objects[last_idx];
-      cmd_buffer->execbuf2.bos[idx] = cmd_buffer->execbuf2.bos[last_idx];
-      cmd_buffer->execbuf2.bos[idx]->index = idx;
+      execbuf.objects[idx] = execbuf.objects[last_idx];
+      execbuf.bos[idx] = execbuf.bos[last_idx];
+      execbuf.bos[idx]->index = idx;
 
-      cmd_buffer->execbuf2.objects[last_idx] = tmp_obj;
-      cmd_buffer->execbuf2.bos[last_idx] = &first_batch_bo->bo;
+      execbuf.objects[last_idx] = tmp_obj;
+      execbuf.bos[last_idx] = &first_batch_bo->bo;
       first_batch_bo->bo.index = last_idx;
    }
 
@@ -1184,9 +1190,9 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
       }
    }
 
-   cmd_buffer->execbuf2.execbuf = (struct drm_i915_gem_execbuffer2) {
-      .buffers_ptr = (uintptr_t) cmd_buffer->execbuf2.objects,
-      .buffer_count = cmd_buffer->execbuf2.bo_count,
+   execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
+      .buffers_ptr = (uintptr_t) execbuf.objects,
+      .buffer_count = execbuf.bo_count,
       .batch_start_offset = 0,
       .batch_len = batch->next - batch->start,
       .cliprects_ptr = 0,
@@ -1198,12 +1204,7 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer)
       .rsvd1 = cmd_buffer->device->context_id,
       .rsvd2 = 0,
    };
-}
 
-VkResult
-anv_cmd_buffer_execbuf(struct anv_device *device,
-                       struct anv_cmd_buffer *cmd_buffer)
-{
    /* Since surface states are shared between command buffers and we don't
     * know what order they will be submitted to the kernel, we don't know what
     * address is actually written in the surface state object at any given
@@ -1213,6 +1214,9 @@ anv_cmd_buffer_execbuf(struct anv_device *device,
    for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
       cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
 
-   return anv_device_execbuf(device, &cmd_buffer->execbuf2.execbuf,
-                             cmd_buffer->execbuf2.bos);
+   VkResult result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos);
+
+   anv_execbuf_finish(&execbuf, &cmd_buffer->pool->alloc);
+
+   return result;
 }
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index c40598c..c47961e 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -1097,6 +1097,27 @@ VkResult anv_QueueSubmit(
    struct anv_device *device = queue->device;
    VkResult result = VK_SUCCESS;
 
+   /* We lock around QueueSubmit for two main reasons:
+    *
+    *  1) When a block pool is resized, we create a new gem handle with a
+    *     different size and, in the case of surface states, possibly a
+    *     different center offset but we re-use the same anv_bo struct when
+    *     we do so.  If this happens in the middle of setting up an execbuf,
+    *     we could end up with our list of BOs out of sync with our list of
+    *     gem handles.
+    *
+    *  2) The algorithm we use for building the list of unique buffers isn't
+    *     thread-safe.  While the client is supposed to synchronize around
+    *     QueueSubmit, this would be extremely difficult to debug if it ever
+    *     came up in the wild due to a broken app.  It's better to play it
+    *     safe and just lock around QueueSubmit.
+    *
+    * Since the only other things that ever take the device lock, such as
+    * block pool resizes, happen only rarely, the lock will almost never be
+    * contended, so taking it isn't really an expensive operation here.
+    */
+   pthread_mutex_lock(&device->mutex);
+
    for (uint32_t i = 0; i < submitCount; i++) {
       for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
          ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
@@ -1105,7 +1126,7 @@ VkResult anv_QueueSubmit(
 
          result = anv_cmd_buffer_execbuf(device, cmd_buffer);
          if (result != VK_SUCCESS)
-            return result;
+            goto out;
       }
    }
 
@@ -1113,10 +1134,13 @@ VkResult anv_QueueSubmit(
       struct anv_bo *fence_bo = &fence->bo;
       result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
       if (result != VK_SUCCESS)
-         return result;
+         goto out;
    }
 
-   return VK_SUCCESS;
+out:
+   pthread_mutex_unlock(&device->mutex);
+
+   return result;
 }
 
 VkResult anv_QueueWaitIdle(
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index a60cd3c..4ee6118 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1144,17 +1144,6 @@ enum anv_cmd_buffer_exec_mode {
    ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN,
 };
 
-struct anv_execbuf {
-   struct drm_i915_gem_execbuffer2           execbuf;
-
-   struct drm_i915_gem_exec_object2 *        objects;
-   uint32_t                                  bo_count;
-   struct anv_bo **                          bos;
-
-   /* Allocated length of the 'objects' and 'bos' arrays */
-   uint32_t                                  array_length;
-};
-
 struct anv_cmd_buffer {
    VK_LOADER_DATA                               _loader_data;
 
@@ -1190,8 +1179,6 @@ struct anv_cmd_buffer {
    /** Last seen surface state block pool center bo offset */
    uint32_t                                     last_ss_pool_center;
 
-   struct anv_execbuf                           execbuf2;
-
    /* Serial for tracking buffer completion */
    uint32_t                                     serial;
 
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 24e0012..045cb9d 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -200,20 +200,9 @@ genX(EndCommandBuffer)(
     VkCommandBuffer                             commandBuffer)
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   struct anv_device *device = cmd_buffer->device;
 
    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
 
-   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
-      /* The algorithm used to compute the validate list is not threadsafe as
-       * it uses the bo->index field.  We have to lock the device around it.
-       * Fortunately, the chances for contention here are probably very low.
-       */
-      pthread_mutex_lock(&device->mutex);
-      anv_cmd_buffer_prepare_execbuf(cmd_buffer);
-      pthread_mutex_unlock(&device->mutex);
-   }
-
    return VK_SUCCESS;
 }
 
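For reference, the pattern the anv_QueueSubmit hunk above adopts is:
build the submission state on the stack, consume it, and free it, all
while the device mutex is held, so nothing survives between API calls
that a concurrent block pool resize could invalidate.  A standalone
sketch of that pattern follows; device_t, submit_one, and the other
names are invented for illustration, not the driver's real ones.

    /* cc -pthread -o submit submit.c && ./submit */
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
       pthread_mutex_t mutex;     /* stands in for anv_device::mutex */
    } device_t;

    static int submit_one(device_t *dev, uint32_t handle)
    {
       int ret = 0;

       pthread_mutex_lock(&dev->mutex);

       /* Build the object list from live state under the lock... */
       uint32_t *objects = malloc(sizeof(*objects));
       if (objects) {
          objects[0] = handle;

          /* ...consume it (the real code calls the execbuf2 ioctl)... */
          printf("submitting 1 object, handle %u\n", (unsigned)objects[0]);

          /* ...and tear it down before dropping the lock. */
          free(objects);
       } else {
          ret = -1;
       }

       pthread_mutex_unlock(&dev->mutex);
       return ret;
    }

    int main(void)
    {
       device_t dev = { .mutex = PTHREAD_MUTEX_INITIALIZER };
       return submit_one(&dev, 42);
    }

Because block pool resizes are the only other operations that take the
device lock and they happen rarely, the mutex is almost never contended.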



