Mesa (main): venus: add fence feedback

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Jun 16 19:06:12 UTC 2022


Module: Mesa
Branch: main
Commit: d7f2e6c8d033de19a1d473df4fb1a46c7d365159
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d7f2e6c8d033de19a1d473df4fb1a46c7d365159

Author: Yiwei Zhang <zzyiwei at chromium.org>
Date:   Wed May 25 07:13:13 2022 +0000

venus: add fence feedback

- intercept to record feedback cmds for:
  - vkQueueSubmit
- add feedback code path for:
  - vkGetFenceStatus
  - vkResetFences
- VN_PERF_NO_FENCE_FEEDBACK can disable fence feedback

Test: dEQP-VK.synchronization.basic.fence.*
Test: dEQP-VK.wsi.android.swapchain.render.basic*
Test: dEQP-VK.api.object_management.*
Test: dEQP-VK.api.external.fence.sync_fd.*

Signed-off-by: Yiwei Zhang <zzyiwei at chromium.org>
Reviewed-by: Ryan Neph <ryanneph at google.com>
Reviewed-by: Chad Versace <chadversary at chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16731>

---

 src/virtio/vulkan/vn_device.c   |   4 +-
 src/virtio/vulkan/vn_feedback.c | 106 +++++++++++++++++++
 src/virtio/vulkan/vn_feedback.h |  11 ++
 src/virtio/vulkan/vn_queue.c    | 227 +++++++++++++++++++++++++++++++++++-----
 src/virtio/vulkan/vn_queue.h    |   6 ++
 5 files changed, 324 insertions(+), 30 deletions(-)

diff --git a/src/virtio/vulkan/vn_device.c b/src/virtio/vulkan/vn_device.c
index 5fc070a9ee4..ff17999d70e 100644
--- a/src/virtio/vulkan/vn_device.c
+++ b/src/virtio/vulkan/vn_device.c
@@ -310,7 +310,7 @@ vn_device_feedback_pool_init(struct vn_device *dev)
    static const uint32_t pool_size = 4096;
    const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
 
-   if (VN_PERF(NO_EVENT_FEEDBACK))
+   if (VN_PERF(NO_EVENT_FEEDBACK) && VN_PERF(NO_FENCE_FEEDBACK))
       return VK_SUCCESS;
 
    return vn_feedback_pool_init(dev, &dev->feedback_pool, pool_size, alloc);
@@ -319,7 +319,7 @@ vn_device_feedback_pool_init(struct vn_device *dev)
 static inline void
 vn_device_feedback_pool_fini(struct vn_device *dev)
 {
-   if (VN_PERF(NO_EVENT_FEEDBACK))
+   if (VN_PERF(NO_EVENT_FEEDBACK) && VN_PERF(NO_FENCE_FEEDBACK))
       return;
 
    vn_feedback_pool_fini(&dev->feedback_pool);
diff --git a/src/virtio/vulkan/vn_feedback.c b/src/virtio/vulkan/vn_feedback.c
index 31d653357de..7e2d1a65950 100644
--- a/src/virtio/vulkan/vn_feedback.c
+++ b/src/virtio/vulkan/vn_feedback.c
@@ -323,6 +323,112 @@ vn_feedback_event_cmd_record(VkCommandBuffer cmd_handle,
                          &buf_barrier_after, 0, NULL);
 }
 
+static VkResult
+vn_feedback_fence_cmd_record(VkCommandBuffer cmd_handle,
+                             struct vn_feedback_slot *slot)
+
+{
+   STATIC_ASSERT(sizeof(*slot->status) == 4);
+
+   static const VkCommandBufferBeginInfo begin_info = {
+      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+      .pNext = NULL,
+      .flags = 0,
+      .pInheritanceInfo = NULL,
+   };
+   VkResult result = vn_BeginCommandBuffer(cmd_handle, &begin_info);
+   if (result != VK_SUCCESS)
+      return result;
+
+   static const VkMemoryBarrier mem_barrier_before = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+      .pNext = NULL,
+      /* make pending writes available to stay close to fence signal op */
+      .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+      /* no need to make all memory visible for feedback update */
+      .dstAccessMask = 0,
+   };
+   const VkBufferMemoryBarrier buf_barrier_before = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+      .pNext = NULL,
+      /* slot memory has been made available via mem_barrier_before */
+      .srcAccessMask = 0,
+      .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .buffer = slot->buffer,
+      .offset = slot->offset,
+      .size = 4,
+   };
+   vn_CmdPipelineBarrier(cmd_handle, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1,
+                         &mem_barrier_before, 1, &buf_barrier_before, 0,
+                         NULL);
+   vn_CmdFillBuffer(cmd_handle, slot->buffer, slot->offset, 4, VK_SUCCESS);
+
+   const VkBufferMemoryBarrier buf_barrier_after = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+      .pNext = NULL,
+      .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+      .dstAccessMask = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .buffer = slot->buffer,
+      .offset = slot->offset,
+      .size = 4,
+   };
+   vn_CmdPipelineBarrier(cmd_handle, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                         VK_PIPELINE_STAGE_HOST_BIT, 0, 0, NULL, 1,
+                         &buf_barrier_after, 0, NULL);
+
+   return vn_EndCommandBuffer(cmd_handle);
+}
+
+VkResult
+vn_feedback_fence_cmd_alloc(VkDevice dev_handle,
+                            struct vn_feedback_cmd_pool *pool,
+                            struct vn_feedback_slot *slot,
+                            VkCommandBuffer *out_cmd_handle)
+{
+   const VkCommandBufferAllocateInfo info = {
+      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+      .pNext = NULL,
+      .commandPool = pool->pool,
+      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+      .commandBufferCount = 1,
+   };
+   VkCommandBuffer cmd_handle;
+   VkResult result;
+
+   simple_mtx_lock(&pool->mutex);
+   result = vn_AllocateCommandBuffers(dev_handle, &info, &cmd_handle);
+   if (result != VK_SUCCESS)
+      goto out_unlock;
+
+   result = vn_feedback_fence_cmd_record(cmd_handle, slot);
+   if (result != VK_SUCCESS) {
+      vn_FreeCommandBuffers(dev_handle, pool->pool, 1, &cmd_handle);
+      goto out_unlock;
+   }
+
+   *out_cmd_handle = cmd_handle;
+
+out_unlock:
+   simple_mtx_unlock(&pool->mutex);
+
+   return result;
+}
+
+void
+vn_feedback_fence_cmd_free(VkDevice dev_handle,
+                           struct vn_feedback_cmd_pool *pool,
+                           VkCommandBuffer cmd_handle)
+{
+   simple_mtx_lock(&pool->mutex);
+   vn_FreeCommandBuffers(dev_handle, pool->pool, 1, &cmd_handle);
+   simple_mtx_unlock(&pool->mutex);
+}
+
 VkResult
 vn_feedback_cmd_pools_init(struct vn_device *dev)
 {
diff --git a/src/virtio/vulkan/vn_feedback.h b/src/virtio/vulkan/vn_feedback.h
index c391ed8d78e..8f7870c894b 100644
--- a/src/virtio/vulkan/vn_feedback.h
+++ b/src/virtio/vulkan/vn_feedback.h
@@ -113,6 +113,17 @@ vn_feedback_event_cmd_record(VkCommandBuffer cmd_handle,
                              VkPipelineStageFlags stage_mask,
                              VkResult status);
 
+VkResult
+vn_feedback_fence_cmd_alloc(VkDevice dev_handle,
+                            struct vn_feedback_cmd_pool *pool,
+                            struct vn_feedback_slot *slot,
+                            VkCommandBuffer *out_cmd_handle);
+
+void
+vn_feedback_fence_cmd_free(VkDevice dev_handle,
+                           struct vn_feedback_cmd_pool *pool,
+                           VkCommandBuffer cmd_handle);
+
 VkResult
 vn_feedback_cmd_pools_init(struct vn_device *dev);
 
diff --git a/src/virtio/vulkan/vn_queue.c b/src/virtio/vulkan/vn_queue.c
index e50d28b2eaf..7408ba50941 100644
--- a/src/virtio/vulkan/vn_queue.c
+++ b/src/virtio/vulkan/vn_queue.c
@@ -312,6 +312,38 @@ vn_queue_submission_cleanup(struct vn_queue_submission *submit)
    vk_free(alloc, submit->temp.storage);
 }
 
+static inline uint32_t
+vn_queue_family_array_index(struct vn_queue *queue)
+{
+   for (uint32_t i = 0; i < queue->device->queue_family_count; i++) {
+      if (queue->device->queue_families[i] == queue->family)
+         return i;
+   }
+   unreachable("invalid queue");
+}
+
+static VkResult
+vn_queue_submit(struct vn_instance *instance,
+                VkQueue queue_handle,
+                uint32_t batch_count,
+                const VkSubmitInfo *batches,
+                VkFence fence_handle,
+                bool sync_submit)
+{
+   /* skip no-op submit */
+   if (!batch_count && fence_handle == VK_NULL_HANDLE)
+      return VK_SUCCESS;
+
+   if (sync_submit) {
+      return vn_call_vkQueueSubmit(instance, queue_handle, batch_count,
+                                   batches, fence_handle);
+   }
+
+   vn_async_vkQueueSubmit(instance, queue_handle, batch_count, batches,
+                          fence_handle);
+   return VK_SUCCESS;
+}
+
 VkResult
 vn_QueueSubmit(VkQueue _queue,
                uint32_t submitCount,
@@ -322,15 +354,18 @@ vn_QueueSubmit(VkQueue _queue,
    struct vn_queue *queue = vn_queue_from_handle(_queue);
    struct vn_device *dev = queue->device;
    struct vn_fence *fence = vn_fence_from_handle(_fence);
-   const bool is_fence_external = fence && fence->is_external;
-
+   const bool external_fence = fence && fence->is_external;
+   const bool feedback_fence = fence && fence->feedback.slot;
    struct vn_queue_submission submit;
-   VkResult result = vn_queue_submission_prepare_submit(
-      &submit, _queue, submitCount, pSubmits, _fence);
+   const struct vn_device_memory *wsi_mem = NULL;
+   bool sync_submit;
+   VkResult result;
+
+   result = vn_queue_submission_prepare_submit(&submit, _queue, submitCount,
+                                               pSubmits, _fence);
    if (result != VK_SUCCESS)
       return vn_error(dev->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
-   const struct vn_device_memory *wsi_mem = NULL;
    if (submit.batch_count == 1) {
       const struct wsi_memory_signal_submit_info *info = vk_find_struct_const(
          submit.submit_batches[0].pNext, WSI_MEMORY_SIGNAL_SUBMIT_INFO_MESA);
@@ -340,22 +375,51 @@ vn_QueueSubmit(VkQueue _queue,
       }
    }
 
-   /* TODO defer roundtrip for external fence until the next sync operation */
-   if (!wsi_mem && !is_fence_external && !VN_PERF(NO_ASYNC_QUEUE_SUBMIT)) {
-      vn_async_vkQueueSubmit(dev->instance, submit.queue, submit.batch_count,
-                             submit.submit_batches, submit.fence);
-      vn_queue_submission_cleanup(&submit);
-      return VK_SUCCESS;
-   }
-
-   result =
-      vn_call_vkQueueSubmit(dev->instance, submit.queue, submit.batch_count,
-                            submit.submit_batches, submit.fence);
+   /* force synchronous submission if any of the below applies:
+    * - struct wsi_memory_signal_submit_info
+    * - fence is an external fence
+    * - NO_ASYNC_QUEUE_SUBMIT perf option enabled
+    */
+   sync_submit = wsi_mem || external_fence || VN_PERF(NO_ASYNC_QUEUE_SUBMIT);
+
+   /* if the original submission involves a feedback fence:
+    * - defer the feedback fence to another submit to avoid deep copy
+    * - defer the potential sync_submit to the feedback fence submission
+    */
+   result = vn_queue_submit(dev->instance, submit.queue, submit.batch_count,
+                            submit.submit_batches,
+                            feedback_fence ? VK_NULL_HANDLE : submit.fence,
+                            !feedback_fence && sync_submit);
    if (result != VK_SUCCESS) {
       vn_queue_submission_cleanup(&submit);
       return vn_error(dev->instance, result);
    }
 
+   /* TODO intercept original submit batches to append the fence feedback cmd
+    * with a per-queue cached submission builder to avoid transient allocs.
+    *
+    * vn_queue_submission bits must be fixed for VkTimelineSemaphoreSubmitInfo
+    * before adding timeline semaphore feedback.
+    */
+   if (feedback_fence) {
+      const uint32_t feedback_cmd_index = vn_queue_family_array_index(queue);
+      const VkSubmitInfo info = {
+         .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+         .pNext = NULL,
+         .waitSemaphoreCount = 0,
+         .pWaitSemaphores = NULL,
+         .pWaitDstStageMask = NULL,
+         .commandBufferCount = 1,
+         .pCommandBuffers = &fence->feedback.commands[feedback_cmd_index],
+      };
+      result = vn_queue_submit(dev->instance, submit.queue, 1, &info,
+                               submit.fence, sync_submit);
+      if (result != VK_SUCCESS) {
+         vn_queue_submission_cleanup(&submit);
+         return vn_error(dev->instance, result);
+      }
+   }
+
    if (wsi_mem) {
       /* XXX this is always false and kills the performance */
       if (dev->instance->renderer->info.has_implicit_fencing) {
@@ -463,6 +527,84 @@ vn_fence_signal_wsi(struct vn_device *dev, struct vn_fence *fence)
    fence->payload = temp;
 }
 
+static VkResult
+vn_fence_feedback_init(struct vn_device *dev,
+                       struct vn_fence *fence,
+                       bool signaled,
+                       const VkAllocationCallbacks *alloc)
+{
+   VkDevice dev_handle = vn_device_to_handle(dev);
+   struct vn_feedback_slot *slot;
+   VkCommandBuffer *cmd_handles;
+   VkResult result;
+
+   /* Fence feedback implementation relies on vkWaitForFences to cover the gap
+    * between feedback slot signaling and the actual fence signal operation.
+    */
+   if (unlikely(!dev->instance->renderer->info.allow_vk_wait_syncs))
+      return VK_SUCCESS;
+
+   if (VN_PERF(NO_FENCE_FEEDBACK))
+      return VK_SUCCESS;
+
+   slot = vn_feedback_pool_alloc(&dev->feedback_pool, VN_FEEDBACK_TYPE_FENCE);
+   if (!slot)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   vn_feedback_set_status(slot, signaled ? VK_SUCCESS : VK_NOT_READY);
+
+   cmd_handles =
+      vk_zalloc(alloc, sizeof(*cmd_handles) * dev->queue_family_count,
+                VN_DEFAULT_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!cmd_handles) {
+      vn_feedback_pool_free(&dev->feedback_pool, slot);
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
+      result = vn_feedback_fence_cmd_alloc(dev_handle, &dev->cmd_pools[i],
+                                           slot, &cmd_handles[i]);
+      if (result != VK_SUCCESS) {
+         for (uint32_t j = 0; j < i; j++) {
+            vn_feedback_fence_cmd_free(dev_handle, &dev->cmd_pools[j],
+                                       cmd_handles[j]);
+         }
+         break;
+      }
+   }
+
+   if (result != VK_SUCCESS) {
+      vk_free(alloc, cmd_handles);
+      vn_feedback_pool_free(&dev->feedback_pool, slot);
+      return result;
+   }
+
+   fence->feedback.slot = slot;
+   fence->feedback.commands = cmd_handles;
+
+   return VK_SUCCESS;
+}
+
+static void
+vn_fence_feedback_fini(struct vn_device *dev,
+                       struct vn_fence *fence,
+                       const VkAllocationCallbacks *alloc)
+{
+   VkDevice dev_handle = vn_device_to_handle(dev);
+
+   if (!fence->feedback.slot)
+      return;
+
+   for (uint32_t i = 0; i < dev->queue_family_count; i++) {
+      vn_feedback_fence_cmd_free(dev_handle, &dev->cmd_pools[i],
+                                 fence->feedback.commands[i]);
+   }
+
+   vn_feedback_pool_free(&dev->feedback_pool, fence->feedback.slot);
+
+   vk_free(alloc, fence->feedback.commands);
+}
+
 VkResult
 vn_CreateFence(VkDevice device,
                const VkFenceCreateInfo *pCreateInfo,
@@ -472,6 +614,8 @@ vn_CreateFence(VkDevice device,
    struct vn_device *dev = vn_device_from_handle(device);
    const VkAllocationCallbacks *alloc =
       pAllocator ? pAllocator : &dev->base.base.alloc;
+   const bool signaled = pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT;
+   VkResult result;
 
    struct vn_fence *fence = vk_zalloc(alloc, sizeof(*fence), VN_DEFAULT_ALIGN,
                                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -491,21 +635,27 @@ vn_CreateFence(VkDevice device,
       fence->is_external = !!export_info->handleTypes;
    }
 
-   VkResult result = vn_fence_init_payloads(
-      dev, fence, pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT, alloc);
-   if (result != VK_SUCCESS) {
-      vn_object_base_fini(&fence->base);
-      vk_free(alloc, fence);
-      return vn_error(dev->instance, result);
-   }
+   result = vn_fence_init_payloads(dev, fence, signaled, alloc);
+   if (result != VK_SUCCESS)
+      goto out_object_base_fini;
 
-   VkFence fence_handle = vn_fence_to_handle(fence);
-   vn_async_vkCreateFence(dev->instance, device, pCreateInfo, NULL,
-                          &fence_handle);
+   result = vn_fence_feedback_init(dev, fence, signaled, alloc);
+   if (result != VK_SUCCESS)
+      goto out_payloads_fini;
 
-   *pFence = fence_handle;
+   *pFence = vn_fence_to_handle(fence);
+   vn_async_vkCreateFence(dev->instance, device, pCreateInfo, NULL, pFence);
 
    return VK_SUCCESS;
+
+out_payloads_fini:
+   vn_sync_payload_release(dev, &fence->permanent);
+   vn_sync_payload_release(dev, &fence->temporary);
+
+out_object_base_fini:
+   vn_object_base_fini(&fence->base);
+   vk_free(alloc, fence);
+   return vn_error(dev->instance, result);
 }
 
 void
@@ -523,6 +673,8 @@ vn_DestroyFence(VkDevice device,
 
    vn_async_vkDestroyFence(dev->instance, device, _fence, NULL);
 
+   vn_fence_feedback_fini(dev, fence, alloc);
+
    vn_sync_payload_release(dev, &fence->permanent);
    vn_sync_payload_release(dev, &fence->temporary);
 
@@ -549,6 +701,9 @@ vn_ResetFences(VkDevice device, uint32_t fenceCount, const VkFence *pFences)
 
       assert(perm->type == VN_SYNC_TYPE_DEVICE_ONLY);
       fence->payload = perm;
+
+      if (fence->feedback.slot)
+         vn_feedback_reset_status(fence->feedback.slot);
    }
 
    return VK_SUCCESS;
@@ -564,7 +719,23 @@ vn_GetFenceStatus(VkDevice device, VkFence _fence)
    VkResult result;
    switch (payload->type) {
    case VN_SYNC_TYPE_DEVICE_ONLY:
-      result = vn_call_vkGetFenceStatus(dev->instance, device, _fence);
+      if (fence->feedback.slot) {
+         result = vn_feedback_get_status(fence->feedback.slot);
+         if (result == VK_SUCCESS) {
+            /* When fence feedback slot gets signaled, the real fence
+             * signal operation follows after but the signaling isr can be
+             * deferred or preempted. To avoid theoretical racing, we let
+             * the renderer wait for the fence. This also helps resolve
+             * synchronization validation errors, because the layer no
+             * longer sees any fence status checks and falsely believes the
+             * caller does not sync.
+             */
+            vn_async_vkWaitForFences(dev->instance, device, 1, &_fence,
+                                     VK_TRUE, UINT64_MAX);
+         }
+      } else {
+         result = vn_call_vkGetFenceStatus(dev->instance, device, _fence);
+      }
       break;
    case VN_SYNC_TYPE_WSI_SIGNALED:
       result = VK_SUCCESS;
diff --git a/src/virtio/vulkan/vn_queue.h b/src/virtio/vulkan/vn_queue.h
index a66697b4f9c..594ca226e53 100644
--- a/src/virtio/vulkan/vn_queue.h
+++ b/src/virtio/vulkan/vn_queue.h
@@ -50,6 +50,12 @@ struct vn_fence {
    struct vn_sync_payload permanent;
    struct vn_sync_payload temporary;
 
+   struct {
+      /* non-NULL if VN_PERF_NO_FENCE_FEEDBACK is disabled */
+      struct vn_feedback_slot *slot;
+      VkCommandBuffer *commands;
+   } feedback;
+
    bool is_external;
 };
 VK_DEFINE_NONDISP_HANDLE_CASTS(vn_fence,



More information about the mesa-commit mailing list