Mesa (main): v3dv: track submitted jobs by GPU queue type

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Jan 21 13:23:16 UTC 2022


Module: Mesa
Branch: main
Commit: 03a6a8274042af376b86c82ff6bbff9826a924b2
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=03a6a8274042af376b86c82ff6bbff9826a924b2

Author: Melissa Wen <mwen at igalia.com>
Date:   Tue Dec 14 13:40:34 2021 -0100

v3dv: track submitted jobs by GPU queue type

The order in which a GPU job is scheduled is guaranteed within the
same queue type (CL, TFU, CSD), but the order of completion of jobs
from different queues cannot be guaranteed. Since we now support
multiple semaphores, we can track the completion of the last job
submitted to each queue and therefore better determine when the GPU
is idle. We do this using an array of syncobjs (last_job_syncs), one
for each GPU queue (CL, TFU, CSD). With this, job serialization also
becomes more accurate. We also keep tracking the very last job submitted
(last_job_sync became an element of the last_job_syncs array as
V3DV_QUEUE_ANY) for the case where we don't have multisync support.
To help in handling wait semaphores, we set a flag per queue to
indicate we are starting a new cmd buffer batch and a job submitted
to this queue will be the first.

Signed-off-by: Melissa Wen <mwen at igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13178>

---

 src/broadcom/vulkan/v3dv_device.c  |  28 +++++--
 src/broadcom/vulkan/v3dv_private.h |  30 ++++++-
 src/broadcom/vulkan/v3dv_queue.c   | 168 +++++++++++++++++++++++++------------
 3 files changed, 162 insertions(+), 64 deletions(-)

diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 7410bf12b95..0ea21f67827 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1745,6 +1745,16 @@ init_device_meta(struct v3dv_device *device)
    v3dv_meta_texel_buffer_copy_init(device);
 }
 
+static void
+destroy_device_syncs(struct v3dv_device *device,
+                       int render_fd)
+{
+   for (int i = 0; i < V3DV_QUEUE_COUNT; i++) {
+      if (device->last_job_syncs.syncs[i])
+         drmSyncobjDestroy(render_fd, device->last_job_syncs.syncs[i]);
+   }
+}
+
 static void
 destroy_device_meta(struct v3dv_device *device)
 {
@@ -1829,12 +1839,15 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
    if (device->features.robustBufferAccess)
       perf_debug("Device created with Robust Buffer Access enabled.\n");
 
-   int ret = drmSyncobjCreate(physical_device->render_fd,
-                              DRM_SYNCOBJ_CREATE_SIGNALED,
-                              &device->last_job_sync);
-   if (ret) {
-      result = VK_ERROR_INITIALIZATION_FAILED;
-      goto fail;
+   for (int i = 0; i < V3DV_QUEUE_COUNT; i++) {
+      device->last_job_syncs.first[i] = true;
+      int ret = drmSyncobjCreate(physical_device->render_fd,
+                                 DRM_SYNCOBJ_CREATE_SIGNALED,
+                                 &device->last_job_syncs.syncs[i]);
+      if (ret) {
+         result = VK_ERROR_INITIALIZATION_FAILED;
+         goto fail;
+      }
    }
 
 #ifdef DEBUG
@@ -1852,6 +1865,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
    return VK_SUCCESS;
 
 fail:
+   destroy_device_syncs(device, physical_device->render_fd);
    vk_device_finish(&device->vk);
    vk_free(&device->vk.alloc, device);
 
@@ -1867,7 +1881,7 @@ v3dv_DestroyDevice(VkDevice _device,
    v3dv_DeviceWaitIdle(_device);
    queue_finish(&device->queue);
    pthread_mutex_destroy(&device->mutex);
-   drmSyncobjDestroy(device->pdevice->render_fd, device->last_job_sync);
+   destroy_device_syncs(device, device->pdevice->render_fd);
    destroy_device_meta(device);
    v3dv_pipeline_cache_finish(&device->default_pipeline_cache);
 
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 93461b4a8d2..083ad5a21bc 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -434,6 +434,32 @@ struct v3dv_pipeline_cache {
    bool externally_synchronized;
 };
 
+/* FIXME: In addition to tracking the last job submitted by GPU queue (cl, csd,
+ * tfu), we still need a syncobj to track the last overall job submitted
+ * (V3DV_QUEUE_ANY) for the case we don't support multisync. Someday we can
+ * start expecting multisync to be present and drop the legacy implementation
+ * together with this V3DV_QUEUE_ANY tracker.
+ */
+enum v3dv_queue_type {
+   V3DV_QUEUE_CL = 0,
+   V3DV_QUEUE_CSD,
+   V3DV_QUEUE_TFU,
+   V3DV_QUEUE_ANY,
+   V3DV_QUEUE_COUNT,
+};
+
+/* For each GPU queue, we use a syncobj to track the last job submitted. We
+ * set the flag `first` to determine when we are starting a new cmd buffer
+ * batch and therefore a job submitted to a given queue will be the first in a
+ * cmd buf batch.
+ */
+struct v3dv_last_job_sync {
+   /* If the job is the first submitted to a GPU queue in a cmd buffer batch */
+   bool first[V3DV_QUEUE_COUNT];
+   /* Array of syncobj to track the last job submitted to a GPU queue */
+   uint32_t syncs[V3DV_QUEUE_COUNT];
+};
+
 struct v3dv_device {
    struct vk_device vk;
 
@@ -443,8 +469,8 @@ struct v3dv_device {
    struct v3d_device_info devinfo;
    struct v3dv_queue queue;
 
-   /* A sync object to track the last job submitted to the GPU. */
-   uint32_t last_job_sync;
+   /* Syncobjs to track the last job submitted to any GPU queue */
+   struct v3dv_last_job_sync last_job_syncs;
 
    /* A mutex to prevent concurrent access to last_job_sync from the queue */
    mtx_t mutex;
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 41bb6b4caff..17a5e2c60d2 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -133,15 +133,26 @@ static VkResult
 gpu_queue_wait_idle(struct v3dv_queue *queue)
 {
    struct v3dv_device *device = queue->device;
+   int render_fd = device->pdevice->render_fd;
+   struct v3dv_last_job_sync last_job_syncs;
 
    mtx_lock(&device->mutex);
-   uint32_t last_job_sync = device->last_job_sync;
+   memcpy(&last_job_syncs, &device->last_job_syncs, sizeof(last_job_syncs));
    mtx_unlock(&device->mutex);
 
-   int ret = drmSyncobjWait(device->pdevice->render_fd,
-                            &last_job_sync, 1, INT64_MAX, 0, NULL);
-   if (ret)
-      return VK_ERROR_DEVICE_LOST;
+   if (device->pdevice->caps.multisync) {
+      int ret = drmSyncobjWait(render_fd, (uint32_t *) &last_job_syncs.syncs,
+                               3, INT64_MAX,
+                               DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
+      if (ret)
+         return VK_ERROR_DEVICE_LOST;
+   } else {
+      int ret =
+         drmSyncobjWait(render_fd, &last_job_syncs.syncs[V3DV_QUEUE_ANY], 1,
+                        INT64_MAX, 0, NULL);
+      if (ret)
+         return VK_ERROR_DEVICE_LOST;
+   }
 
    return VK_SUCCESS;
 }
@@ -585,7 +596,9 @@ process_semaphores_to_signal(struct v3dv_device *device,
 
    int fd;
    mtx_lock(&device->mutex);
-   drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
+   drmSyncobjExportSyncFile(render_fd,
+                            device->last_job_syncs.syncs[V3DV_QUEUE_ANY],
+                            &fd);
    mtx_unlock(&device->mutex);
    if (fd == -1)
       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -624,7 +637,9 @@ process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
 
    int fd;
    mtx_lock(&device->mutex);
-   drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
+   drmSyncobjExportSyncFile(render_fd,
+                            device->last_job_syncs.syncs[V3DV_QUEUE_ANY],
+                            &fd);
    mtx_unlock(&device->mutex);
    if (fd == -1)
       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -651,14 +666,17 @@ multisync_free(struct v3dv_device *device,
 }
 
 static struct drm_v3d_sem *
-set_syncs(struct v3dv_device *device,
-          uint32_t *count, VkSemaphore *sems,
-          uint32_t last_job_sync)
+set_in_syncs(struct v3dv_device *device,
+             struct v3dv_job *job,
+             uint32_t *count,
+             struct v3dv_submit_info_semaphores *sems_info)
 {
-   uint32_t n_sem = *count;
-
-   if (last_job_sync)
-      (*count)++;
+   /* If we are serializing a job in a cmd buffer, we are already making it
+    * wait until the last job submitted to each queue completes before
+    * running, so in that case we can skip waiting for any additional
+    * semaphores.
+    */
+   *count = job->serialize ? 3 : sems_info->wait_sem_count;
 
    if (!*count)
       return NULL;
@@ -670,14 +688,53 @@ set_syncs(struct v3dv_device *device,
    if (!syncs)
       return NULL;
 
-   if (n_sem)
-      for (unsigned i = 0; i < n_sem; i++) {
-         struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]);
+   if (!job->serialize) {
+      for (int i = 0; i < *count; i++) {
+         struct v3dv_semaphore *sem =
+            v3dv_semaphore_from_handle(sems_info->wait_sems[i]);
+         syncs[i].handle = sem->sync;
+      }
+   } else {
+      for (int i = 0; i < *count; i++)
+         syncs[i].handle = device->last_job_syncs.syncs[i];
+   }
+
+   return syncs;
+}
+
+static struct drm_v3d_sem *
+set_out_syncs(struct v3dv_device *device,
+              bool do_sem_signal,
+              enum v3dv_queue_type queue,
+              uint32_t *count,
+              struct v3dv_submit_info_semaphores *sems_info)
+{
+   uint32_t n_sems = do_sem_signal ? sems_info->signal_sem_count : 0;
+
+   /* We always signal the syncobj from `device->last_job_syncs` related to
+    * this v3dv_queue_type to track the last job submitted to this queue. We
+    * also signal the last overall job (V3DV_QUEUE_ANY) as we use it to
+    * process signal semaphores and fence.
+    */
+   (*count) = n_sems + 2;
+
+   struct drm_v3d_sem *syncs =
+      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
+                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+
+   if (!syncs)
+      return NULL;
+
+   if (n_sems) {
+      for (unsigned i = 0; i < n_sems; i++) {
+         struct v3dv_semaphore *sem =
+            v3dv_semaphore_from_handle(sems_info->signal_sems[i]);
          syncs[i].handle = sem->sync;
       }
+   }
 
-   if (last_job_sync)
-      syncs[n_sem].handle = last_job_sync;
+   syncs[n_sems].handle = device->last_job_syncs.syncs[queue];
+   syncs[++n_sems].handle = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
 
    return syncs;
 }
@@ -702,45 +759,42 @@ set_multisync(struct drm_v3d_multi_sync *ms,
               struct v3dv_submit_info_semaphores *sems_info,
               struct drm_v3d_extension *next,
               struct v3dv_device *device,
+              struct v3dv_job *job,
               struct drm_v3d_sem *out_syncs,
               struct drm_v3d_sem *in_syncs,
               bool do_sem_signal,
-              bool serialize,
-              enum v3d_queue queue)
+              enum v3dv_queue_type queue_sync,
+              enum v3d_queue wait_stage)
 {
    uint32_t out_sync_count = 0, in_sync_count = 0;
 
-   /* We only want to signal out semaphores for this submission upon
-    * completion of the last job involved with it. We still want to always
-    * signal last_job_sync so we can serialize jobs when needed.
-    */
-   out_sync_count = do_sem_signal ? sems_info->signal_sem_count : 0;
-   out_syncs = set_syncs(device, &out_sync_count, sems_info->signal_sems,
-                         device->last_job_sync);
+   in_syncs = set_in_syncs(device, job, &in_sync_count, sems_info);
+   if (!in_syncs && in_sync_count)
+      goto fail;
+
+   out_syncs = set_out_syncs(device, do_sem_signal, queue_sync,
+                             &out_sync_count, sems_info);
 
    assert(out_sync_count > 0);
 
    if (!out_syncs)
-      return;
-
-   /* If we are serializing a job in a command buffer, we are already making
-    * it wait for completion of the last job submitted, so in that case we can
-    * skip waiting for any additional semaphores.
-    */
-   in_sync_count = serialize ? 0 : sems_info->wait_sem_count;
-   in_syncs = set_syncs(device, &in_sync_count, sems_info->wait_sems,
-                        (serialize ? device->last_job_sync : 0));
-   if (!in_syncs && in_sync_count) {
-      vk_free(&device->vk.alloc, out_syncs);
-      return;
-   }
+      goto fail;
 
    set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
-   ms->wait_stage = queue;
+   ms->wait_stage = wait_stage;
    ms->out_sync_count = out_sync_count;
    ms->out_syncs = (uintptr_t)(void *)out_syncs;
    ms->in_sync_count = in_sync_count;
    ms->in_syncs = (uintptr_t)(void *)in_syncs;
+
+   return;
+
+fail:
+   if (in_syncs)
+      vk_free(&device->vk.alloc, in_syncs);
+   assert(!out_syncs);
+
+   return;
 }
 
 static VkResult
@@ -814,13 +868,14 @@ handle_cl_job(struct v3dv_queue *queue,
     */
    if (device->pdevice->caps.multisync) {
       struct drm_v3d_multi_sync ms = { 0 };
+      enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
       /* We are processing all signal VkSemaphores together in the submit
        * master thread and therefore we don't handle signal VkSemaphores in cl
        * submission yet. For this reason, we set do_sem_signal to false in the
        * multisync extension.
        */
-      set_multisync(&ms, sems_info, NULL, device, out_syncs, in_syncs, false,
-                    job->serialize, needs_rcl_sync ? V3D_RENDER : V3D_BIN);
+      set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
+                    false, V3DV_QUEUE_CL, wait_stage);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -831,9 +886,10 @@ handle_cl_job(struct v3dv_queue *queue,
       submit.in_sync_bcl = 0;
       submit.out_sync = 0;
    } else {
-      submit.in_sync_bcl = needs_bcl_sync ? device->last_job_sync : 0;
-      submit.in_sync_rcl = needs_rcl_sync ? device->last_job_sync : 0;
-      submit.out_sync = device->last_job_sync;
+      uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
+      submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0;
+      submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0;
+      submit.out_sync = last_job_sync;
    }
 
    v3dv_clif_dump(device, job, &submit);
@@ -880,8 +936,8 @@ handle_tfu_job(struct v3dv_queue *queue,
        * tfu jobs yet. For this reason, we set do_sem_signal to false in the
        * multisync extension.
        */
-      set_multisync(&ms, sems_info, NULL, device, out_syncs, in_syncs, false,
-                    job->serialize, V3D_TFU);
+      set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
+                    false, V3DV_QUEUE_TFU, V3D_TFU);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -891,8 +947,9 @@ handle_tfu_job(struct v3dv_queue *queue,
       job->tfu.in_sync = 0;
       job->tfu.out_sync = 0;
    } else {
-      job->tfu.in_sync = needs_sync ? device->last_job_sync : 0;
-      job->tfu.out_sync = device->last_job_sync;
+      uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
+      job->tfu.in_sync = needs_sync ? last_job_sync : 0;
+      job->tfu.out_sync = last_job_sync;
    }
    int ret = v3dv_ioctl(device->pdevice->render_fd,
                         DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
@@ -943,8 +1000,8 @@ handle_csd_job(struct v3dv_queue *queue,
        * csd jobs yet. For this reason, we set do_sem_signal to false in the
        * multisync extension.
        */
-      set_multisync(&ms, sems_info, NULL, device, out_syncs, in_syncs, false,
-                    job->serialize, V3D_CSD);
+      set_multisync(&ms, sems_info, NULL, device, job, out_syncs, in_syncs,
+                    false, V3DV_QUEUE_CSD, V3D_CSD);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -954,8 +1011,9 @@ handle_csd_job(struct v3dv_queue *queue,
       submit->in_sync = 0;
       submit->out_sync = 0;
    } else {
-      submit->in_sync = needs_sync ? device->last_job_sync : 0;
-      submit->out_sync = device->last_job_sync;
+      uint32_t last_job_sync = device->last_job_syncs.syncs[V3DV_QUEUE_ANY];
+      submit->in_sync = needs_sync ? last_job_sync : 0;
+      submit->out_sync = last_job_sync;
    }
    int ret = v3dv_ioctl(device->pdevice->render_fd,
                         DRM_IOCTL_V3D_SUBMIT_CSD, submit);



More information about the mesa-commit mailing list