Mesa (main): turnip: Implement VK_KHR_timeline_semaphore.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed May 12 05:31:16 UTC 2021


Module: Mesa
Branch: main
Commit: 3f229e34c9793b71001f85e08ef64b7f33565d50
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3f229e34c9793b71001f85e08ef64b7f33565d50

Author: Hyunjun Ko <zzoon at igalia.com>
Date:   Thu May  6 05:05:39 2021 +0000

turnip: Implement VK_KHR_timeline_semaphore.

Implements non-shareable timelines using legacy syncobjs,
inspired by anv/radv implementation.

v1. Avoid memcpy in/out_syncobjs and fix some mistakes.
v2.
 - Handle vkQueueWaitIdle.
 - Add enum tu_semaphore_type.
 - Fix to handle VK_SEMAPHORE_WAIT_ANY_BIT_KHR correctly.
 - Fix a crash of dEQP-VK.synchronization.timeline_semaphore.device_host.misc.max_difference_value.

v3. Avoid indefinite waiting in vkQueueWaitIdle by calling
tu_device_submit_deferred_locked itself.

Signed-off-by: Hyunjun Ko <zzoon at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10126>

---

 docs/features.txt                 |   2 +-
 src/freedreno/vulkan/tu_device.c  |  54 ++-
 src/freedreno/vulkan/tu_drm.c     | 830 +++++++++++++++++++++++++++++++++++---
 src/freedreno/vulkan/tu_private.h |  11 +
 4 files changed, 830 insertions(+), 67 deletions(-)

diff --git a/docs/features.txt b/docs/features.txt
index 6a168c1bb8b..66b4d7f6370 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -462,7 +462,7 @@ Vulkan 1.2 -- all DONE: anv, vn
   VK_KHR_shader_float_controls                          DONE (anv/gen8+, radv, tu, vn)
   VK_KHR_shader_subgroup_extended_types                 DONE (anv/gen8+, radv, vn)
   VK_KHR_spirv_1_4                                      DONE (anv, radv, tu, vn)
-  VK_KHR_timeline_semaphore                             DONE (anv, radv, vn)
+  VK_KHR_timeline_semaphore                             DONE (anv, radv, tu, vn)
   VK_KHR_uniform_buffer_standard_layout                 DONE (anv, lvp, radv, vn)
   VK_KHR_vulkan_memory_model                            DONE (anv, radv, tu, vn)
   VK_EXT_descriptor_indexing                            DONE (anv/gen9+, radv, tu, vn)
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index 697b80980a9..a003a4c6911 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -148,6 +148,9 @@ get_device_extensions(const struct tu_physical_device *device,
       .KHR_swapchain = TU_HAS_SURFACE,
       .KHR_variable_pointers = true,
       .KHR_vulkan_memory_model = true,
+#ifndef ANDROID
+      .KHR_timeline_semaphore = true,
+#endif
 #ifdef VK_USE_PLATFORM_DISPLAY_KHR
       .EXT_display_control = true,
 #endif
@@ -565,7 +568,7 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
          features->shaderSubgroupExtendedTypes         = false;
          features->separateDepthStencilLayouts         = false;
          features->hostQueryReset                      = true;
-         features->timelineSemaphore                   = false;
+         features->timelineSemaphore                   = true;
          features->bufferDeviceAddress                 = false;
          features->bufferDeviceAddressCaptureReplay    = false;
          features->bufferDeviceAddressMultiDevice      = false;
@@ -757,6 +760,12 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
          feature->vulkanMemoryModelAvailabilityVisibilityChains = true;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES: {
+         VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *features =
+            (VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *) ext;
+         features->timelineSemaphore = true;
+         break;
+      }
 
       default:
          break;
@@ -1076,6 +1085,12 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          props->robustUniformBufferAccessSizeAlignment = 16;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES: {
+         VkPhysicalDeviceTimelineSemaphorePropertiesKHR *props =
+            (VkPhysicalDeviceTimelineSemaphorePropertiesKHR *) ext;
+         props->maxTimelineSemaphoreValueDifference = UINT64_MAX;
+         break;
+      }
       default:
          break;
       }
@@ -1198,6 +1213,8 @@ tu_queue_init(struct tu_device *device,
    queue->queue_idx = idx;
    queue->flags = flags;
 
+   list_inithead(&queue->queued_submits);
+
    int ret = tu_drm_submitqueue_new(device, 0, &queue->msm_queue_id);
    if (ret)
       return vk_startup_errorf(device->instance, VK_ERROR_INITIALIZATION_FAILED,
@@ -1294,6 +1311,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    device->_lost = false;
 
    mtx_init(&device->bo_mutex, mtx_plain);
+   pthread_mutex_init(&device->submit_mutex, NULL);
 
    for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
       const VkDeviceQueueCreateInfo *queue_create =
@@ -1424,6 +1442,24 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       }
    }
 
+   /* Initialize a condition variable for timeline semaphore */
+   pthread_condattr_t condattr;
+   if (pthread_condattr_init(&condattr) != 0) {
+      result = VK_ERROR_INITIALIZATION_FAILED;
+      goto fail_timeline_cond;
+   }
+   if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC) != 0) {
+      pthread_condattr_destroy(&condattr);
+      result = VK_ERROR_INITIALIZATION_FAILED;
+      goto fail_timeline_cond;
+   }
+   if (pthread_cond_init(&device->timeline_cond, &condattr) != 0) {
+      pthread_condattr_destroy(&condattr);
+      result = VK_ERROR_INITIALIZATION_FAILED;
+      goto fail_timeline_cond;
+   }
+   pthread_condattr_destroy(&condattr);
+
    device->mem_cache = tu_pipeline_cache_from_handle(pc);
 
    for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
@@ -1434,6 +1470,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    *pDevice = tu_device_to_handle(device);
    return VK_SUCCESS;
 
+fail_timeline_cond:
 fail_prepare_perfcntrs_pass_cs:
    free(device->perfcntrs_pass_cs_entries);
    tu_cs_finish(device->perfcntrs_pass_cs);
@@ -1492,6 +1529,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
       free(device->perfcntrs_pass_cs);
    }
 
+   pthread_cond_destroy(&device->timeline_cond);
    vk_free(&device->vk.alloc, device->bo_list);
    vk_free(&device->vk.alloc, device->bo_idx);
    vk_device_finish(&device->vk);
@@ -1609,6 +1647,20 @@ tu_QueueWaitIdle(VkQueue _queue)
    if (queue->fence < 0)
       return VK_SUCCESS;
 
+   pthread_mutex_lock(&queue->device->submit_mutex);
+
+   do {
+      tu_device_submit_deferred_locked(queue->device);
+
+      if (list_is_empty(&queue->queued_submits))
+         break;
+
+      pthread_cond_wait(&queue->device->timeline_cond,
+            &queue->device->submit_mutex);
+   } while (!list_is_empty(&queue->queued_submits));
+
+   pthread_mutex_unlock(&queue->device->submit_mutex);
+
    struct pollfd fds = { .fd = queue->fence, .events = POLLIN };
    int ret;
    do {
diff --git a/src/freedreno/vulkan/tu_drm.c b/src/freedreno/vulkan/tu_drm.c
index 6da91dad87e..a668556eca3 100644
--- a/src/freedreno/vulkan/tu_drm.c
+++ b/src/freedreno/vulkan/tu_drm.c
@@ -32,6 +32,8 @@
 #include "vk_util.h"
 
 #include "drm-uapi/msm_drm.h"
+#include "util/timespec.h"
+#include "util/os_time.h"
 
 #include "tu_private.h"
 
@@ -39,14 +41,73 @@ struct tu_binary_syncobj {
    uint32_t permanent, temporary;
 };
 
+/* One outstanding signal on a timeline: the backing DRM syncobj is handed
+ * to the kernel as an out-syncobj and corresponds to payload 'value'.
+ */
+struct tu_timeline_point {
+   struct list_head link;
+
+   uint64_t value;       /* timeline payload this point represents */
+   uint32_t syncobj;     /* DRM syncobj handle backing this point */
+   uint32_t wait_count;
+};
+
+/* Host-side state of a timeline semaphore, implemented on legacy syncobjs
+ * (non-shareable).
+ */
+struct tu_timeline {
+   uint64_t highest_submitted;
+   uint64_t highest_signaled;
+
+   /* A timeline can have multiple timeline points */
+   struct list_head points;
+
+   /* A list containing points that have already been submitted.
+    * A point is moved back to 'points' when a new point is required
+    * at submit time.
+    */
+   struct list_head free_points;
+};
+
+typedef enum {
+   TU_SEMAPHORE_BINARY,
+   TU_SEMAPHORE_TIMELINE,
+} tu_semaphore_type;
+
+
 struct tu_syncobj {
    struct vk_object_base base;
 
+   tu_semaphore_type type;
    union {
       struct tu_binary_syncobj binary;
+      struct tu_timeline timeline;
    };
 };
 
+/* A QueueSubmit batch queued for deferred execution until its timeline
+ * waits are satisfied (see tu_queue_submit_ready_locked).
+ */
+struct tu_queue_submit
+{
+   struct   list_head link;
+
+   struct   tu_syncobj **wait_semaphores;
+   uint32_t wait_semaphore_count;
+   struct   tu_syncobj **signal_semaphores;
+   uint32_t signal_semaphore_count;
+
+   /* Parallel arrays: timeline wait semaphores and their wait values. */
+   struct   tu_syncobj **wait_timelines;
+   uint64_t *wait_timeline_values;
+   uint32_t wait_timeline_count;
+   uint32_t wait_timeline_array_length;
+
+   /* Parallel arrays: timeline signal semaphores and their signal values. */
+   struct   tu_syncobj **signal_timelines;
+   uint64_t *signal_timeline_values;
+   uint32_t signal_timeline_count;
+   uint32_t signal_timeline_array_length;
+
+   struct   drm_msm_gem_submit_cmd *cmds;
+   struct   drm_msm_gem_submit_syncobj *in_syncobjs;
+   uint32_t nr_in_syncobjs;
+   struct   drm_msm_gem_submit_syncobj *out_syncobjs;
+   uint32_t nr_out_syncobjs;
+
+   bool     last_submit;   /* requests MSM_SUBMIT_FENCE_FD_OUT */
+   uint32_t entry_count;
+};
+
 static int
 tu_drm_get_param(const struct tu_physical_device *dev,
                  uint32_t param,
@@ -454,11 +515,33 @@ tu_enumerate_devices(struct tu_instance *instance)
    return result;
 }
 
+/* Free all points of a timeline (both the recycled 'free_points' and the
+ * pending 'points'), destroying each point's DRM syncobj.  Called from
+ * sync_destroy when a timeline semaphore is destroyed.
+ */
+static void
+tu_timeline_finish(struct tu_device *device,
+                    struct tu_timeline *timeline)
+{
+   list_for_each_entry_safe(struct tu_timeline_point, point,
+                            &timeline->free_points, link) {
+      list_del(&point->link);
+      ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY,
+            &(struct drm_syncobj_destroy) { .handle = point->syncobj });
+
+      vk_free(&device->vk.alloc, point);
+   }
+   list_for_each_entry_safe(struct tu_timeline_point, point,
+                            &timeline->points, link) {
+      list_del(&point->link);
+      ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY,
+            &(struct drm_syncobj_destroy) { .handle = point->syncobj });
+      vk_free(&device->vk.alloc, point);
+   }
+}
+
 static VkResult
 sync_create(VkDevice _device,
             bool signaled,
             bool fence,
             bool binary,
+            uint64_t timeline_value,
             const VkAllocationCallbacks *pAllocator,
             void **p_sync)
 {
@@ -483,6 +566,13 @@ sync_create(VkDevice _device,
 
       sync->binary.permanent = create.handle;
       sync->binary.temporary = 0;
+      sync->type = TU_SEMAPHORE_BINARY;
+   } else {
+      sync->type = TU_SEMAPHORE_TIMELINE;
+      sync->timeline.highest_signaled = sync->timeline.highest_submitted =
+             timeline_value;
+      list_inithead(&sync->timeline.points);
+      list_inithead(&sync->timeline.free_points);
    }
 
    *p_sync = sync;
@@ -508,9 +598,13 @@ sync_destroy(VkDevice _device, struct tu_syncobj *sync, const VkAllocationCallba
    if (!sync)
       return;
 
-   sync_set_temporary(device, sync, 0);
-   ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY,
-         &(struct drm_syncobj_destroy) { .handle = sync->binary.permanent });
+   if (sync->type == TU_SEMAPHORE_BINARY) {
+      sync_set_temporary(device, sync, 0);
+      ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY,
+            &(struct drm_syncobj_destroy) { .handle = sync->binary.permanent });
+   } else {
+      tu_timeline_finish(device, &sync->timeline);
+   }
 
    vk_object_free(&device->vk, pAllocator, sync);
 }
@@ -588,13 +682,31 @@ sync_export(VkDevice _device, struct tu_syncobj *sync, bool sync_fd, int *p_fd)
    return VK_SUCCESS;
 }
 
+/* Read the semaphore type from a VkSemaphoreTypeCreateInfoKHR chained in
+ * 'pNext'; defaults to binary when the struct is absent.  '*initial_value'
+ * is written only when the struct is present and 'initial_value' is
+ * non-NULL.
+ */
+static VkSemaphoreTypeKHR
+get_semaphore_type(const void *pNext, uint64_t *initial_value)
+{
+   const VkSemaphoreTypeCreateInfoKHR *type_info =
+      vk_find_struct_const(pNext, SEMAPHORE_TYPE_CREATE_INFO_KHR);
+
+   if (!type_info)
+      return VK_SEMAPHORE_TYPE_BINARY_KHR;
+
+   if (initial_value)
+      *initial_value = type_info->initialValue;
+   return type_info->semaphoreType;
+}
+
 VkResult
 tu_CreateSemaphore(VkDevice device,
                    const VkSemaphoreCreateInfo *pCreateInfo,
                    const VkAllocationCallbacks *pAllocator,
                    VkSemaphore *pSemaphore)
 {
-   return sync_create(device, false, false, true, pAllocator, (void**) pSemaphore);
+   uint64_t timeline_value = 0;
+   VkSemaphoreTypeKHR sem_type = get_semaphore_type(pCreateInfo->pNext, &timeline_value);
+
+   return sync_create(device, false, false, (sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR),
+                      timeline_value, pAllocator, (void**) pSemaphore);
 }
 
 void
@@ -626,8 +738,11 @@ tu_GetPhysicalDeviceExternalSemaphoreProperties(
    const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo,
    VkExternalSemaphoreProperties *pExternalSemaphoreProperties)
 {
-   if (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT ||
-       pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) {
+   VkSemaphoreTypeKHR type = get_semaphore_type(pExternalSemaphoreInfo->pNext, NULL);
+
+   if (type != VK_SEMAPHORE_TYPE_TIMELINE &&
+       (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT ||
+       pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT )) {
       pExternalSemaphoreProperties->exportFromImportedHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
       pExternalSemaphoreProperties->compatibleHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
       pExternalSemaphoreProperties->externalSemaphoreFeatures = VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT |
@@ -639,6 +754,413 @@ tu_GetPhysicalDeviceExternalSemaphoreProperties(
    }
 }
 
+/* Append (timeline, value) to the submit's timeline wait list, growing the
+ * parallel 'wait_timelines'/'wait_timeline_values' arrays on demand.
+ * Caller holds the device submit lock.
+ */
+static VkResult
+tu_queue_submit_add_timeline_wait_locked(struct tu_queue_submit* submit,
+                                         struct tu_device *device,
+                                         struct tu_syncobj *timeline,
+                                         uint64_t value)
+{
+   if (submit->wait_timeline_count >= submit->wait_timeline_array_length) {
+      uint32_t new_len = MAX2(submit->wait_timeline_array_length * 2, 64);
+
+      /* Grow through temporaries: overwriting the submit's pointer with a
+       * failed realloc result would leak the old array, and freeing the
+       * live array here would leave a dangling pointer for
+       * tu_queue_submit_free to double-free.
+       */
+      struct tu_syncobj **new_timelines = vk_realloc(&device->vk.alloc,
+            submit->wait_timelines,
+            new_len * sizeof(*submit->wait_timelines),
+            8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+      if (new_timelines == NULL)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+      submit->wait_timelines = new_timelines;
+
+      uint64_t *new_values = vk_realloc(&device->vk.alloc,
+            submit->wait_timeline_values,
+            new_len * sizeof(*submit->wait_timeline_values),
+            8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+      if (new_values == NULL)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+      submit->wait_timeline_values = new_values;
+      submit->wait_timeline_array_length = new_len;
+   }
+
+   submit->wait_timelines[submit->wait_timeline_count] = timeline;
+   submit->wait_timeline_values[submit->wait_timeline_count] = value;
+
+   submit->wait_timeline_count++;
+
+   return VK_SUCCESS;
+}
+
+/* Append (timeline, value) to the submit's timeline signal list, growing
+ * the parallel 'signal_timelines'/'signal_timeline_values' arrays on
+ * demand.  Caller holds the device submit lock.
+ */
+static VkResult
+tu_queue_submit_add_timeline_signal_locked(struct tu_queue_submit* submit,
+                                           struct tu_device *device,
+                                           struct tu_syncobj *timeline,
+                                           uint64_t value)
+{
+   if (submit->signal_timeline_count >= submit->signal_timeline_array_length) {
+      uint32_t new_len = MAX2(submit->signal_timeline_array_length * 2, 32);
+
+      /* Grow through temporaries: overwriting the submit's pointer with a
+       * failed realloc result would leak the old array, and freeing the
+       * live array here would leave a dangling pointer for
+       * tu_queue_submit_free to double-free.
+       */
+      struct tu_syncobj **new_timelines = vk_realloc(&device->vk.alloc,
+            submit->signal_timelines,
+            new_len * sizeof(*submit->signal_timelines),
+            8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+      if (new_timelines == NULL)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+      submit->signal_timelines = new_timelines;
+
+      uint64_t *new_values = vk_realloc(&device->vk.alloc,
+            submit->signal_timeline_values,
+            new_len * sizeof(*submit->signal_timeline_values),
+            8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+      if (new_values == NULL)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+      submit->signal_timeline_values = new_values;
+      submit->signal_timeline_array_length = new_len;
+   }
+
+   submit->signal_timelines[submit->signal_timeline_count] = timeline;
+   submit->signal_timeline_values[submit->signal_timeline_count] = value;
+
+   submit->signal_timeline_count++;
+
+   return VK_SUCCESS;
+}
+
+/* Build a tu_queue_submit from a VkSubmitInfo: capture wait/signal
+ * semaphores, record timeline wait/signal values from the chained
+ * VkTimelineSemaphoreSubmitInfoKHR, and allocate the cmd/syncobj arrays
+ * that tu_queue_submit_locked will pass to the kernel.
+ * Caller holds the device submit lock.
+ */
+static VkResult
+tu_queue_submit_create_locked(struct tu_queue *queue,
+                              const VkSubmitInfo *submit_info,
+                              const uint32_t entry_count,
+                              const uint32_t nr_in_syncobjs,
+                              const uint32_t nr_out_syncobjs,
+                              const bool last_submit,
+                              struct tu_queue_submit **submit)
+{
+   VkResult result;
+
+   const VkTimelineSemaphoreSubmitInfoKHR *timeline_info =
+         vk_find_struct_const(submit_info->pNext,
+                              TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR);
+
+   const uint32_t wait_values_count =
+         timeline_info ? timeline_info->waitSemaphoreValueCount : 0;
+   const uint32_t signal_values_count =
+         timeline_info ? timeline_info->signalSemaphoreValueCount : 0;
+
+   const uint64_t *wait_values =
+         wait_values_count ? timeline_info->pWaitSemaphoreValues : NULL;
+   const uint64_t *signal_values =
+         signal_values_count ?  timeline_info->pSignalSemaphoreValues : NULL;
+
+   struct tu_queue_submit *new_submit = vk_zalloc(&queue->device->vk.alloc,
+               sizeof(*new_submit), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+   /* vk_zalloc can fail; dereferencing the result unchecked would crash. */
+   if (new_submit == NULL)
+      return vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   new_submit->wait_semaphores = vk_zalloc(&queue->device->vk.alloc,
+         submit_info->waitSemaphoreCount * sizeof(*new_submit->wait_semaphores),
+         8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+   /* A zero-byte request may legitimately return NULL; only treat NULL as
+    * OOM when something was actually requested (same below).
+    */
+   if (new_submit->wait_semaphores == NULL && submit_info->waitSemaphoreCount) {
+      result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail_wait_semaphores;
+   }
+   new_submit->wait_semaphore_count = submit_info->waitSemaphoreCount;
+
+   new_submit->signal_semaphores = vk_zalloc(&queue->device->vk.alloc,
+         submit_info->signalSemaphoreCount *sizeof(*new_submit->signal_semaphores),
+         8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+   if (new_submit->signal_semaphores == NULL && submit_info->signalSemaphoreCount) {
+      result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail_signal_semaphores;
+   }
+   new_submit->signal_semaphore_count = submit_info->signalSemaphoreCount;
+
+   /* Timeline waits additionally record their wait value so the deferred
+    * machinery knows when this submit becomes ready.
+    */
+   for (uint32_t i = 0; i < submit_info->waitSemaphoreCount; i++) {
+      TU_FROM_HANDLE(tu_syncobj, sem, submit_info->pWaitSemaphores[i]);
+      new_submit->wait_semaphores[i] = sem;
+
+      if (sem->type == TU_SEMAPHORE_TIMELINE) {
+         result = tu_queue_submit_add_timeline_wait_locked(new_submit,
+               queue->device, sem, wait_values[i]);
+         if (result != VK_SUCCESS)
+            goto fail_wait_timelines;
+      }
+   }
+
+   for (uint32_t i = 0; i < submit_info->signalSemaphoreCount; i++) {
+      TU_FROM_HANDLE(tu_syncobj, sem, submit_info->pSignalSemaphores[i]);
+      new_submit->signal_semaphores[i] = sem;
+
+      if (sem->type == TU_SEMAPHORE_TIMELINE) {
+         result = tu_queue_submit_add_timeline_signal_locked(new_submit,
+               queue->device, sem, signal_values[i]);
+         if (result != VK_SUCCESS)
+            goto fail_signal_timelines;
+      }
+   }
+
+   new_submit->cmds = vk_zalloc(&queue->device->vk.alloc,
+         entry_count * sizeof(*new_submit->cmds), 8,
+         VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+   if (new_submit->cmds == NULL && entry_count) {
+      result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail_cmds;
+   }
+
+   /* Allocate without wait timeline semaphores: those gate the deferred
+    * submit in userspace and are not handed to the kernel as in-syncobjs.
+    */
+   new_submit->in_syncobjs = vk_zalloc(&queue->device->vk.alloc,
+         (nr_in_syncobjs - new_submit->wait_timeline_count) *
+         sizeof(*new_submit->in_syncobjs), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+   if (new_submit->in_syncobjs == NULL &&
+       nr_in_syncobjs != new_submit->wait_timeline_count) {
+      result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail_in_syncobjs;
+   }
+
+   /* Allocate with signal timeline semaphores considered; their syncobjs
+    * are filled in at real submit time (tu_queue_submit_timeline_locked).
+    */
+   new_submit->out_syncobjs = vk_zalloc(&queue->device->vk.alloc,
+         nr_out_syncobjs * sizeof(*new_submit->out_syncobjs), 8,
+         VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+   if (new_submit->out_syncobjs == NULL && nr_out_syncobjs) {
+      result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      goto fail_out_syncobjs;
+   }
+
+   new_submit->entry_count = entry_count;
+   new_submit->nr_in_syncobjs = nr_in_syncobjs;
+   new_submit->nr_out_syncobjs = nr_out_syncobjs;
+   new_submit->last_submit = last_submit;
+   list_inithead(&new_submit->link);
+
+   *submit = new_submit;
+
+   return VK_SUCCESS;
+
+fail_out_syncobjs:
+   vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs);
+fail_in_syncobjs:
+   vk_free(&queue->device->vk.alloc, new_submit->cmds);
+fail_cmds:
+fail_signal_timelines:
+fail_wait_timelines:
+   vk_free(&queue->device->vk.alloc, new_submit->signal_semaphores);
+fail_signal_semaphores:
+   vk_free(&queue->device->vk.alloc, new_submit->wait_semaphores);
+fail_wait_semaphores:
+   /* NOTE(review): any partially grown timeline arrays are owned by the
+    * add_timeline helpers' realloc bookkeeping; they are not freed here.
+    */
+   vk_free(&queue->device->vk.alloc, new_submit);
+   return result;
+}
+
+/* Release every allocation owned by a tu_queue_submit, including the
+ * submit struct itself.  Does not touch the semaphores it points at.
+ */
+static void
+tu_queue_submit_free(struct tu_queue *queue, struct tu_queue_submit *submit)
+{
+   vk_free(&queue->device->vk.alloc, submit->wait_semaphores);
+   vk_free(&queue->device->vk.alloc, submit->signal_semaphores);
+
+   vk_free(&queue->device->vk.alloc, submit->wait_timelines);
+   vk_free(&queue->device->vk.alloc, submit->wait_timeline_values);
+   vk_free(&queue->device->vk.alloc, submit->signal_timelines);
+   vk_free(&queue->device->vk.alloc, submit->signal_timeline_values);
+
+   vk_free(&queue->device->vk.alloc, submit->cmds);
+   vk_free(&queue->device->vk.alloc, submit->in_syncobjs);
+   vk_free(&queue->device->vk.alloc, submit->out_syncobjs);
+   vk_free(&queue->device->vk.alloc, submit);
+}
+
+/* Perform the actual kernel submission for a (now ready) queued submit.
+ * Called with the device submit lock held; takes bo_mutex around the ioctl.
+ */
+static VkResult
+tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
+{
+   uint32_t flags = MSM_PIPE_3D0;
+
+   if (submit->nr_in_syncobjs)
+      flags |= MSM_SUBMIT_SYNCOBJ_IN;
+
+   if (submit->nr_out_syncobjs)
+      flags |= MSM_SUBMIT_SYNCOBJ_OUT;
+
+   /* Only the batch's last submit asks for a fence fd (see queue->fence). */
+   if (submit->last_submit)
+      flags |= MSM_SUBMIT_FENCE_FD_OUT;
+
+   mtx_lock(&queue->device->bo_mutex);
+
+   struct drm_msm_gem_submit req = {
+      .flags = flags,
+      .queueid = queue->msm_queue_id,
+      .bos = (uint64_t)(uintptr_t) queue->device->bo_list,
+      .nr_bos = queue->device->bo_count,
+      .cmds = (uint64_t)(uintptr_t)submit->cmds,
+      .nr_cmds = submit->entry_count,
+      .in_syncobjs = (uint64_t)(uintptr_t)submit->in_syncobjs,
+      .out_syncobjs = (uint64_t)(uintptr_t)submit->out_syncobjs,
+      /* Timeline waits were satisfied in userspace before this point, so
+       * they are excluded from the kernel's in-syncobj count.
+       */
+      .nr_in_syncobjs = submit->nr_in_syncobjs - submit->wait_timeline_count,
+      .nr_out_syncobjs = submit->nr_out_syncobjs,
+      .syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj),
+   };
+
+   int ret = drmCommandWriteRead(queue->device->fd,
+                                 DRM_MSM_GEM_SUBMIT,
+                                 &req, sizeof(req));
+
+   mtx_unlock(&queue->device->bo_mutex);
+
+   if (ret)
+      return tu_device_set_lost(queue->device, "submit failed: %s\n",
+                                strerror(errno));
+
+   /* restore permanent payload on wait */
+   for (uint32_t i = 0; i < submit->wait_semaphore_count; i++) {
+      /* NOTE(review): wait_semaphores already stores tu_syncobj pointers,
+       * so this TU_FROM_HANDLE conversion appears redundant — confirm.
+       */
+      TU_FROM_HANDLE(tu_syncobj, sem, submit->wait_semaphores[i]);
+      if(sem->type == TU_SEMAPHORE_BINARY)
+         sync_set_temporary(queue->device, sem, 0);
+   }
+
+   if (submit->last_submit) {
+      if (queue->fence >= 0)
+         close(queue->fence);
+      queue->fence = req.fence_fd;
+   }
+
+   /* Update highest_submitted values in the timeline. */
+   for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
+      struct tu_syncobj *sem = submit->signal_timelines[i];
+      uint64_t signal_value = submit->signal_timeline_values[i];
+
+      /* Timeline payloads must be strictly increasing. */
+      assert(signal_value > sem->timeline.highest_submitted);
+
+      sem->timeline.highest_submitted = signal_value;
+   }
+
+   /* Wake threads blocked in tu_QueueWaitIdle waiting on queued submits. */
+   pthread_cond_broadcast(&queue->device->timeline_cond);
+
+   return VK_SUCCESS;
+}
+
+
+/* A queued submit is ready once every timeline it waits on has had its
+ * wait value submitted (tracked via timeline.highest_submitted).
+ */
+static bool
+tu_queue_submit_ready_locked(struct tu_queue_submit *submit)
+{
+   for (uint32_t i = 0; i < submit->wait_timeline_count; i++) {
+      if (submit->wait_timeline_values[i] >
+            submit->wait_timelines[i]->timeline.highest_submitted) {
+         return false;
+      }
+   }
+
+   return true;
+}
+
+/* Acquire a tu_timeline_point for 'value' on 'timeline': recycle one from
+ * free_points when available, otherwise allocate a point with a freshly
+ * created DRM syncobj.  The point is appended to the timeline's pending
+ * 'points' list.
+ */
+static VkResult
+tu_timeline_add_point_locked(struct tu_device *device,
+                             struct tu_timeline *timeline,
+                             uint64_t value,
+                             struct tu_timeline_point **point)
+{
+
+   if (list_is_empty(&timeline->free_points)) {
+      *point = vk_zalloc(&device->vk.alloc, sizeof(**point), 8,
+            VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+      if (!(*point))
+         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+      struct drm_syncobj_create create = {};
+
+      int ret = ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &create);
+      if (ret) {
+         vk_free(&device->vk.alloc, *point);
+         return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+      }
+
+      (*point)->syncobj = create.handle;
+
+   } else {
+      /* Reuse a recycled point (and its existing syncobj). */
+      *point = list_first_entry(&timeline->free_points,
+                                struct tu_timeline_point, link);
+      list_del(&(*point)->link);
+   }
+
+   (*point)->value = value;
+   list_addtail(&(*point)->link, &timeline->points);
+
+   return VK_SUCCESS;
+}
+
+/* Materialize syncobjs for this submit's timeline signals (their slots
+ * were left empty at QueueSubmit time) and then do the real submission.
+ */
+static VkResult
+tu_queue_submit_timeline_locked(struct tu_queue *queue,
+                                struct tu_queue_submit *submit)
+{
+   VkResult result;
+   /* Timeline signal slots occupy the tail of the out_syncobjs array. */
+   uint32_t timeline_idx =
+         submit->nr_out_syncobjs - submit->signal_timeline_count;
+
+   for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
+      struct tu_timeline *timeline = &submit->signal_timelines[i]->timeline;
+      uint64_t signal_value = submit->signal_timeline_values[i];
+      struct tu_timeline_point *point;
+
+      result = tu_timeline_add_point_locked(queue->device, timeline,
+            signal_value, &point);
+      if (result != VK_SUCCESS)
+         return result;
+
+      submit->out_syncobjs[timeline_idx + i] =
+         (struct drm_msm_gem_submit_syncobj) {
+            .handle = point->syncobj,
+            .flags = 0,
+         };
+   }
+
+   return tu_queue_submit_locked(queue, submit);
+}
+
+/* Flush this queue's deferred submits in FIFO order, stopping at the
+ * first one whose timeline waits are not yet satisfied.  '*advance' is
+ * incremented for each submit processed so the caller can detect
+ * progress.  Caller holds device->submit_mutex.
+ */
+static VkResult
+tu_queue_submit_deferred_locked(struct tu_queue *queue, uint32_t *advance)
+{
+   VkResult result = VK_SUCCESS;
+
+   list_for_each_entry_safe(struct tu_queue_submit, submit,
+                            &queue->queued_submits, link) {
+      if (!tu_queue_submit_ready_locked(submit))
+         break;
+
+      (*advance)++;
+
+      result = tu_queue_submit_timeline_locked(queue, submit);
+
+      /* The submit is consumed (and freed) whether it succeeded or not. */
+      list_del(&submit->link);
+      tu_queue_submit_free(queue, submit);
+
+      if (result != VK_SUCCESS)
+         break;
+   }
+
+   return result;
+}
+
+/* Repeatedly flush deferred submits across queues until a full pass makes
+ * no progress (a signal on one queue may unblock another).  Caller holds
+ * device->submit_mutex.
+ *
+ * NOTE(review): only queue family 0 (dev->queues[0]) is walked here —
+ * confirm no other family can hold deferred submits.
+ */
+VkResult
+tu_device_submit_deferred_locked(struct tu_device *dev)
+{
+    VkResult result = VK_SUCCESS;
+
+    uint32_t advance = 0;
+    do {
+       advance = 0;
+       for (uint32_t i = 0; i < dev->queue_count[0]; i++) {
+          /* Try again if there's signaled submission. */
+          result = tu_queue_submit_deferred_locked(&dev->queues[0][i],
+                &advance);
+          if (result != VK_SUCCESS)
+             return result;
+       }
+
+    } while(advance);
+
+    return result;
+}
+
 VkResult
 tu_QueueSubmit(VkQueue _queue,
                uint32_t submitCount,
@@ -659,13 +1181,39 @@ tu_QueueSubmit(VkQueue _queue,
 
       if (last_submit && fence)
          out_syncobjs_size += 1;
+
+      uint32_t entry_count = 0;
+      for (uint32_t j = 0; j < submit->commandBufferCount; ++j) {
+         TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBuffers[j]);
+
+         if (perf_info)
+            entry_count++;
+
+         entry_count += cmdbuf->cs.entry_count;
+      }
+
+      pthread_mutex_lock(&queue->device->submit_mutex);
+      struct tu_queue_submit *submit_req = NULL;
+
+      VkResult ret = tu_queue_submit_create_locked(queue, submit,
+            entry_count, submit->waitSemaphoreCount, out_syncobjs_size,
+            last_submit, &submit_req);
+
+      if (ret != VK_SUCCESS) {
+         pthread_mutex_unlock(&queue->device->submit_mutex);
+         return ret;
+      }
+
       /* note: assuming there won't be any very large semaphore counts */
-      struct drm_msm_gem_submit_syncobj in_syncobjs[submit->waitSemaphoreCount];
-      struct drm_msm_gem_submit_syncobj out_syncobjs[out_syncobjs_size];
+      struct drm_msm_gem_submit_syncobj *in_syncobjs = submit_req->in_syncobjs;
+      struct drm_msm_gem_submit_syncobj *out_syncobjs = submit_req->out_syncobjs;
       uint32_t nr_in_syncobjs = 0, nr_out_syncobjs = 0;
 
       for (uint32_t i = 0; i < submit->waitSemaphoreCount; i++) {
          TU_FROM_HANDLE(tu_syncobj, sem, submit->pWaitSemaphores[i]);
+         if (sem->type == TU_SEMAPHORE_TIMELINE)
+            continue;
+
          in_syncobjs[nr_in_syncobjs++] = (struct drm_msm_gem_submit_syncobj) {
             .handle = sem->binary.temporary ?: sem->binary.permanent,
             .flags = MSM_SUBMIT_SYNCOBJ_RESET,
@@ -674,6 +1222,13 @@ tu_QueueSubmit(VkQueue _queue,
 
       for (uint32_t i = 0; i < submit->signalSemaphoreCount; i++) {
          TU_FROM_HANDLE(tu_syncobj, sem, submit->pSignalSemaphores[i]);
+
+         /* In case of timeline semaphores, we can defer the creation of syncobj
+          * and adding it at real submit time.
+          */
+         if (sem->type == TU_SEMAPHORE_TIMELINE)
+            continue;
+
          out_syncobjs[nr_out_syncobjs++] = (struct drm_msm_gem_submit_syncobj) {
             .handle = sem->binary.temporary ?: sem->binary.permanent,
             .flags = 0,
@@ -687,19 +1242,8 @@ tu_QueueSubmit(VkQueue _queue,
          };
       }
 
-      uint32_t entry_count = 0;
-      for (uint32_t j = 0; j < submit->commandBufferCount; ++j) {
-         TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBuffers[j]);
+      struct drm_msm_gem_submit_cmd *cmds = submit_req->cmds;
 
-         if (perf_info)
-            entry_count++;
-
-         entry_count += cmdbuf->cs.entry_count;
-      }
-
-      mtx_lock(&queue->device->bo_mutex);
-
-      struct drm_msm_gem_submit_cmd cmds[entry_count];
       uint32_t entry_idx = 0;
       for (uint32_t j = 0; j < submit->commandBufferCount; ++j) {
          TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->pCommandBuffers[j]);
@@ -730,51 +1274,13 @@ tu_QueueSubmit(VkQueue _queue,
          }
       }
 
-      uint32_t flags = MSM_PIPE_3D0;
-      if (nr_in_syncobjs) {
-         flags |= MSM_SUBMIT_SYNCOBJ_IN;
-      }
-      if (nr_out_syncobjs) {
-         flags |= MSM_SUBMIT_SYNCOBJ_OUT;
-      }
-      if (last_submit) {
-         flags |= MSM_SUBMIT_FENCE_FD_OUT;
-      }
-
-      struct drm_msm_gem_submit req = {
-         .flags = flags,
-         .queueid = queue->msm_queue_id,
-         .bos = (uint64_t)(uintptr_t) queue->device->bo_list,
-         .nr_bos = queue->device->bo_count,
-         .cmds = (uint64_t)(uintptr_t)cmds,
-         .nr_cmds = entry_count,
-         .in_syncobjs = (uint64_t)(uintptr_t)in_syncobjs,
-         .out_syncobjs = (uint64_t)(uintptr_t)out_syncobjs,
-         .nr_in_syncobjs = nr_in_syncobjs,
-         .nr_out_syncobjs = nr_out_syncobjs,
-         .syncobj_stride = sizeof(struct drm_msm_gem_submit_syncobj),
-      };
-
-      int ret = drmCommandWriteRead(queue->device->fd,
-                                    DRM_MSM_GEM_SUBMIT,
-                                    &req, sizeof(req));
-      mtx_unlock(&queue->device->bo_mutex);
-      if (ret) {
-         return tu_device_set_lost(queue->device, "submit failed: %s\n",
-                                   strerror(errno));
-      }
-
-      /* restore permanent payload on wait */
-      for (uint32_t i = 0; i < submit->waitSemaphoreCount; i++) {
-         TU_FROM_HANDLE(tu_syncobj, sem, submit->pWaitSemaphores[i]);
-         sync_set_temporary(queue->device, sem, 0);
-      }
+      /* Queue the current submit */
+      list_addtail(&submit_req->link, &queue->queued_submits);
+      ret = tu_device_submit_deferred_locked(queue->device);
 
-      if (last_submit) {
-         if (queue->fence >= 0)
-            close(queue->fence);
-         queue->fence = req.fence_fd;
-      }
+      pthread_mutex_unlock(&queue->device->submit_mutex);
+      if (ret != VK_SUCCESS)
+          return ret;
    }
 
    if (!submitCount && fence) {
@@ -794,7 +1300,7 @@ tu_CreateFence(VkDevice device,
                const VkAllocationCallbacks *pAllocator,
                VkFence *pFence)
 {
-   return sync_create(device, info->flags & VK_FENCE_CREATE_SIGNALED_BIT, true, true,
+   return sync_create(device, info->flags & VK_FENCE_CREATE_SIGNALED_BIT, true, true, 0,
                       pAllocator, (void**) pFence);
 }
 
@@ -954,6 +1460,200 @@ tu_syncobj_to_fd(struct tu_device *device, struct tu_syncobj *sync)
    return ret ? -1 : handle.fd;
 }
 
+/* Garbage-collect signaled points on a timeline: walk the pending-point
+ * list in submission order, poll each submitted point's syncobj (zero
+ * timeout), and on signal advance highest_signaled and recycle the point
+ * onto free_points.  Caller must hold the device's submit_mutex.
+ */
+static VkResult
+tu_timeline_gc_locked(struct tu_device *dev, struct tu_timeline *timeline)
+{
+   VkResult result = VK_SUCCESS;
+
+   /* Go through every point in the timeline and check for signaled points. */
+   list_for_each_entry_safe(struct tu_timeline_point, point,
+                            &timeline->points, link) {
+
+      /* If the value of the point is higher than highest_submitted,
+       * the point has not been submitted yet.  Also stop at any point
+       * another thread is currently waiting on (wait_count pins it so
+       * it cannot be recycled underneath that waiter).
+       */
+      if (point->wait_count || point->value > timeline->highest_submitted)
+         return VK_SUCCESS;
+
+      /* Zero timeout: this is a pure poll of the syncobj's state. */
+      result = drm_syncobj_wait(dev, (uint32_t[]){point->syncobj}, 1, 0, true);
+
+      if (result == VK_TIMEOUT) {
+         /* This means the syncobj is still busy and it should wait
+          * with timeout specified by users via vkWaitSemaphores.
+          */
+         result = VK_SUCCESS;
+      } else {
+         /* NOTE(review): a genuine error from drm_syncobj_wait also takes
+          * this branch and recycles the point as if signaled; the error is
+          * still returned to the caller — presumably intended, but worth
+          * confirming.
+          */
+         timeline->highest_signaled =
+               MAX2(timeline->highest_signaled, point->value);
+         list_del(&point->link);
+         list_add(&point->link, &timeline->free_points);
+      }
+   }
+
+   return result;
+}
+
+
+/* Block until `timeline`'s signaled payload reaches `value`, or until the
+ * absolute deadline `abs_timeout` (nanoseconds, compared against
+ * os_time_get_nano()) expires.  Called with device->submit_mutex held; the
+ * mutex is temporarily dropped while waiting on a syncobj and reacquired
+ * afterwards.  Returns VK_SUCCESS, VK_TIMEOUT, or an error from
+ * drm_syncobj_wait.
+ */
+static VkResult
+tu_timeline_wait_locked(struct tu_device *device,
+                        struct tu_timeline *timeline,
+                        uint64_t value,
+                        uint64_t abs_timeout)
+{
+   VkResult result;
+
+   /* First wait until the requested point has actually been submitted to
+    * the kernel; submitters broadcast timeline_cond when that advances.
+    */
+   while(timeline->highest_submitted < value) {
+      struct timespec abstime;
+      timespec_from_nsec(&abstime, abs_timeout);
+
+      pthread_cond_timedwait(&device->timeline_cond, &device->submit_mutex,
+            &abstime);
+
+      /* Re-check the condition after wakeup: condition-variable wakeups
+       * can be spurious, so only give up once the deadline has really
+       * passed and the value is still unsubmitted.
+       */
+      if (os_time_get_nano() >= abs_timeout &&
+            timeline->highest_submitted < value)
+         return VK_TIMEOUT;
+   }
+
+   /* Visit every point in the timeline and wait until
+    * the highest_signaled reaches the value.
+    */
+   while (1) {
+      result = tu_timeline_gc_locked(device, timeline);
+      if (result != VK_SUCCESS)
+         return result;
+
+      if (timeline->highest_signaled >= value)
+          return VK_SUCCESS;
+
+      /* Wait on the oldest outstanding point.  Bumping wait_count pins it
+       * so the GC above cannot recycle it while the mutex is dropped.
+       */
+      struct tu_timeline_point *point =
+            list_first_entry(&timeline->points,
+                             struct tu_timeline_point, link);
+
+      point->wait_count++;
+      pthread_mutex_unlock(&device->submit_mutex);
+      result = drm_syncobj_wait(device, (uint32_t[]){point->syncobj}, 1,
+                                abs_timeout, true);
+
+      pthread_mutex_lock(&device->submit_mutex);
+      point->wait_count--;
+
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   /* Unreachable: the loop above always returns. */
+   return result;
+}
+
+/* Core of vkWaitSemaphores for timeline semaphores.  abs_timeout is an
+ * absolute deadline in nanoseconds (see absolute_timeout()).
+ */
+static VkResult
+tu_wait_timelines(struct tu_device *device,
+                  const VkSemaphoreWaitInfoKHR* pWaitInfo,
+                  uint64_t abs_timeout)
+{
+   if ((pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR) &&
+         pWaitInfo->semaphoreCount > 1) {
+      pthread_mutex_lock(&device->submit_mutex);
+
+      /* WAIT_ANY: repeatedly poll every timeline semaphore (each probe uses
+       * a zero deadline, so tu_timeline_wait_locked returns immediately)
+       * until one of them reaches its value or the deadline expires.
+       */
+      while (1) {
+         for(uint32_t i = 0; i < pWaitInfo->semaphoreCount; ++i) {
+            TU_FROM_HANDLE(tu_syncobj, semaphore, pWaitInfo->pSemaphores[i]);
+            VkResult result = tu_timeline_wait_locked(device,
+                  &semaphore->timeline, pWaitInfo->pValues[i], 0);
+
+            /* Returns result values including VK_SUCCESS except for VK_TIMEOUT */
+            if (result != VK_TIMEOUT) {
+               pthread_mutex_unlock(&device->submit_mutex);
+               return result;
+            }
+         }
+
+         if (os_time_get_nano() > abs_timeout) {
+            pthread_mutex_unlock(&device->submit_mutex);
+            return VK_TIMEOUT;
+         }
+      }
+   } else {
+      /* WAIT_ALL (or a single semaphore): wait for each semaphore in turn;
+       * all must reach their target value before the shared deadline.
+       */
+      VkResult result = VK_SUCCESS;
+
+      pthread_mutex_lock(&device->submit_mutex);
+      for(uint32_t i = 0; i < pWaitInfo->semaphoreCount; ++i) {
+         TU_FROM_HANDLE(tu_syncobj, semaphore, pWaitInfo->pSemaphores[i]);
+         assert(semaphore->type == TU_SEMAPHORE_TIMELINE);
+
+         result = tu_timeline_wait_locked(device, &semaphore->timeline,
+               pWaitInfo->pValues[i], abs_timeout);
+         if (result != VK_SUCCESS)
+            break;
+      }
+      pthread_mutex_unlock(&device->submit_mutex);
+
+      return result;
+   }
+}
+
+
+/* vkGetSemaphoreCounterValue: garbage-collect any newly signaled points
+ * under the submit mutex, then report the highest payload observed as
+ * signaled on this timeline.
+ */
+VkResult
+tu_GetSemaphoreCounterValue(VkDevice _device,
+                                 VkSemaphore _semaphore,
+                                 uint64_t* pValue)
+{
+   TU_FROM_HANDLE(tu_device, device, _device);
+   TU_FROM_HANDLE(tu_syncobj, semaphore, _semaphore);
+
+   /* Only valid for timeline semaphores; binary semaphores have no payload. */
+   assert(semaphore->type == TU_SEMAPHORE_TIMELINE);
+
+   VkResult result;
+
+   pthread_mutex_lock(&device->submit_mutex);
+
+   result = tu_timeline_gc_locked(device, &semaphore->timeline);
+   *pValue = semaphore->timeline.highest_signaled;
+
+   pthread_mutex_unlock(&device->submit_mutex);
+
+   return result;
+}
+
+
+/* vkWaitSemaphores entry point: convert the caller's relative timeout into
+ * an absolute deadline and delegate to tu_wait_timelines.
+ */
+VkResult
+tu_WaitSemaphores(VkDevice _device,
+                  const VkSemaphoreWaitInfoKHR* pWaitInfo,
+                  uint64_t timeout)
+{
+   TU_FROM_HANDLE(tu_device, device, _device);
+
+   return tu_wait_timelines(device, pWaitInfo, absolute_timeout(timeout));
+}
+
+/* vkSignalSemaphore: signal a timeline semaphore from the host.  Advances
+ * the timeline's payload, flushes any deferred submits the new value may
+ * unblock, and wakes threads sleeping in tu_timeline_wait_locked.
+ */
+VkResult
+tu_SignalSemaphore(VkDevice _device,
+                   const VkSemaphoreSignalInfoKHR* pSignalInfo)
+{
+   TU_FROM_HANDLE(tu_device, device, _device);
+   TU_FROM_HANDLE(tu_syncobj, semaphore, pSignalInfo->semaphore);
+   VkResult result;
+
+   assert(semaphore->type == TU_SEMAPHORE_TIMELINE);
+
+   pthread_mutex_lock(&device->submit_mutex);
+
+   /* Retire already-signaled points first so the direct payload update
+    * below does not leave stale pending points ahead of it.
+    */
+   result = tu_timeline_gc_locked(device, &semaphore->timeline);
+   if (result != VK_SUCCESS) {
+      pthread_mutex_unlock(&device->submit_mutex);
+      return result;
+   }
+
+   /* Set both counters directly; the Vulkan spec requires the signal value
+    * to be strictly greater than the current payload, so this cannot
+    * regress the timeline (no MAX2 needed — assuming a valid application).
+    */
+   semaphore->timeline.highest_submitted = pSignalInfo->value;
+   semaphore->timeline.highest_signaled = pSignalInfo->value;
+
+   /* The new payload may satisfy wait points of queued submits. */
+   result = tu_device_submit_deferred_locked(device);
+
+   pthread_cond_broadcast(&device->timeline_cond);
+   pthread_mutex_unlock(&device->submit_mutex);
+
+   return result;
+}
+
 #ifdef ANDROID
 #include <libsync.h>
 
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index f8fd48aea27..ea70b5e4547 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -299,6 +299,9 @@ struct tu_queue
 
    uint32_t msm_queue_id;
    int fence;
+
+   /* Queue containing deferred submits */
+   struct list_head queued_submits;
 };
 
 struct tu_bo
@@ -399,6 +402,11 @@ struct tu_device
    /* Command streams to set pass index to a scratch reg */
    struct tu_cs *perfcntrs_pass_cs;
    struct tu_cs_entry *perfcntrs_pass_cs_entries;
+
+   /* Condition variable for timeline semaphore to notify waiters when a
+    * new submit is executed. */
+   pthread_cond_t timeline_cond;
+   pthread_mutex_t submit_mutex;
 };
 
 VkResult _tu_device_set_lost(struct tu_device *device,
@@ -412,6 +420,9 @@ tu_device_is_lost(struct tu_device *device)
    return unlikely(p_atomic_read(&device->_lost));
 }
 
+VkResult
+tu_device_submit_deferred_locked(struct tu_device *dev);
+
 VkResult
 tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size, bool dump);
 VkResult



More information about the mesa-commit mailing list