Mesa (master): radv: Add timelines with a VK_KHR_timeline_semaphore impl.

Wed Oct 30 12:07:32 UTC 2019

Module: Mesa
Branch: master
Commit: 88d41367b8aee961e6c47173a1e8848009e2215a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=88d41367b8aee961e6c47173a1e8848009e2215a

Author: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Date:   Tue Oct 22 10:18:06 2019 +0200

radv: Add timelines with a VK_KHR_timeline_semaphore impl.

This does not fully do wait-before-submit, to be done in a follow
up patch.

For kernels without support for timeline syncobjs, this adds an
implementation of non-shareable timelines using legacy syncobjs.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>

---

 src/amd/vulkan/radv_device.c      | 532 +++++++++++++++++++++++++++++++++-----
 src/amd/vulkan/radv_extensions.py |   1 +
 src/amd/vulkan/radv_private.h     |  31 +++
 3 files changed, 504 insertions(+), 60 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 5d46c609bde..5b27961e50d 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -60,9 +60,21 @@
 #include "util/build_id.h"
 #include "util/debug.h"
 #include "util/mesa-sha1.h"
+#include "util/timespec.h"
 #include "compiler/glsl_types.h"
 #include "util/xmlpool.h"
 
+static struct radv_timeline_point *
+radv_timeline_find_point_at_least_locked(struct radv_device *device,
+                                         struct radv_timeline *timeline,
+                                         uint64_t p);
+
+static struct radv_timeline_point *
+radv_timeline_add_point_locked(struct radv_device *device,
+                               struct radv_timeline *timeline,
+                               uint64_t p);
+
+
 static
 void radv_destroy_semaphore_part(struct radv_device *device,
                                  struct radv_semaphore_part *part);
@@ -2276,7 +2288,26 @@ static VkResult fork_secure_compile_device(struct radv_device *device)
 			}
 		}
 	}
+	return VK_SUCCESS;
+}
+
+static VkResult
+radv_create_pthread_cond(pthread_cond_t *cond)
+{
+	pthread_condattr_t condattr;
+	if (pthread_condattr_init(&condattr)) {
+		return VK_ERROR_INITIALIZATION_FAILED;
+	}
 
+	if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC)) {
+		pthread_condattr_destroy(&condattr);
+		return VK_ERROR_INITIALIZATION_FAILED;
+	}
+	if (pthread_cond_init(cond, &condattr)) {
+		pthread_condattr_destroy(&condattr);
+		return VK_ERROR_INITIALIZATION_FAILED;
+	}
+	pthread_condattr_destroy(&condattr);
 	return VK_SUCCESS;
 }
 
@@ -2479,6 +2510,10 @@ VkResult radv_CreateDevice(
 
 	device->mem_cache = radv_pipeline_cache_from_handle(pc);
 
+	result = radv_create_pthread_cond(&device->timeline_cond);
+	if (result != VK_SUCCESS)
+		goto fail_mem_cache;
+
 	device->force_aniso =
 		MIN2(16, radv_get_int_debug_option("RADV_TEX_ANISO", -1));
 	if (device->force_aniso >= 0) {
@@ -2497,6 +2532,8 @@ VkResult radv_CreateDevice(
 	*pDevice = radv_device_to_handle(device);
 	return VK_SUCCESS;
 
+fail_mem_cache:
+	radv_DestroyPipelineCache(radv_device_to_handle(device), pc, NULL);
 fail_meta:
 	radv_device_finish_meta(device);
 fail:
@@ -2549,6 +2586,7 @@ void radv_DestroyDevice(
 
 	radv_destroy_shader_slabs(device);
 
+	pthread_cond_destroy(&device->timeline_cond);
 	radv_bo_list_finish(&device->bo_list);
 
 	if (radv_device_use_secure_compile(device->instance)) {
@@ -3404,11 +3442,13 @@ fail:
 	return vk_error(queue->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 }
 
-static VkResult radv_alloc_sem_counts(struct radv_instance *instance,
+static VkResult radv_alloc_sem_counts(struct radv_device *device,
 				      struct radv_winsys_sem_counts *counts,
 				      int num_sems,
 				      struct radv_semaphore_part **sems,
-				      VkFence _fence)
+				      const uint64_t *timeline_values,
+				      VkFence _fence,
+				      bool is_signal)
 {
 	int syncobj_idx = 0, sem_idx = 0;
 
@@ -3416,10 +3456,19 @@ static VkResult radv_alloc_sem_counts(struct radv_instance *instance,
 		return VK_SUCCESS;
 
 	for (uint32_t i = 0; i < num_sems; i++) {
-		if(sems[i]->kind == RADV_SEMAPHORE_SYNCOBJ)
+		switch(sems[i]->kind) {
+		case RADV_SEMAPHORE_SYNCOBJ:
 			counts->syncobj_count++;
-		else
+			break;
+		case RADV_SEMAPHORE_WINSYS:
 			counts->sem_count++;
+			break;
+		case RADV_SEMAPHORE_NONE:
+			break;
+		case RADV_SEMAPHORE_TIMELINE:
+			counts->syncobj_count++;
+			break;
+		}
 	}
 
 	if (_fence != VK_NULL_HANDLE) {
@@ -3431,14 +3480,14 @@ static VkResult radv_alloc_sem_counts(struct radv_instance *instance,
 	if (counts->syncobj_count) {
 		counts->syncobj = (uint32_t *)malloc(sizeof(uint32_t) * counts->syncobj_count);
 		if (!counts->syncobj)
-			return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}
 
 	if (counts->sem_count) {
 		counts->sem = (struct radeon_winsys_sem **)malloc(sizeof(struct radeon_winsys_sem *) * counts->sem_count);
 		if (!counts->sem) {
 			free(counts->syncobj);
-			return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 	}
 
@@ -3453,6 +3502,26 @@ static VkResult radv_alloc_sem_counts(struct radv_instance *instance,
 		case RADV_SEMAPHORE_WINSYS:
 			counts->sem[sem_idx++] = sems[i]->ws_sem;
 			break;
+		case RADV_SEMAPHORE_TIMELINE: {
+			pthread_mutex_lock(&sems[i]->timeline.mutex);
+			struct radv_timeline_point *point = NULL;
+			if (is_signal) {
+				point = radv_timeline_add_point_locked(device, &sems[i]->timeline, timeline_values[i]);
+			} else {
+				point = radv_timeline_find_point_at_least_locked(device, &sems[i]->timeline, timeline_values[i]);
+			}
+
+			pthread_mutex_unlock(&sems[i]->timeline.mutex);
+
+			if (point) {
+				counts->syncobj[syncobj_idx++] = point->syncobj;
+			} else {
+				/* Explicitly remove the semaphore so we might not find
+				 * a point later post-submit. */
+				sems[i] = NULL;
+			}
+			break;
+		}
 		}
 	}
 
@@ -3464,6 +3533,9 @@ static VkResult radv_alloc_sem_counts(struct radv_instance *instance,
 			counts->syncobj[syncobj_idx++] = fence->syncobj;
 	}
 
+	assert(syncobj_idx <= counts->syncobj_count);
+	counts->syncobj_count = syncobj_idx;
+
 	return VK_SUCCESS;
 }
 
@@ -3487,21 +3559,23 @@ static void radv_free_temp_syncobjs(struct radv_device *device,
 }
 
 static VkResult
-radv_alloc_sem_info(struct radv_instance *instance,
+radv_alloc_sem_info(struct radv_device *device,
 		    struct radv_winsys_sem_info *sem_info,
 		    int num_wait_sems,
 		    struct radv_semaphore_part **wait_sems,
+		    const uint64_t *wait_values,
 		    int num_signal_sems,
 		    struct radv_semaphore_part **signal_sems,
+		    const uint64_t *signal_values,
 		    VkFence fence)
 {
 	VkResult ret;
 	memset(sem_info, 0, sizeof(*sem_info));
 
-	ret = radv_alloc_sem_counts(instance, &sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE);
+	ret = radv_alloc_sem_counts(device, &sem_info->wait, num_wait_sems, wait_sems, wait_values, VK_NULL_HANDLE, false);
 	if (ret)
 		return ret;
-	ret = radv_alloc_sem_counts(instance, &sem_info->signal, num_signal_sems, signal_sems, fence);
+	ret = radv_alloc_sem_counts(device, &sem_info->signal, num_signal_sems, signal_sems, signal_values, fence, true);
 	if (ret)
 		radv_free_sem_info(sem_info);
 
@@ -3512,6 +3586,41 @@ radv_alloc_sem_info(struct radv_instance *instance,
 }
 
 static void
+radv_finalize_timelines(struct radv_device *device,
+                        uint32_t num_wait_sems,
+                        struct radv_semaphore_part **wait_sems,
+                        const uint64_t *wait_values,
+                        uint32_t num_signal_sems,
+                        struct radv_semaphore_part **signal_sems,
+                        const uint64_t *signal_values)
+{
+	for (uint32_t i = 0; i < num_wait_sems; ++i) {
+		if (wait_sems[i] && wait_sems[i]->kind == RADV_SEMAPHORE_TIMELINE) {
+			pthread_mutex_lock(&wait_sems[i]->timeline.mutex);
+			struct radv_timeline_point *point =
+				radv_timeline_find_point_at_least_locked(device, &wait_sems[i]->timeline, wait_values[i]);
+			if (point)
+				--point->wait_count;
+			pthread_mutex_unlock(&wait_sems[i]->timeline.mutex);
+		}
+	}
+	for (uint32_t i = 0; i < num_signal_sems; ++i) {
+		if (signal_sems[i] && signal_sems[i]->kind == RADV_SEMAPHORE_TIMELINE) {
+			pthread_mutex_lock(&signal_sems[i]->timeline.mutex);
+			struct radv_timeline_point *point =
+				radv_timeline_find_point_at_least_locked(device, &signal_sems[i]->timeline, signal_values[i]);
+			if (point) {
+				signal_sems[i]->timeline.highest_submitted =
+					MAX2(signal_sems[i]->timeline.highest_submitted, point->value);
+				point->wait_count--;
+			}
+			pthread_mutex_unlock(&signal_sems[i]->timeline.mutex);
+		}
+	}
+	pthread_cond_broadcast(&device->timeline_cond);
+}
+
+static void
 radv_sparse_buffer_bind_memory(struct radv_device *device,
                                const VkSparseBufferMemoryBindInfo *bind)
 {
@@ -3606,6 +3715,9 @@ struct radv_deferred_queue_submission {
 	uint32_t signal_semaphore_count;
 	VkFence fence;
 
+	uint64_t *wait_values;
+	uint64_t *signal_values;
+
 	struct radv_semaphore_part *temporary_semaphore_parts;
 	uint32_t temporary_semaphore_part_count;
 };
@@ -3627,6 +3739,11 @@ struct radv_queue_submission {
 	const VkSemaphore *signal_semaphores;
 	uint32_t signal_semaphore_count;
 	VkFence fence;
+
+	const uint64_t *wait_values;
+	uint32_t wait_value_count;
+	const uint64_t *signal_values;
+	uint32_t signal_value_count;
 };
 
 static VkResult
@@ -3649,6 +3766,8 @@ radv_create_deferred_submission(struct radv_queue *queue,
 	size += submission->image_opaque_bind_count * sizeof(VkSparseImageOpaqueMemoryBindInfo);
 	size += submission->wait_semaphore_count * sizeof(struct radv_semaphore_part *);
 	size += submission->signal_semaphore_count * sizeof(struct radv_semaphore_part *);
+	size += submission->wait_value_count * sizeof(uint64_t);
+	size += submission->signal_value_count * sizeof(uint64_t);
 
 	deferred = calloc(1, size);
 	if (!deferred)
@@ -3706,6 +3825,11 @@ radv_create_deferred_submission(struct radv_queue *queue,
 		}
 	}
 
+	deferred->wait_values = (void*)(deferred->temporary_semaphore_parts + temporary_count);
+	memcpy(deferred->wait_values, submission->wait_values, submission->wait_value_count * sizeof(uint64_t));
+	deferred->signal_values = deferred->wait_values + submission->wait_value_count;
+	memcpy(deferred->signal_values, submission->signal_values, submission->signal_value_count * sizeof(uint64_t));
+
 	*out = deferred;
 	return VK_SUCCESS;
 }
@@ -3715,7 +3839,6 @@ radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission)
 {
 	RADV_FROM_HANDLE(radv_fence, fence, submission->fence);
 	struct radv_queue *queue = submission->queue;
-	struct radeon_cmdbuf **cs_array;
 	struct radeon_winsys_ctx *ctx = queue->hw_ctx;
 	uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
 	struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL;
@@ -3737,12 +3860,14 @@ radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission)
 	if (result != VK_SUCCESS)
 		goto fail;
 
-	result = radv_alloc_sem_info(queue->device->instance,
+	result = radv_alloc_sem_info(queue->device,
 				     &sem_info,
 				     submission->wait_semaphore_count,
 				     submission->wait_semaphores,
+				     submission->wait_values,
 				     submission->signal_semaphore_count,
 				     submission->signal_semaphores,
+				     submission->signal_values,
 				     submission->fence);
 	if (result != VK_SUCCESS)
 		goto fail;
@@ -3767,68 +3892,73 @@ radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission)
 			radv_loge("failed to submit CS\n");
 			abort();
 		}
-		radv_free_sem_info(&sem_info);
-		radv_free_temp_syncobjs(queue->device,
-		                        submission->temporary_semaphore_part_count,
-		                        submission->temporary_semaphore_parts);
-		free(submission);
-		return VK_SUCCESS;
-	}
 
-	cs_array = malloc(sizeof(struct radeon_cmdbuf *) *
-				        (submission->cmd_buffer_count));
+		goto success;
+	} else {
+		struct radeon_cmdbuf **cs_array = malloc(sizeof(struct radeon_cmdbuf *) *
+		                                         (submission->cmd_buffer_count));
 
-	for (uint32_t j = 0; j < submission->cmd_buffer_count; j++) {
-		RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, submission->cmd_buffers[j]);
-		assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+		for (uint32_t j = 0; j < submission->cmd_buffer_count; j++) {
+			RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, submission->cmd_buffers[j]);
+			assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
-		cs_array[j] = cmd_buffer->cs;
-		if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
-			can_patch = false;
+			cs_array[j] = cmd_buffer->cs;
+			if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
+				can_patch = false;
 
-		cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING;
-	}
+			cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING;
+		}
 
-	for (uint32_t j = 0; j < submission->cmd_buffer_count; j += advance) {
-		struct radeon_cmdbuf *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
-		const struct radv_winsys_bo_list *bo_list = NULL;
+		for (uint32_t j = 0; j < submission->cmd_buffer_count; j += advance) {
+			struct radeon_cmdbuf *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
+			const struct radv_winsys_bo_list *bo_list = NULL;
 
-		advance = MIN2(max_cs_submission,
-			       submission->cmd_buffer_count - j);
+			advance = MIN2(max_cs_submission,
+			               submission->cmd_buffer_count - j);
 
-		if (queue->device->trace_bo)
-			*queue->device->trace_id_ptr = 0;
+			if (queue->device->trace_bo)
+				*queue->device->trace_id_ptr = 0;
 
-		sem_info.cs_emit_wait = j == 0;
-		sem_info.cs_emit_signal = j + advance == submission->cmd_buffer_count;
+			sem_info.cs_emit_wait = j == 0;
+			sem_info.cs_emit_signal = j + advance == submission->cmd_buffer_count;
 
-		if (unlikely(queue->device->use_global_bo_list)) {
-			pthread_mutex_lock(&queue->device->bo_list.mutex);
-			bo_list = &queue->device->bo_list.list;
-		}
+			if (unlikely(queue->device->use_global_bo_list)) {
+				pthread_mutex_lock(&queue->device->bo_list.mutex);
+				bo_list = &queue->device->bo_list.list;
+			}
 
-		ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
-						advance, initial_preamble, continue_preamble_cs,
-						&sem_info, bo_list,
-						can_patch, base_fence);
+			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
+			                                   advance, initial_preamble, continue_preamble_cs,
+			                                   &sem_info, bo_list,
+			                                   can_patch, base_fence);
 
-		if (unlikely(queue->device->use_global_bo_list))
-			pthread_mutex_unlock(&queue->device->bo_list.mutex);
+			if (unlikely(queue->device->use_global_bo_list))
+				pthread_mutex_unlock(&queue->device->bo_list.mutex);
 
-		if (ret) {
-			radv_loge("failed to submit CS\n");
-			abort();
-		}
-		if (queue->device->trace_bo) {
-			radv_check_gpu_hangs(queue, cs_array[j]);
+			if (ret) {
+				radv_loge("failed to submit CS\n");
+				abort();
+			}
+			if (queue->device->trace_bo) {
+				radv_check_gpu_hangs(queue, cs_array[j]);
+			}
 		}
+
+		free(cs_array);
 	}
 
+success:
 	radv_free_temp_syncobjs(queue->device,
 				submission->temporary_semaphore_part_count,
 				submission->temporary_semaphore_parts);
+	radv_finalize_timelines(queue->device,
+	                        submission->wait_semaphore_count,
+	                        submission->wait_semaphores,
+	                        submission->wait_values,
+	                        submission->signal_semaphore_count,
+	                        submission->signal_semaphores,
+	                        submission->signal_values);
 	radv_free_sem_info(&sem_info);
-	free(cs_array);
 	free(submission);
 	return VK_SUCCESS;
 
@@ -3895,6 +4025,9 @@ VkResult radv_QueueSubmit(
 			wait_dst_stage_mask |= pSubmits[i].pWaitDstStageMask[j];
 		}
 
+		const VkTimelineSemaphoreSubmitInfoKHR *timeline_info =
+			vk_find_struct_const(pSubmits[i].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR);
+
 		result = radv_queue_submit(queue, &(struct radv_queue_submission) {
 				.cmd_buffers = pSubmits[i].pCommandBuffers,
 				.cmd_buffer_count = pSubmits[i].commandBufferCount,
@@ -3904,7 +4037,11 @@ VkResult radv_QueueSubmit(
 				.wait_semaphore_count = pSubmits[i].waitSemaphoreCount,
 				.signal_semaphores = pSubmits[i].pSignalSemaphores,
 				.signal_semaphore_count = pSubmits[i].signalSemaphoreCount,
-				.fence = i == fence_idx ? fence : VK_NULL_HANDLE
+				.fence = i == fence_idx ? fence : VK_NULL_HANDLE,
+				.wait_values = timeline_info ? timeline_info->pWaitSemaphoreValues : NULL,
+				.wait_value_count = timeline_info && timeline_info->pWaitSemaphoreValues ? timeline_info->waitSemaphoreValueCount : 0,
+				.signal_values = timeline_info ? timeline_info->pSignalSemaphoreValues : NULL,
+				.signal_value_count = timeline_info && timeline_info->pSignalSemaphoreValues ? timeline_info->signalSemaphoreValueCount : 0,
 			});
 		if (result != VK_SUCCESS)
 			return result;
@@ -4522,6 +4659,9 @@ static bool radv_sparse_bind_has_effects(const VkBindSparseInfo *info)
 		if (i != fence_idx && !radv_sparse_bind_has_effects(pBindInfo + i))
 			continue;
 
+		const VkTimelineSemaphoreSubmitInfoKHR *timeline_info =
+			vk_find_struct_const(pBindInfo[i].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR);
+
 		VkResult result = radv_queue_submit(queue, &(struct radv_queue_submission) {
 				.buffer_binds = pBindInfo[i].pBufferBinds,
 				.buffer_bind_count = pBindInfo[i].bufferBindCount,
@@ -4532,6 +4672,10 @@ static bool radv_sparse_bind_has_effects(const VkBindSparseInfo *info)
 				.signal_semaphores = pBindInfo[i].pSignalSemaphores,
 				.signal_semaphore_count = pBindInfo[i].signalSemaphoreCount,
 				.fence = i == fence_idx ? fence : VK_NULL_HANDLE,
+				.wait_values = timeline_info ? timeline_info->pWaitSemaphoreValues : NULL,
+				.wait_value_count = timeline_info && timeline_info->pWaitSemaphoreValues ? timeline_info->waitSemaphoreValueCount : 0,
+				.signal_values = timeline_info ? timeline_info->pSignalSemaphoreValues : NULL,
+				.signal_value_count = timeline_info && timeline_info->pSignalSemaphoreValues ? timeline_info->signalSemaphoreValueCount : 0,
 			});
 
 		if (result != VK_SUCCESS)
@@ -4820,6 +4964,148 @@ VkResult radv_GetFenceStatus(VkDevice _device, VkFence _fence)
 
 // Queue semaphore functions
 
+static void
+radv_create_timeline(struct radv_timeline *timeline, uint64_t value)
+{
+	timeline->highest_signaled = value;
+	timeline->highest_submitted = value;
+	list_inithead(&timeline->points);
+	list_inithead(&timeline->free_points);
+	pthread_mutex_init(&timeline->mutex, NULL);
+}
+
+static void
+radv_destroy_timeline(struct radv_device *device,
+                      struct radv_timeline *timeline)
+{
+	list_for_each_entry_safe(struct radv_timeline_point, point,
+	                         &timeline->free_points, list) {
+		list_del(&point->list);
+		device->ws->destroy_syncobj(device->ws, point->syncobj);
+		free(point);
+	}
+	list_for_each_entry_safe(struct radv_timeline_point, point,
+	                         &timeline->points, list) {
+		list_del(&point->list);
+		device->ws->destroy_syncobj(device->ws, point->syncobj);
+		free(point);
+	}
+	pthread_mutex_destroy(&timeline->mutex);
+}
+
+static void
+radv_timeline_gc_locked(struct radv_device *device,
+                        struct radv_timeline *timeline)
+{
+	list_for_each_entry_safe(struct radv_timeline_point, point,
+	                         &timeline->points, list) {
+		if (point->wait_count || point->value > timeline->highest_submitted)
+			return;
+
+		if (device->ws->wait_syncobj(device->ws, &point->syncobj, 1, true, 0)) {
+			timeline->highest_signaled = point->value;
+			list_del(&point->list);
+			list_add(&point->list, &timeline->free_points);
+		}
+	}
+}
+
+static struct radv_timeline_point *
+radv_timeline_find_point_at_least_locked(struct radv_device *device,
+                                         struct radv_timeline *timeline,
+                                         uint64_t p)
+{
+	radv_timeline_gc_locked(device, timeline);
+
+	if (p <= timeline->highest_signaled)
+		return NULL;
+
+	list_for_each_entry(struct radv_timeline_point, point,
+	                    &timeline->points, list) {
+		if (point->value >= p) {
+			++point->wait_count;
+			return point;
+		}
+	}
+	return NULL;
+}
+
+static struct radv_timeline_point *
+radv_timeline_add_point_locked(struct radv_device *device,
+                               struct radv_timeline *timeline,
+                               uint64_t p)
+{
+	radv_timeline_gc_locked(device, timeline);
+
+	struct radv_timeline_point *ret = NULL;
+	struct radv_timeline_point *prev = NULL;
+
+	if (p <= timeline->highest_signaled)
+		return NULL;
+
+	list_for_each_entry(struct radv_timeline_point, point,
+	                    &timeline->points, list) {
+		if (point->value == p) {
+			return NULL;
+		}
+
+		if (point->value < p)
+			prev = point;
+	}
+
+	if (list_is_empty(&timeline->free_points)) {
+		ret = malloc(sizeof(struct radv_timeline_point));
+		device->ws->create_syncobj(device->ws, &ret->syncobj);
+	} else {
+		ret = list_first_entry(&timeline->free_points, struct radv_timeline_point, list);
+		list_del(&ret->list);
+
+		device->ws->reset_syncobj(device->ws, ret->syncobj);
+	}
+
+	ret->value = p;
+	ret->wait_count = 1;
+
+	if (prev) {
+		list_add(&ret->list, &prev->list);
+	} else {
+		list_addtail(&ret->list, &timeline->points);
+	}
+	return ret;
+}
+
+
+static VkResult
+radv_timeline_wait_locked(struct radv_device *device,
+                          struct radv_timeline *timeline,
+                          uint64_t value,
+                          uint64_t abs_timeout)
+{
+	while(timeline->highest_submitted < value) {
+		struct timespec abstime;
+		timespec_from_nsec(&abstime, abs_timeout);
+
+		pthread_cond_timedwait(&device->timeline_cond, &timeline->mutex, &abstime);
+
+		if (radv_get_current_time() >= abs_timeout && timeline->highest_submitted < value)
+			return VK_TIMEOUT;
+	}
+
+	struct radv_timeline_point *point = radv_timeline_find_point_at_least_locked(device, timeline, value);
+	if (!point)
+		return VK_SUCCESS;
+
+	point->wait_count++;
+
+	pthread_mutex_unlock(&timeline->mutex);
+
+	bool success = device->ws->wait_syncobj(device->ws, &point->syncobj, 1, true, abs_timeout);
+
+	pthread_mutex_lock(&timeline->mutex);
+	point->wait_count--;
+	return success ? VK_SUCCESS : VK_TIMEOUT;
+}
+
 static
 void radv_destroy_semaphore_part(struct radv_device *device,
                                  struct radv_semaphore_part *part)
@@ -4830,6 +5116,9 @@ void radv_destroy_semaphore_part(struct radv_device *device,
 	case RADV_SEMAPHORE_WINSYS:
 		device->ws->destroy_sem(part->ws_sem);
 		break;
+	case RADV_SEMAPHORE_TIMELINE:
+		radv_destroy_timeline(device, &part->timeline);
+		break;
 	case RADV_SEMAPHORE_SYNCOBJ:
 		device->ws->destroy_syncobj(device->ws, part->syncobj);
 		break;
@@ -4837,6 +5126,20 @@ void radv_destroy_semaphore_part(struct radv_device *device,
 	part->kind = RADV_SEMAPHORE_NONE;
 }
 
+static VkSemaphoreTypeKHR
+radv_get_semaphore_type(const void *pNext, uint64_t *initial_value)
+{
+	const VkSemaphoreTypeCreateInfoKHR *type_info =
+		vk_find_struct_const(pNext, SEMAPHORE_TYPE_CREATE_INFO_KHR);
+
+	if (!type_info)
+		return VK_SEMAPHORE_TYPE_BINARY_KHR;
+
+	if (initial_value)
+		*initial_value = type_info->initialValue;
+	return type_info->semaphoreType;
+}
+
 VkResult radv_CreateSemaphore(
 	VkDevice                                    _device,
 	const VkSemaphoreCreateInfo*                pCreateInfo,
@@ -4848,6 +5151,8 @@ VkResult radv_CreateSemaphore(
 		vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO);
 	VkExternalSemaphoreHandleTypeFlags handleTypes =
 		export ? export->handleTypes : 0;
+	uint64_t initial_value = 0;
+	VkSemaphoreTypeKHR type = radv_get_semaphore_type(pCreateInfo->pNext, &initial_value);
 
 	struct radv_semaphore *sem = vk_alloc2(&device->alloc, pAllocator,
 					       sizeof(*sem), 8,
@@ -4858,8 +5163,10 @@ VkResult radv_CreateSemaphore(
 	sem->temporary.kind = RADV_SEMAPHORE_NONE;
 	sem->permanent.kind = RADV_SEMAPHORE_NONE;
 
-	/* create a syncobject if we are going to export this semaphore */
-	if (device->always_use_syncobj || handleTypes) {
+	if (type == VK_SEMAPHORE_TYPE_TIMELINE_KHR) {
+		radv_create_timeline(&sem->permanent.timeline, initial_value);
+		sem->permanent.kind = RADV_SEMAPHORE_TIMELINE;
+	} else if (device->always_use_syncobj || handleTypes) {
 		assert (device->physical_device->rad_info.has_syncobj);
 		int ret = device->ws->create_syncobj(device->ws, &sem->permanent.syncobj);
 		if (ret) {
@@ -4895,6 +5202,105 @@ void radv_DestroySemaphore(
 	vk_free2(&device->alloc, pAllocator, sem);
 }
 
+VkResult
+radv_GetSemaphoreCounterValueKHR(VkDevice _device,
+                                 VkSemaphore _semaphore,
+                                 uint64_t* pValue)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	RADV_FROM_HANDLE(radv_semaphore, semaphore, _semaphore);
+
+	struct radv_semaphore_part *part =
+		semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? &semaphore->temporary : &semaphore->permanent;
+
+	switch (part->kind) {
+	case RADV_SEMAPHORE_TIMELINE: {
+		pthread_mutex_lock(&part->timeline.mutex);
+		radv_timeline_gc_locked(device, &part->timeline);
+		*pValue = part->timeline.highest_signaled;
+		pthread_mutex_unlock(&part->timeline.mutex);
+		return VK_SUCCESS;
+	}
+	case RADV_SEMAPHORE_NONE:
+	case RADV_SEMAPHORE_SYNCOBJ:
+	case RADV_SEMAPHORE_WINSYS:
+		unreachable("Invalid semaphore type");
+	}
+	unreachable("Unhandled semaphore type");
+}
+
+
+static VkResult
+radv_wait_timelines(struct radv_device *device,
+                    const VkSemaphoreWaitInfoKHR* pWaitInfo,
+                    uint64_t abs_timeout)
+{
+	if ((pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR) && pWaitInfo->semaphoreCount > 1) {
+		for (;;) {
+			for(uint32_t i = 0; i < pWaitInfo->semaphoreCount; ++i) {
+				RADV_FROM_HANDLE(radv_semaphore, semaphore, pWaitInfo->pSemaphores[i]);
+				pthread_mutex_lock(&semaphore->permanent.timeline.mutex);
+				VkResult result = radv_timeline_wait_locked(device, &semaphore->permanent.timeline, pWaitInfo->pValues[i], 0);
+				pthread_mutex_unlock(&semaphore->permanent.timeline.mutex);
+
+				if (result == VK_SUCCESS)
+					return VK_SUCCESS;
+			}
+			if (radv_get_current_time() > abs_timeout)
+				return VK_TIMEOUT;
+		}
+	}
+
+	for(uint32_t i = 0; i < pWaitInfo->semaphoreCount; ++i) {
+		RADV_FROM_HANDLE(radv_semaphore, semaphore, pWaitInfo->pSemaphores[i]);
+		pthread_mutex_lock(&semaphore->permanent.timeline.mutex);
+		VkResult result = radv_timeline_wait_locked(device, &semaphore->permanent.timeline, pWaitInfo->pValues[i], abs_timeout);
+		pthread_mutex_unlock(&semaphore->permanent.timeline.mutex);
+
+		if (result != VK_SUCCESS)
+			return result;
+	}
+	return VK_SUCCESS;
+}
+VkResult
+radv_WaitSemaphoresKHR(VkDevice _device,
+                       const VkSemaphoreWaitInfoKHR* pWaitInfo,
+                       uint64_t timeout)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	uint64_t abs_timeout = radv_get_absolute_timeout(timeout);
+	return radv_wait_timelines(device, pWaitInfo, abs_timeout);
+}
+
+VkResult
+radv_SignalSemaphoreKHR(VkDevice _device,
+                        const VkSemaphoreSignalInfoKHR* pSignalInfo)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	RADV_FROM_HANDLE(radv_semaphore, semaphore, pSignalInfo->semaphore);
+
+	struct radv_semaphore_part *part =
+		semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? &semaphore->temporary : &semaphore->permanent;
+
+	switch(part->kind) {
+	case RADV_SEMAPHORE_TIMELINE: {
+		pthread_mutex_lock(&part->timeline.mutex);
+		radv_timeline_gc_locked(device, &part->timeline);
+		part->timeline.highest_submitted = MAX2(part->timeline.highest_submitted, pSignalInfo->value);
+		part->timeline.highest_signaled = MAX2(part->timeline.highest_signaled, pSignalInfo->value);
+		pthread_mutex_unlock(&part->timeline.mutex);
+		break;
+	}
+	case RADV_SEMAPHORE_NONE:
+	case RADV_SEMAPHORE_SYNCOBJ:
+	case RADV_SEMAPHORE_WINSYS:
+		unreachable("Invalid semaphore type");
+	}
+	return VK_SUCCESS;
+}
+
+
+
 VkResult radv_CreateEvent(
 	VkDevice                                    _device,
 	const VkEventCreateInfo*                    pCreateInfo,
@@ -6065,11 +6471,17 @@ void radv_GetPhysicalDeviceExternalSemaphoreProperties(
 	VkExternalSemaphoreProperties               *pExternalSemaphoreProperties)
 {
 	RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
+	VkSemaphoreTypeKHR type = radv_get_semaphore_type(pExternalSemaphoreInfo->pNext, NULL);
+	
+	if (type == VK_SEMAPHORE_TYPE_TIMELINE_KHR) {
+		pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0;
+		pExternalSemaphoreProperties->compatibleHandleTypes = 0;
+		pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
 
 	/* Require has_syncobj_wait_for_submit for the syncobj signal ioctl introduced at virtually the same time */
-	if (pdevice->rad_info.has_syncobj_wait_for_submit &&
-	    (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT || 
-	     pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT)) {
+	} else if (pdevice->rad_info.has_syncobj_wait_for_submit &&
+	           (pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT || 
+	            pExternalSemaphoreInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT)) {
 		pExternalSemaphoreProperties->exportFromImportedHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
 		pExternalSemaphoreProperties->compatibleHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
 		pExternalSemaphoreProperties->externalSemaphoreFeatures = VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT |
diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py
index a9e499df91b..9f437f2253f 100644
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -96,6 +96,7 @@ EXTENSIONS = [
     Extension('VK_KHR_surface',                          25, 'RADV_HAS_SURFACE'),
     Extension('VK_KHR_surface_protected_capabilities',    1, 'RADV_HAS_SURFACE'),
     Extension('VK_KHR_swapchain',                        68, 'RADV_HAS_SURFACE'),
+    Extension('VK_KHR_timeline_semaphore',                2, False),
     Extension('VK_KHR_uniform_buffer_standard_layout',    1, True),
     Extension('VK_KHR_variable_pointers',                 1, True),
     Extension('VK_KHR_wayland_surface',                   6, 'VK_USE_PLATFORM_WAYLAND_KHR'),
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index ef7ba34c85b..6eb4e5fbe3f 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -820,6 +820,10 @@ struct radv_device {
 	int force_aniso;
 
 	struct radv_secure_compile_state *sc_state;
+
+	/* Condition variable for legacy timelines, to notify waiters when a
+	 * new point gets submitted. */
+	pthread_cond_t timeline_cond;
 };
 
 struct radv_device_memory {
@@ -2168,13 +2172,40 @@ typedef enum {
 	RADV_SEMAPHORE_NONE,
 	RADV_SEMAPHORE_WINSYS,
 	RADV_SEMAPHORE_SYNCOBJ,
+	RADV_SEMAPHORE_TIMELINE,
 } radv_semaphore_kind;
 
+struct radv_timeline_point {
+	struct list_head list;
+
+	uint64_t value;
+	uint32_t syncobj;
+
+	/* Separate from the list to accomodate CPU wait being async, as well
+	 * as prevent point deletion during submission. */
+	unsigned wait_count;
+};
+
+struct radv_timeline {
+	/* Using a pthread mutex to be compatible with condition variables. */
+	pthread_mutex_t mutex;
+
+	uint64_t highest_signaled;
+	uint64_t highest_submitted;
+
+	struct list_head points;
+
+	/* Keep free points on hand so we do not have to recreate syncobjs all
+	 * the time. */
+	struct list_head free_points;
+};
+
 struct radv_semaphore_part {
 	radv_semaphore_kind kind;
 	union {
 		uint32_t syncobj;
 		struct radeon_winsys_sem *ws_sem;
+		struct radv_timeline timeline;
 	};
 };