Mesa (master): radv/winsys: Add binary syncobj ABI changes for timeline semaphores.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Jul 23 17:49:27 UTC 2020


Module: Mesa
Branch: master
Commit: fa97061a8235b64009d7897ecf20cc81258f3403
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=fa97061a8235b64009d7897ecf20cc81258f3403

Author: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Date:   Mon Jun 22 01:11:03 2020 +0200

radv/winsys: Add binary syncobj ABI changes for timeline semaphores.

To facilitate cross-process timeline semaphores we have to deal with
the fact that the syncobj signal operation might be submitted a
small finite time after the wait operation.

For that we start using DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT during
the wait operation so we properly wait instead of returning an error.

Furthermore, to make this effective for syncobjs that get reused, we
actually have to reset them after the wait. Otherwise, the
wait-before-submit would get the previous fence instead of waiting
for the newly submitted work.

The obvious choice, resetting the syncobj after the CS submission,
has 2 issues though:

1) If the same semaphore is used for wait and signal we can't reset it.
   This is solvable by only resetting the semaphores that are not in the
   signal list.
2) The submitted work might be complete before we get to resetting the
   syncobj. If there is a cycle of submissions that signals it again and
   finishes before we get to the reset we are screwed.

Solution:
Copy the fence into a new syncobj and reset the old syncobj before
submission. Yes, I know it means more syscalls :( At least I reduced
the alloc/free overhead by keeping a cache of temporary syncobjs.

This also introduces a syncobj_reset_count, as we don't want to reset
syncobjs that are part of an emulated timeline semaphore. (Yes, if
the kernel supports timeline syncobjs we should use those instead,
but those still need to be implemented, and if we depend on them in
this patch, ordering dependencies get hard ...)

Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5600>

---

 src/amd/vulkan/radv_device.c                      |  13 ++-
 src/amd/vulkan/radv_radeon_winsys.h               |   1 +
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c     | 110 +++++++++++++++++++++-
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c |   5 +
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h |   5 +
 5 files changed, 128 insertions(+), 6 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index dd8b2e4b73a..b8ddb89e2d8 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -3762,7 +3762,7 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
 				      VkFence _fence,
 				      bool is_signal)
 {
-	int syncobj_idx = 0, sem_idx = 0;
+	int syncobj_idx = 0, non_reset_idx = 0, sem_idx = 0;
 
 	if (num_sems == 0 && _fence == VK_NULL_HANDLE)
 		return VK_SUCCESS;
@@ -3771,6 +3771,7 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
 		switch(sems[i]->kind) {
 		case RADV_SEMAPHORE_SYNCOBJ:
 			counts->syncobj_count++;
+			counts->syncobj_reset_count++;
 			break;
 		case RADV_SEMAPHORE_WINSYS:
 			counts->sem_count++;
@@ -3807,6 +3808,8 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
 		}
 	}
 
+	non_reset_idx = counts->syncobj_reset_count;
+
 	for (uint32_t i = 0; i < num_sems; i++) {
 		switch(sems[i]->kind) {
 		case RADV_SEMAPHORE_NONE:
@@ -3830,7 +3833,7 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
 			pthread_mutex_unlock(&sems[i]->timeline.mutex);
 
 			if (point) {
-				counts->syncobj[syncobj_idx++] = point->syncobj;
+				counts->syncobj[non_reset_idx++] = point->syncobj;
 			} else {
 				/* Explicitly remove the semaphore so we might not find
 				 * a point later post-submit. */
@@ -3848,11 +3851,11 @@ static VkResult radv_alloc_sem_counts(struct radv_device *device,
 			fence->temporary.kind != RADV_FENCE_NONE ?
 			&fence->temporary : &fence->permanent;
 		if (part->kind == RADV_FENCE_SYNCOBJ)
-			counts->syncobj[syncobj_idx++] = part->syncobj;
+			counts->syncobj[non_reset_idx++] = part->syncobj;
 	}
 
-	assert(syncobj_idx <= counts->syncobj_count);
-	counts->syncobj_count = syncobj_idx;
+	assert(MAX2(syncobj_idx, non_reset_idx) <= counts->syncobj_count);
+	counts->syncobj_count = MAX2(syncobj_idx, non_reset_idx);
 
 	return VK_SUCCESS;
 }
diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h
index ac5555e957d..37576327dfd 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -168,6 +168,7 @@ struct radeon_winsys_bo {
 };
 struct radv_winsys_sem_counts {
 	uint32_t syncobj_count;
+	uint32_t syncobj_reset_count; /* for wait only, whether to reset the syncobj */
 	uint32_t sem_count;
 	uint32_t *syncobj;
 	struct radeon_winsys_sem **sem;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index b024ef97104..2cce43c2460 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -1451,15 +1451,17 @@ static int radv_amdgpu_signal_sems(struct radv_amdgpu_ctx *ctx,
 }
 
 static struct drm_amdgpu_cs_chunk_sem *radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts,
+									  const uint32_t *syncobj_override,
 									  struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
 {
+	const uint32_t *src = syncobj_override ? syncobj_override : counts->syncobj;
 	struct drm_amdgpu_cs_chunk_sem *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * counts->syncobj_count);
 	if (!syncobj)
 		return NULL;
 
 	for (unsigned i = 0; i < counts->syncobj_count; i++) {
 		struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
-		sem->handle = counts->syncobj[i];
+		sem->handle = src[i];
 	}
 
 	chunk->chunk_id = chunk_id;
@@ -1468,6 +1470,101 @@ static struct drm_amdgpu_cs_chunk_sem *radv_amdgpu_cs_alloc_syncobj_chunk(struct
 	return syncobj;
 }
 
+static int radv_amdgpu_cache_alloc_syncobjs(struct radv_amdgpu_winsys *ws, unsigned count, uint32_t *dst)
+{
+	pthread_mutex_lock(&ws->syncobj_lock);
+	if (count > ws->syncobj_capacity) {
+		if (ws->syncobj_capacity > UINT32_MAX / 2)
+			goto fail;
+
+		unsigned new_capacity = MAX2(count, ws->syncobj_capacity * 2);
+		uint32_t *n = realloc(ws->syncobj, new_capacity * sizeof(*ws->syncobj));
+		if (!n)
+			goto fail;
+		ws->syncobj_capacity = new_capacity;
+		ws->syncobj = n;
+	}
+
+	while(ws->syncobj_count < count) {
+		int r = amdgpu_cs_create_syncobj(ws->dev, ws->syncobj + ws->syncobj_count);
+		if (r)
+			goto fail;
+		++ws->syncobj_count;
+	}
+
+	for (unsigned i = 0; i < count; ++i)
+		dst[i] = ws->syncobj[--ws->syncobj_count];
+
+	pthread_mutex_unlock(&ws->syncobj_lock);
+	return 0;
+
+fail:
+	pthread_mutex_unlock(&ws->syncobj_lock);
+	return -ENOMEM;
+}
+
+static void radv_amdgpu_cache_free_syncobjs(struct radv_amdgpu_winsys *ws, unsigned count, uint32_t *src)
+{
+	pthread_mutex_lock(&ws->syncobj_lock);
+
+	uint32_t cache_count = MIN2(count, UINT32_MAX - ws->syncobj_count);
+	if (cache_count + ws->syncobj_count > ws->syncobj_capacity) {
+		unsigned new_capacity = MAX2(ws->syncobj_count + cache_count, ws->syncobj_capacity * 2);
+		uint32_t* n = realloc(ws->syncobj, new_capacity * sizeof(*ws->syncobj));
+		if (n) {
+			ws->syncobj_capacity = new_capacity;
+			ws->syncobj = n;
+		}
+	}
+
+	for (unsigned i = 0; i < count; ++i) {
+		if (ws->syncobj_count < ws->syncobj_capacity)
+			ws->syncobj[ws->syncobj_count++] = src[i];
+		else
+			amdgpu_cs_destroy_syncobj(ws->dev, src[i]);
+	}
+
+	pthread_mutex_unlock(&ws->syncobj_lock);
+
+}
+
+static int radv_amdgpu_cs_prepare_syncobjs(struct radv_amdgpu_winsys *ws,
+                                           struct radv_winsys_sem_counts *counts,
+                                           uint32_t **out_syncobjs)
+{
+	int r = 0;
+
+	if (!ws->info.has_timeline_syncobj || !counts->syncobj_count) {
+		*out_syncobjs = NULL;
+		return 0;
+	}
+
+	*out_syncobjs = malloc(counts->syncobj_count * sizeof(**out_syncobjs));
+	if (!*out_syncobjs)
+		return -ENOMEM;
+
+	r = radv_amdgpu_cache_alloc_syncobjs(ws, counts->syncobj_count, *out_syncobjs);
+	if (r)
+		return r;
+	
+	for (unsigned i = 0; i < counts->syncobj_count; ++i) {
+		r = amdgpu_cs_syncobj_transfer(ws->dev, (*out_syncobjs)[i], 0, counts->syncobj[i], 0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT);
+		if (r)
+			goto fail;
+	}
+
+	r = amdgpu_cs_syncobj_reset(ws->dev, counts->syncobj, counts->syncobj_reset_count);
+	if (r)
+		goto fail;
+
+	return 0;
+fail:
+	radv_amdgpu_cache_free_syncobjs(ws, counts->syncobj_count, *out_syncobjs);
+	free(*out_syncobjs);
+	*out_syncobjs = NULL;
+	return r;
+}
+
 static VkResult
 radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
 		      struct radv_amdgpu_cs_request *request,
@@ -1483,6 +1580,7 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
 	struct drm_amdgpu_cs_chunk_sem *wait_syncobj = NULL, *signal_syncobj = NULL;
 	bool use_bo_list_create = ctx->ws->info.drm_minor < 27;
 	struct drm_amdgpu_bo_list_in bo_list_in;
+	uint32_t *in_syncobjs = NULL;
 	int i;
 	struct amdgpu_cs_fence *sem;
 	uint32_t bo_list = 0;
@@ -1533,7 +1631,12 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
 	}
 
 	if (sem_info->wait.syncobj_count && sem_info->cs_emit_wait) {
+		r = radv_amdgpu_cs_prepare_syncobjs(ctx->ws, &sem_info->wait, &in_syncobjs);
+		if (r)
+			goto error_out;
+
 		wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait,
+								  in_syncobjs,
 								  &chunks[num_chunks],
 								  AMDGPU_CHUNK_ID_SYNCOBJ_IN);
 		if (!wait_syncobj) {
@@ -1578,6 +1681,7 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
 
 	if (sem_info->signal.syncobj_count && sem_info->cs_emit_signal) {
 		signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal,
+								    NULL,
 								    &chunks[num_chunks],
 								    AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
 		if (!signal_syncobj) {
@@ -1642,6 +1746,10 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx,
 	}
 
 error_out:
+	if (in_syncobjs) {
+		radv_amdgpu_cache_free_syncobjs(ctx->ws, sem_info->wait.syncobj_count, in_syncobjs);
+		free(in_syncobjs);
+	}
 	free(chunks);
 	free(chunk_data);
 	free(sem_dependencies);
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
index 29e7b20b886..c6deedcb32c 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
@@ -162,6 +162,10 @@ static void radv_amdgpu_winsys_destroy(struct radeon_winsys *rws)
 {
 	struct radv_amdgpu_winsys *ws = (struct radv_amdgpu_winsys*)rws;
 
+	for (unsigned i = 0; i < ws->syncobj_count; ++i)
+		amdgpu_cs_destroy_syncobj(ws->dev, ws->syncobj[i]);
+	free(ws->syncobj);
+
 	ac_addrlib_destroy(ws->addrlib);
 	amdgpu_device_deinitialize(ws->dev);
 	FREE(rws);
@@ -197,6 +201,7 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags)
 	ws->use_llvm = debug_flags & RADV_DEBUG_LLVM;
 	list_inithead(&ws->global_bo_list);
 	pthread_mutex_init(&ws->global_bo_list_lock, NULL);
+	pthread_mutex_init(&ws->syncobj_lock, NULL);
 	ws->base.query_info = radv_amdgpu_winsys_query_info;
 	ws->base.query_value = radv_amdgpu_winsys_query_value;
 	ws->base.read_registers = radv_amdgpu_winsys_read_registers;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
index 0601481625c..bb8ce8863ba 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.h
@@ -55,6 +55,11 @@ struct radv_amdgpu_winsys {
 	uint64_t allocated_vram;
 	uint64_t allocated_vram_vis;
 	uint64_t allocated_gtt;
+
+	/* syncobj cache */
+	pthread_mutex_t syncobj_lock;
+	uint32_t *syncobj;
+	uint32_t syncobj_count, syncobj_capacity;
 };
 
 static inline struct radv_amdgpu_winsys *



More information about the mesa-commit mailing list