Mesa (master): radv: track and report if a logical device is lost

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Aug 19 08:23:40 UTC 2020


Module: Mesa
Branch: master
Commit: d26f62c667099fc3d30a3155335ca4f0e73c8d88
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d26f62c667099fc3d30a3155335ca4f0e73c8d88

Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date:   Mon Jul 13 13:59:48 2020 +0200

radv: track and report if a logical device is lost

This currently covers two situations where it's obvious that
the GPU hung:

1) when wait-for-idle doesn't finish in a finite time
2) when a CS submission is cancelled by the kernel

There are probably still other situations that aren't handled yet.

According to the Vulkan spec, some operations should return
VK_ERROR_DEVICE_LOST when the corresponding logical device is
known to be lost.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5878>

---

 src/amd/vulkan/radv_device.c  | 55 +++++++++++++++++++++++++++++++++++++++----
 src/amd/vulkan/radv_private.h | 17 +++++++++++++
 src/amd/vulkan/radv_query.c   |  3 +++
 3 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index dda535d5538..5b93083913b 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -2590,6 +2590,25 @@ static void radv_device_finish_border_color(struct radv_device *device)
 	}
 }
 
+VkResult
+_radv_device_set_lost(struct radv_device *device,
+		      const char *file, int line,
+		      const char *msg, ...)
+{
+	VkResult err;
+	va_list ap;
+
+	p_atomic_inc(&device->lost);
+
+	va_start(ap, msg);
+	err = __vk_errorv(device->physical_device->instance, device,
+			  VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT,
+			  VK_ERROR_DEVICE_LOST, file, line, msg, ap);
+	va_end(ap);
+
+	return err;
+}
+
 VkResult radv_CreateDevice(
 	VkPhysicalDevice                            physicalDevice,
 	const VkDeviceCreateInfo*                   pCreateInfo,
@@ -4503,7 +4522,7 @@ fail:
 		 * VK_ERROR_DEVICE_LOST to ensure the clients do not attempt
 		 * to submit the same job again to this device.
 		 */
-		result = VK_ERROR_DEVICE_LOST;
+		result = radv_device_set_lost(queue->device, "vkQueueSubmit() failed");
 	}
 
 	radv_free_temp_syncobjs(queue->device,
@@ -4724,6 +4743,9 @@ VkResult radv_QueueSubmit(
 	uint32_t fence_idx = 0;
 	bool flushed_caches = false;
 
+	if (radv_device_is_lost(queue->device))
+		return VK_ERROR_DEVICE_LOST;
+
 	if (fence != VK_NULL_HANDLE) {
 		for (uint32_t i = 0; i < submitCount; ++i)
 			if (radv_submit_has_effects(pSubmits + i))
@@ -4793,6 +4815,9 @@ VkResult radv_QueueWaitIdle(
 {
 	RADV_FROM_HANDLE(radv_queue, queue, _queue);
 
+	if (radv_device_is_lost(queue->device))
+		return VK_ERROR_DEVICE_LOST;
+
 	pthread_mutex_lock(&queue->pending_mutex);
 	while (!list_is_empty(&queue->pending_submissions)) {
 		pthread_cond_wait(&queue->device->timeline_cond, &queue->pending_mutex);
@@ -4802,9 +4827,10 @@ VkResult radv_QueueWaitIdle(
 	if (!queue->device->ws->ctx_wait_idle(queue->hw_ctx,
 					      radv_queue_family_to_ring(queue->queue_family_index),
 					      queue->queue_idx)) {
-		return vk_errorf(queue->device->instance, VK_ERROR_DEVICE_LOST,
-				 "Failed to wait for a '%s' queue to be idle. "
-				 "GPU hang ?", radv_get_queue_family_name(queue));
+		return radv_device_set_lost(queue->device,
+					    "Failed to wait for a '%s' queue "
+					    "to be idle. GPU hang ?",
+					    radv_get_queue_family_name(queue));
 	}
 
 	return VK_SUCCESS;
@@ -5471,6 +5497,9 @@ static bool radv_sparse_bind_has_effects(const VkBindSparseInfo *info)
 	VkResult result;
 	uint32_t fence_idx = 0;
 
+	if (radv_device_is_lost(queue->device))
+		return VK_ERROR_DEVICE_LOST;
+
 	if (fence != VK_NULL_HANDLE) {
 		for (uint32_t i = 0; i < bindInfoCount; ++i)
 			if (radv_sparse_bind_has_effects(pBindInfo + i))
@@ -5653,6 +5682,10 @@ VkResult radv_WaitForFences(
 	uint64_t                                    timeout)
 {
 	RADV_FROM_HANDLE(radv_device, device, _device);
+
+	if (radv_device_is_lost(device))
+		return VK_ERROR_DEVICE_LOST;
+
 	timeout = radv_get_absolute_timeout(timeout);
 
 	if (device->always_use_syncobj &&
@@ -5809,6 +5842,9 @@ VkResult radv_GetFenceStatus(VkDevice _device, VkFence _fence)
 		fence->temporary.kind != RADV_FENCE_NONE ?
 		&fence->temporary : &fence->permanent;
 
+	if (radv_device_is_lost(device))
+		return VK_ERROR_DEVICE_LOST;
+
 	switch (part->kind) {
 	case RADV_FENCE_NONE:
 		break;
@@ -6134,6 +6170,9 @@ radv_GetSemaphoreCounterValue(VkDevice _device,
 	RADV_FROM_HANDLE(radv_device, device, _device);
 	RADV_FROM_HANDLE(radv_semaphore, semaphore, _semaphore);
 
+	if (radv_device_is_lost(device))
+		return VK_ERROR_DEVICE_LOST;
+
 	struct radv_semaphore_part *part =
 		semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? &semaphore->temporary : &semaphore->permanent;
 
@@ -6191,6 +6230,10 @@ radv_WaitSemaphores(VkDevice _device,
 		    uint64_t timeout)
 {
 	RADV_FROM_HANDLE(radv_device, device, _device);
+
+	if (radv_device_is_lost(device))
+		return VK_ERROR_DEVICE_LOST;
+
 	uint64_t abs_timeout = radv_get_absolute_timeout(timeout);
 
 	if (radv_semaphore_from_handle(pWaitInfo->pSemaphores[0])->permanent.kind == RADV_SEMAPHORE_TIMELINE)
@@ -6327,8 +6370,12 @@ VkResult radv_GetEventStatus(
 	VkDevice                                    _device,
 	VkEvent                                     _event)
 {
+	RADV_FROM_HANDLE(radv_device, device, _device);
 	RADV_FROM_HANDLE(radv_event, event, _event);
 
+	if (radv_device_is_lost(device))
+		return VK_ERROR_DEVICE_LOST;
+
 	if (*event->map == 1)
 		return VK_EVENT_SET;
 	return VK_EVENT_RESET;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 22235e530c1..1f5c4403b4e 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -851,8 +851,25 @@ struct radv_device {
 	bool overallocation_disallowed;
 	uint64_t allocated_memory_size[VK_MAX_MEMORY_HEAPS];
 	mtx_t overallocation_mutex;
+
+	/* Track the number of times the device has been lost. */
+	int lost;
 };
 
+VkResult _radv_device_set_lost(struct radv_device *device,
+                              const char *file, int line,
+                              const char *msg, ...)
+	radv_printflike(4, 5);
+
+#define radv_device_set_lost(dev, ...) \
+	_radv_device_set_lost(dev, __FILE__, __LINE__, __VA_ARGS__)
+
+static inline bool
+radv_device_is_lost(const struct radv_device *device)
+{
+	return unlikely(p_atomic_read(&device->lost));
+}
+
 struct radv_device_memory {
 	struct vk_object_base                        base;
 	struct radeon_winsys_bo                      *bo;
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index 1331028abb0..feeb5d84512 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -1368,6 +1368,9 @@ VkResult radv_GetQueryPoolResults(
 	char *data = pData;
 	VkResult result = VK_SUCCESS;
 
+	if (radv_device_is_lost(device))
+		return VK_ERROR_DEVICE_LOST;
+
 	for(unsigned i = 0; i < queryCount; ++i, data += stride) {
 		char *dest = data;
 		unsigned query = firstQuery + i;



More information about the mesa-commit mailing list