[Mesa-dev] [PATCH 2/2] anv: Query the kernel for reset status
Jason Ekstrand
jason at jlekstrand.net
Tue Mar 28 21:39:54 UTC 2017
When a client causes a GPU hang (or experiences issues due to a hang in
another client) we want to let it know as soon as possible. In
particular, if it submits work with a fence and calls vkWaitForFences or
vkQueueWaitIdle and it returns VK_SUCCESS, then the client should be
able to trust the results of that rendering. In order to provide this
guarantee, we have to ask the kernel for context status in a few key
locations.
---
src/intel/vulkan/anv_device.c | 78 ++++++++++++++++++++++++++----------------
src/intel/vulkan/anv_gem.c | 18 ++++++++++
src/intel/vulkan/anv_private.h | 3 ++
src/intel/vulkan/genX_query.c | 11 ++----
4 files changed, 72 insertions(+), 38 deletions(-)
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 5f0d00f..33d1984 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -884,8 +884,6 @@ anv_device_submit_simple_batch(struct anv_device *device,
struct anv_bo bo, *exec_bos[1];
VkResult result = VK_SUCCESS;
uint32_t size;
- int64_t timeout;
- int ret;
/* Kernel driver requires 8 byte aligned batch length */
size = align_u32(batch->next - batch->start, 8);
@@ -925,14 +923,7 @@ anv_device_submit_simple_batch(struct anv_device *device,
if (result != VK_SUCCESS)
goto fail;
- timeout = INT64_MAX;
- ret = anv_gem_wait(device, bo.gem_handle, &timeout);
- if (ret != 0) {
- /* We don't know the real error. */
- device->lost = true;
- result = vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
- goto fail;
- }
+ result = anv_device_wait(device, &bo, INT64_MAX);
fail:
anv_bo_pool_free(&device->batch_bo_pool, &bo);
@@ -1264,6 +1255,28 @@ anv_device_execbuf(struct anv_device *device,
return VK_SUCCESS;
}
+VkResult
+anv_device_wait(struct anv_device *device, struct anv_bo *bo,
+ int64_t timeout)
+{
+ int ret = anv_gem_wait(device, bo->gem_handle, &timeout);
+ if (ret == -1 && errno == ETIME) {
+ return VK_TIMEOUT;
+ } else if (ret == -1) {
+ /* We don't know the real error. */
+ device->lost = true;
+ return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
+ }
+
+ if (anv_gem_gpu_has_reset(device)) {
+ device->lost = true;
+ return vk_errorf(VK_ERROR_DEVICE_LOST,
+ "GPU has hung with commands in-flight");
+ }
+
+ return VK_SUCCESS;
+}
+
VkResult anv_QueueSubmit(
VkQueue _queue,
uint32_t submitCount,
@@ -1273,8 +1286,13 @@ VkResult anv_QueueSubmit(
ANV_FROM_HANDLE(anv_queue, queue, _queue);
ANV_FROM_HANDLE(anv_fence, fence, _fence);
struct anv_device *device = queue->device;
- if (unlikely(device->lost))
+
+ if (unlikely(device->lost)) {
return VK_ERROR_DEVICE_LOST;
+ } else if (anv_gem_gpu_has_reset(device)) {
+ device->lost = true;
+ return vk_error(VK_ERROR_DEVICE_LOST);
+ }
VkResult result = VK_SUCCESS;
@@ -1802,9 +1820,6 @@ VkResult anv_GetFenceStatus(
if (unlikely(device->lost))
return VK_ERROR_DEVICE_LOST;
- int64_t t = 0;
- int ret;
-
switch (fence->state) {
case ANV_FENCE_STATE_RESET:
/* If it hasn't even been sent off to the GPU yet, it's not ready */
@@ -1814,15 +1829,18 @@ VkResult anv_GetFenceStatus(
/* It's been signaled, return success */
return VK_SUCCESS;
- case ANV_FENCE_STATE_SUBMITTED:
- /* It's been submitted to the GPU but we don't know if it's done yet. */
- ret = anv_gem_wait(device, fence->bo.gem_handle, &t);
- if (ret == 0) {
+ case ANV_FENCE_STATE_SUBMITTED: {
+ VkResult result = anv_device_wait(device, &fence->bo, 0);
+ switch (result) {
+ case VK_SUCCESS:
fence->state = ANV_FENCE_STATE_SIGNALED;
return VK_SUCCESS;
- } else {
+ case VK_TIMEOUT:
return VK_NOT_READY;
+ default:
+ return result;
}
+ }
default:
unreachable("Invalid fence status");
}
@@ -1884,20 +1902,20 @@ VkResult anv_WaitForFences(
/* These are the fences we really care about. Go ahead and wait
* on it until we hit a timeout.
*/
- ret = anv_gem_wait(device, fence->bo.gem_handle, &timeout);
- if (ret == -1 && errno == ETIME) {
- result = VK_TIMEOUT;
- goto done;
- } else if (ret == -1) {
- /* We don't know the real error. */
- device->lost = true;
- return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
- } else {
+ result = anv_device_wait(device, &fence->bo, 0);
+ switch (result) {
+ case VK_SUCCESS:
fence->state = ANV_FENCE_STATE_SIGNALED;
signaled_fences = true;
if (!waitAll)
- return VK_SUCCESS;
- continue;
+ goto done;
+ break;
+
+ case VK_TIMEOUT:
+ goto done;
+
+ default:
+ return result;
}
}
}
diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c
index 0dde6d9..7d4b638 100644
--- a/src/intel/vulkan/anv_gem.c
+++ b/src/intel/vulkan/anv_gem.c
@@ -301,6 +301,24 @@ anv_gem_get_aperture(int fd, uint64_t *size)
return 0;
}
+bool
+anv_gem_gpu_has_reset(struct anv_device *device)
+{
+ struct drm_i915_reset_stats stats = {
+ .ctx_id = device->context_id,
+ };
+
+ int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats);
+ if (ret == -1) {
+ /* This really shouldn't be possible but the impossible should probably
+ * be treated as a GPU hang anyway.
+ */
+ return true;
+ }
+
+ return stats.batch_active > 0 || stats.batch_pending > 0;
+}
+
int
anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle)
{
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 27c887c..f0a2b8d 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -637,6 +637,8 @@ void anv_device_finish_blorp(struct anv_device *device);
VkResult anv_device_execbuf(struct anv_device *device,
struct drm_i915_gem_execbuffer2 *execbuf,
struct anv_bo **execbuf_bos);
+VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo,
+ int64_t timeout);
void* anv_gem_mmap(struct anv_device *device,
uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags);
@@ -654,6 +656,7 @@ int anv_gem_destroy_context(struct anv_device *device, int context);
int anv_gem_get_param(int fd, uint32_t param);
bool anv_gem_get_bit6_swizzle(int fd, uint32_t tiling);
int anv_gem_get_aperture(int fd, uint64_t *size);
+bool anv_gem_gpu_has_reset(struct anv_device *device);
int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 3610665..7ea9404 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -143,8 +143,6 @@ VkResult genX(GetQueryPoolResults)(
{
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
- int64_t timeout = INT64_MAX;
- int ret;
assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
@@ -157,12 +155,9 @@ VkResult genX(GetQueryPoolResults)(
return VK_SUCCESS;
if (flags & VK_QUERY_RESULT_WAIT_BIT) {
- ret = anv_gem_wait(device, pool->bo.gem_handle, &timeout);
- if (ret == -1) {
- /* We don't know the real error. */
- return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "gem_wait failed %m");
- }
+ VkResult result = anv_device_wait(device, &pool->bo, INT64_MAX);
+ if (result != VK_SUCCESS)
+ return result;
}
void *data_end = pData + dataSize;
--
2.5.0.400.gff86faf
More information about the mesa-dev
mailing list