[Mesa-dev] [PATCH 2/2] anv: Query the kernel for reset status

Tue Mar 28 21:39:54 UTC 2017

When a client causes a GPU hang (or experiences issues due to a hang in
another client) we want to let it know as soon as possible.  In
particular, if it submits work with a fence and calls vkWaitForFences or
vkQueueQaitIdle and it returns VK_SUCCESS, then the client should be
able to trust the results of that rendering.  In order to provide this
guarantee, we have to ask the kernel for context status in a few key
locations.
---
 src/intel/vulkan/anv_device.c  | 78 ++++++++++++++++++++++++++----------------
 src/intel/vulkan/anv_gem.c     | 18 ++++++++++
 src/intel/vulkan/anv_private.h |  3 ++
 src/intel/vulkan/genX_query.c  | 11 ++----
 4 files changed, 72 insertions(+), 38 deletions(-)

diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 5f0d00f..33d1984 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -884,8 +884,6 @@ anv_device_submit_simple_batch(struct anv_device *device,
    struct anv_bo bo, *exec_bos[1];
    VkResult result = VK_SUCCESS;
    uint32_t size;
-   int64_t timeout;
-   int ret;
 
    /* Kernel driver requires 8 byte aligned batch length */
    size = align_u32(batch->next - batch->start, 8);
@@ -925,14 +923,7 @@ anv_device_submit_simple_batch(struct anv_device *device,
    if (result != VK_SUCCESS)
       goto fail;
 
-   timeout = INT64_MAX;
-   ret = anv_gem_wait(device, bo.gem_handle, &timeout);
-   if (ret != 0) {
-      /* We don't know the real error. */
-      device->lost = true;
-      result = vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
-      goto fail;
-   }
+   result = anv_device_wait(device, &bo, INT64_MAX);
 
  fail:
    anv_bo_pool_free(&device->batch_bo_pool, &bo);
@@ -1264,6 +1255,28 @@ anv_device_execbuf(struct anv_device *device,
    return VK_SUCCESS;
 }
 
+VkResult
+anv_device_wait(struct anv_device *device, struct anv_bo *bo,
+                int64_t timeout)
+{
+   int ret = anv_gem_wait(device, bo->gem_handle, &timeout);
+   if (ret == -1 && errno == ETIME) {
+      return VK_TIMEOUT;
+   } else if (ret == -1) {
+      /* We don't know the real error. */
+      device->lost = true;
+      return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
+   }
+
+   if (anv_gem_gpu_has_reset(device)) {
+      device->lost = true;
+      return vk_errorf(VK_ERROR_DEVICE_LOST,
+                       "GPU has hung with commands in-flight");
+   }
+
+   return VK_SUCCESS;
+}
+
 VkResult anv_QueueSubmit(
     VkQueue                                     _queue,
     uint32_t                                    submitCount,
@@ -1273,8 +1286,13 @@ VkResult anv_QueueSubmit(
    ANV_FROM_HANDLE(anv_queue, queue, _queue);
    ANV_FROM_HANDLE(anv_fence, fence, _fence);
    struct anv_device *device = queue->device;
-   if (unlikely(device->lost))
+
+   if (unlikely(device->lost)) {
       return VK_ERROR_DEVICE_LOST;
+   } else if (anv_gem_gpu_has_reset(device)) {
+      device->lost = true;
+      return vk_error(VK_ERROR_DEVICE_LOST);
+   }
 
    VkResult result = VK_SUCCESS;
 
@@ -1802,9 +1820,6 @@ VkResult anv_GetFenceStatus(
    if (unlikely(device->lost))
       return VK_ERROR_DEVICE_LOST;
 
-   int64_t t = 0;
-   int ret;
-
    switch (fence->state) {
    case ANV_FENCE_STATE_RESET:
       /* If it hasn't even been sent off to the GPU yet, it's not ready */
@@ -1814,15 +1829,18 @@ VkResult anv_GetFenceStatus(
       /* It's been signaled, return success */
       return VK_SUCCESS;
 
-   case ANV_FENCE_STATE_SUBMITTED:
-      /* It's been submitted to the GPU but we don't know if it's done yet. */
-      ret = anv_gem_wait(device, fence->bo.gem_handle, &t);
-      if (ret == 0) {
+   case ANV_FENCE_STATE_SUBMITTED: {
+      VkResult result = anv_device_wait(device, &fence->bo, 0);
+      switch (result) {
+      case VK_SUCCESS:
          fence->state = ANV_FENCE_STATE_SIGNALED;
          return VK_SUCCESS;
-      } else {
+      case VK_TIMEOUT:
          return VK_NOT_READY;
+      default:
+         return result;
       }
+   }
    default:
       unreachable("Invalid fence status");
    }
@@ -1884,20 +1902,20 @@ VkResult anv_WaitForFences(
             /* These are the fences we really care about.  Go ahead and wait
              * on it until we hit a timeout.
              */
-            ret = anv_gem_wait(device, fence->bo.gem_handle, &timeout);
-            if (ret == -1 && errno == ETIME) {
-               result = VK_TIMEOUT;
-               goto done;
-            } else if (ret == -1) {
-               /* We don't know the real error. */
-                device->lost = true;
-               return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
-            } else {
+            result = anv_device_wait(device, &fence->bo, 0);
+            switch (result) {
+            case VK_SUCCESS:
                fence->state = ANV_FENCE_STATE_SIGNALED;
                signaled_fences = true;
                if (!waitAll)
-                  return VK_SUCCESS;
-               continue;
+                  goto done;
+               break;
+
+            case VK_TIMEOUT:
+               goto done;
+
+            default:
+               return result;
             }
          }
       }
diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c
index 0dde6d9..7d4b638 100644
--- a/src/intel/vulkan/anv_gem.c
+++ b/src/intel/vulkan/anv_gem.c
@@ -301,6 +301,24 @@ anv_gem_get_aperture(int fd, uint64_t *size)
    return 0;
 }
 
+bool
+anv_gem_gpu_has_reset(struct anv_device *device)
+{
+   struct drm_i915_reset_stats stats = {
+      .ctx_id = device->context_id,
+   };
+
+   int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats);
+   if (ret == -1) {
+      /* This really shouldn't be possible but the impossible should probably
+       * be treated as a GPU hang anyway.
+       */
+      return true;
+   }
+
+   return stats.batch_active > 0 || stats.batch_pending > 0;
+}
+
 int
 anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle)
 {
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 27c887c..f0a2b8d 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -637,6 +637,8 @@ void anv_device_finish_blorp(struct anv_device *device);
 VkResult anv_device_execbuf(struct anv_device *device,
                             struct drm_i915_gem_execbuffer2 *execbuf,
                             struct anv_bo **execbuf_bos);
+VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo,
+                         int64_t timeout);
 
 void* anv_gem_mmap(struct anv_device *device,
                    uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags);
@@ -654,6 +656,7 @@ int anv_gem_destroy_context(struct anv_device *device, int context);
 int anv_gem_get_param(int fd, uint32_t param);
 bool anv_gem_get_bit6_swizzle(int fd, uint32_t tiling);
 int anv_gem_get_aperture(int fd, uint64_t *size);
+bool anv_gem_gpu_has_reset(struct anv_device *device);
 int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
 uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
 int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 3610665..7ea9404 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -143,8 +143,6 @@ VkResult genX(GetQueryPoolResults)(
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
-   int64_t timeout = INT64_MAX;
-   int ret;
 
    assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
           pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
@@ -157,12 +155,9 @@ VkResult genX(GetQueryPoolResults)(
       return VK_SUCCESS;
 
    if (flags & VK_QUERY_RESULT_WAIT_BIT) {
-      ret = anv_gem_wait(device, pool->bo.gem_handle, &timeout);
-      if (ret == -1) {
-         /* We don't know the real error. */
-         return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                          "gem_wait failed %m");
-      }
+      VkResult result = anv_device_wait(device, &pool->bo, INT64_MAX);
+      if (result != VK_SUCCESS)
+         return result;
    }
 
    void *data_end = pData + dataSize;
-- 
2.5.0.400.gff86faf