[Mesa-dev] [PATCH v2 2/2] anv/query: Busy-wait for available query entries

Wed Apr 5 17:03:22 UTC 2017

Before, we were just looking at whether or not the user wanted us to
wait and waiting on the BO.  Some clients, such as the Serious engine,
use a single query pool for hundreds of individual query results where
the writes for those queries may be split across several command
buffers.  In this scenario, the individual query we're looking for may
become available long before the BO is idle so waiting on the query pool
BO to be finished is wasteful. This commit makes us instead busy-loop on
each query until it's available.

This significantly reduces pipeline bubbles and improves performance of
The Talos Principle on medium settings (where the GPU isn't overloaded
with drawing) by around 20% on my SkyLake gt4.
---

The second version is substantially simplified as recommended by Chris and
has a more descriptive commit message.

 src/intel/vulkan/genX_query.c | 52 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 7ea9404..994c387 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -131,6 +131,44 @@ cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
    }
 }
 
+static bool
+query_is_available(struct anv_device *device, uint64_t *slot)
+{
+   if (!device->info.has_llc)
+      __builtin_ia32_clflush(slot);
+
+   return slot[0];
+}
+
+static VkResult
+wait_for_available(struct anv_device *device,
+                   struct anv_query_pool *pool, uint64_t *slot)
+{
+   while (true) {
+      if (query_is_available(device, slot))
+         return VK_SUCCESS;
+
+      VkResult result = anv_device_wait(device, &pool->bo, 0);
+      switch (result) {
+      case VK_SUCCESS:
+         /* The BO is no longer busy.  If we haven't seen availability yet,
+          * then we never will.  The VK_NOT_READY can only happen if we have a
+          * client error where they call GetQueryPoolResults on a query that
+          * they haven't submitted to the GPU yet.  The spec allows us to do
+          * anything in this case, but returning VK_SUCCESS doesn't seem right
+          * and we shouldn't just keep spinning.
+          */
+         return query_is_available(device, slot) ? VK_SUCCESS : VK_NOT_READY;
+      case VK_TIMEOUT:
+         /* The BO is still busy, keep waiting. */
+         continue;
+      default:
+         /* This is most likely VK_ERROR_DEVICE_LOST */
+         return result;
+      }
+   }
+}
+
 VkResult genX(GetQueryPoolResults)(
     VkDevice                                    _device,
     VkQueryPool                                 queryPool,
@@ -154,12 +192,6 @@ VkResult genX(GetQueryPoolResults)(
    if (pData == NULL)
       return VK_SUCCESS;
 
-   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
-      VkResult result = anv_device_wait(device, &pool->bo, INT64_MAX);
-      if (result != VK_SUCCESS)
-         return result;
-   }
-
    void *data_end = pData + dataSize;
 
    if (!device->info.has_llc) {
@@ -176,6 +208,14 @@ VkResult genX(GetQueryPoolResults)(
       /* Availability is always at the start of the slot */
       bool available = slot[0];
 
+      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
+         status = wait_for_available(device, pool, slot);
+         if (status != VK_SUCCESS)
+            return status;
+
+         available = true;
+      }
+
       /* From the Vulkan 1.0.42 spec:
        *
        *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
-- 
2.5.0.400.gff86faf