Mesa (master): v3dv: don't use a dedicated BO for each occlusion query

Thu Apr 15 12:58:03 UTC 2021

Module: Mesa
Branch: master
Commit: bed3f31fc66280b39dbba81a83fa41c2231a6653
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=bed3f31fc66280b39dbba81a83fa41c2231a6653

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Wed Apr 14 13:34:00 2021 +0200

v3dv: don't use a dedicated BO for each occlusion query

Dedicated BOs waste memory and are also a significant cause of CPU
overhead when applications use hundreds of them per frame due to
all the work the kernel has to do to page in all these BOs for a job.
The UE4 Vehicle demo was hitting this causing it to freeze and stutter
under 1fps.

The hardware allows us to setup groups of 16 queries in consecutive
4-byte addresses, requiring only that each group of 16 queries is
aligned to a 1024 byte boundary. With this change, we allocate all
the queries in a pool in a single BO and we assign them different
offsets based on the above restriction. This eliminates the freezes
and stutters in the Vehicle sample.

One caveat of this solution is that we can only wait or test for
completion of a query by testing if the GPU is still using its BO,
which basically means that we can only wait for all active queries
in a pool to complete and not just the ones being requested by the
API. Since the Vulkan recommendation is to use a different query
pool per frame this should not be a big issue though.

If this ever becomes a problem (for example if an application does't
follow the recommendation and instead allocates a single pool and
splits its queries between frames), we could try to group queries
in a pool into a number of BOs to try and find a balance, but for
now this should work fine in most cases.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10253>

---

 src/broadcom/vulkan/v3dv_cmd_buffer.c | 14 ++++----
 src/broadcom/vulkan/v3dv_private.h    | 20 ++++++++---
 src/broadcom/vulkan/v3dv_query.c      | 64 ++++++++++++++++++++---------------
 src/broadcom/vulkan/v3dv_queue.c      | 13 ++++---
 4 files changed, 68 insertions(+), 43 deletions(-)

diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 6a4e4476431..b5724d2ecc1 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -3982,9 +3982,10 @@ emit_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
    v3dv_return_if_oom(cmd_buffer, NULL);
 
    cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
-      if (cmd_buffer->state.query.active_query) {
+      if (cmd_buffer->state.query.active_query.bo) {
          counter.address =
-            v3dv_cl_address(cmd_buffer->state.query.active_query, 0);
+            v3dv_cl_address(cmd_buffer->state.query.active_query.bo,
+                            cmd_buffer->state.query.active_query.offset);
       }
    }
 
@@ -4927,10 +4928,11 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
                             VkQueryControlFlags flags)
 {
    /* FIXME: we only support one active query for now */
-   assert(cmd_buffer->state.query.active_query == NULL);
+   assert(cmd_buffer->state.query.active_query.bo == NULL);
    assert(query < pool->query_count);
 
-   cmd_buffer->state.query.active_query = pool->queries[query].bo;
+   cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
+   cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
 }
 
@@ -4940,7 +4942,7 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
                           uint32_t query)
 {
    assert(query < pool->query_count);
-   assert(cmd_buffer->state.query.active_query != NULL);
+   assert(cmd_buffer->state.query.active_query.bo != NULL);
 
    if  (cmd_buffer->state.pass) {
       /* Queue the EndQuery in the command buffer state, we will create a CPU
@@ -4973,7 +4975,7 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
       list_addtail(&job->list_link, &cmd_buffer->jobs);
    }
 
-   cmd_buffer->state.query.active_query = NULL;
+   cmd_buffer->state.query.active_query.bo = NULL;
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
 }
 
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 8a9943bdf32..91c97d4a5da 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -1141,10 +1141,13 @@ struct v3dv_cmd_buffer_state {
          struct v3dv_end_query_cpu_job_info *states;
       } end;
 
-      /* This is not NULL if we have an active query, that is, we have called
-       * vkCmdBeginQuery but not vkCmdEndQuery.
+      /* This BO is not NULL if we have an active query, that is, we have
+       * called vkCmdBeginQuery but not vkCmdEndQuery.
        */
-      struct v3dv_bo *active_query;
+      struct {
+         struct v3dv_bo *bo;
+         uint32_t offset;
+      } active_query;
    } query;
 };
 
@@ -1205,14 +1208,21 @@ struct v3dv_combined_image_sampler_descriptor {
 struct v3dv_query {
    bool maybe_available;
    union {
-      struct v3dv_bo *bo; /* Used by GPU queries (occlusion) */
-      uint64_t value; /* Used by CPU queries (timestamp) */
+      /* Used by GPU queries (occlusion) */
+      struct {
+         struct v3dv_bo *bo;
+         uint32_t offset;
+      };
+      /* Used by CPU queries (timestamp) */
+      uint64_t value;
    };
 };
 
 struct v3dv_query_pool {
    struct vk_object_base base;
 
+   struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */
+
    VkQueryType query_type;
    uint32_t query_count;
    struct v3dv_query *queries;
diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
index d3100498cbe..94ea46ee5d9 100644
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
@@ -35,9 +35,6 @@ v3dv_CreateQueryPool(VkDevice _device,
           pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP);
    assert(pCreateInfo->queryCount > 0);
 
-   /* FIXME: the hw allows us to allocate up to 16 queries in a single block
-    *        for occlussion queries so we should try to use that.
-    */
    struct v3dv_query_pool *pool =
       vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
                        VK_OBJECT_TYPE_QUERY_POOL);
@@ -54,25 +51,38 @@ v3dv_CreateQueryPool(VkDevice _device,
                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (pool->queries == NULL) {
       result = vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-      goto fail_alloc_bo_list;
+      goto fail;
+   }
+
+   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+      /* The hardware allows us to setup groups of 16 queries in consecutive
+       * 4-byte addresses, requiring only that each group of 16 queries is
+       * aligned to a 1024 byte boundary.
+       */
+      const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
+      const uint32_t bo_size = query_groups * 1024;
+      pool->bo = v3dv_bo_alloc(device, bo_size, "query", true);
+      if (!pool->bo) {
+         result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+         goto fail;
+      }
+      if (!v3dv_bo_map(device, pool->bo, bo_size)) {
+         result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+         goto fail;
+      }
    }
 
    uint32_t i;
    for (i = 0; i < pool->query_count; i++) {
       pool->queries[i].maybe_available = false;
       switch (pool->query_type) {
-      case VK_QUERY_TYPE_OCCLUSION:
-         pool->queries[i].bo = v3dv_bo_alloc(device, 4096, "query", true);
-         if (!pool->queries[i].bo) {
-            result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
-            goto fail_alloc_bo;
-         }
-         /* For occlusion queries we only need a 4-byte counter */
-         if (!v3dv_bo_map(device, pool->queries[i].bo, 4)) {
-            result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
-            goto fail_alloc_bo;
-         }
+      case VK_QUERY_TYPE_OCCLUSION: {
+         const uint32_t query_group = i / 16;
+         const uint32_t query_offset = query_group * 1024 + (i % 16) * 4;
+         pool->queries[i].bo = pool->bo;
+         pool->queries[i].offset = query_offset;
          break;
+         }
       case VK_QUERY_TYPE_TIMESTAMP:
          pool->queries[i].value = 0;
          break;
@@ -85,12 +95,11 @@ v3dv_CreateQueryPool(VkDevice _device,
 
    return VK_SUCCESS;
 
-fail_alloc_bo:
-   for (uint32_t j = 0; j < i; j++)
-      v3dv_bo_free(device, pool->queries[j].bo);
-   vk_free2(&device->vk.alloc, pAllocator, pool->queries);
-
-fail_alloc_bo_list:
+fail:
+   if (pool->bo)
+      v3dv_bo_free(device, pool->bo);
+   if (pool->queries)
+      vk_free2(&device->vk.alloc, pAllocator, pool->queries);
    vk_object_free(&device->vk, pAllocator, pool);
 
    return result;
@@ -107,12 +116,12 @@ v3dv_DestroyQueryPool(VkDevice _device,
    if (!pool)
       return;
 
-   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
-      for (uint32_t i = 0; i < pool->query_count; i++)
-         v3dv_bo_free(device, pool->queries[i].bo);
-   }
+   if (pool->bo)
+      v3dv_bo_free(device, pool->bo);
+
+   if (pool->queries)
+      vk_free2(&device->vk.alloc, pAllocator, pool->queries);
 
-   vk_free2(&device->vk.alloc, pAllocator, pool->queries);
    vk_object_free(&device->vk, pAllocator, pool);
 }
 
@@ -159,7 +168,8 @@ get_occlusion_query_result(struct v3dv_device *device,
       *available = q->maybe_available && v3dv_bo_wait(device, q->bo, 0);
    }
 
-   return (uint64_t) *((uint32_t *) q->bo->map);
+   const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
+   return (uint64_t) *((uint32_t *)query_addr);
 }
 
 static uint64_t
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 6ea6d1acff8..5698981673a 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -163,19 +163,22 @@ handle_reset_query_cpu_job(struct v3dv_job *job)
     * FIXME: we could avoid blocking the main thread for this if we use
     *        submission thread.
     */
+   if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
+         v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
+
    for (uint32_t i = info->first; i < info->first + info->count; i++) {
       assert(i < info->pool->query_count);
-      struct v3dv_query *query = &info->pool->queries[i];
-      query->maybe_available = false;
+      struct v3dv_query *q = &info->pool->queries[i];
+      q->maybe_available = false;
       switch (info->pool->query_type) {
       case VK_QUERY_TYPE_OCCLUSION: {
-         v3dv_bo_wait(job->device, query->bo, PIPE_TIMEOUT_INFINITE);
-         uint32_t *counter = (uint32_t *) query->bo->map;
+         const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset;
+         uint32_t *counter = (uint32_t *) q_addr;
          *counter = 0;
          break;
       }
       case VK_QUERY_TYPE_TIMESTAMP:
-         query->value = 0;
+         q->value = 0;
          break;
       default:
          unreachable("Unsupported query type");