Mesa (master): tu: Add a "scratch bo" allocation mechanism

Wed May 13 14:00:59 UTC 2020

Module: Mesa
Branch: master
Commit: ed79f805faf1ac5919a30d3284e37cc3f394e464
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ed79f805faf1ac5919a30d3284e37cc3f394e464

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Mon May 11 18:46:04 2020 +0200

tu: Add a "scratch bo" allocation mechanism

This is simpler than a full-blown memory reuse mechanism, but is good
enough to make sure that repeatedly doing a copy that requires the
linear staging buffer workaround won't use excessive memory or be slowed
down due to repeated allocations.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5007>

---

 src/freedreno/vulkan/tu_device.c  | 54 +++++++++++++++++++++++++++++++++++++++
 src/freedreno/vulkan/tu_private.h | 20 +++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index 668f4e79d3e..dfe4df85e32 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -39,6 +39,7 @@
 #include "compiler/glsl_types.h"
 #include "util/debug.h"
 #include "util/disk_cache.h"
+#include "util/u_atomic.h"
 #include "vk_format.h"
 #include "vk_util.h"
 
@@ -1256,6 +1257,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 
    device->mem_cache = tu_pipeline_cache_from_handle(pc);
 
+   for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++)
+      mtx_init(&device->scratch_bos[i].construct_mtx, mtx_plain);
+
    *pDevice = tu_device_to_handle(device);
    return VK_SUCCESS;
 
@@ -1302,6 +1306,11 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
          vk_free(&device->alloc, device->queues[i]);
    }
 
+   for (unsigned i = 0; i < ARRAY_SIZE(device->scratch_bos); i++) {
+      if (device->scratch_bos[i].initialized)
+         tu_bo_finish(device, &device->scratch_bos[i].bo);
+   }
+
    /* the compiler does not use pAllocator */
    ralloc_free(device->compiler);
 
@@ -1311,6 +1320,51 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    vk_free(&device->alloc, device);
 }
 
+VkResult
+tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo)
+{
+   unsigned size_log2 = MAX2(util_logbase2_ceil64(size), MIN_SCRATCH_BO_SIZE_LOG2);
+   unsigned index = size_log2 - MIN_SCRATCH_BO_SIZE_LOG2;
+   assert(index < ARRAY_SIZE(dev->scratch_bos));
+
+   for (unsigned i = index; i < ARRAY_SIZE(dev->scratch_bos); i++) {
+      if (p_atomic_read(&dev->scratch_bos[i].initialized)) {
+         /* Fast path: just return the already-allocated BO. */
+         *bo = &dev->scratch_bos[i].bo;
+         return VK_SUCCESS;
+      }
+   }
+
+   /* Slow path: actually allocate the BO. We take a lock because the process
+    * of allocating it is slow, and we don't want to block the CPU while it
+    * finishes.
+   */
+   mtx_lock(&dev->scratch_bos[index].construct_mtx);
+
+   /* Another thread may have allocated it already while we were waiting on
+    * the lock. We need to check this in order to avoid double-allocating.
+    */
+   if (dev->scratch_bos[index].initialized) {
+      mtx_unlock(&dev->scratch_bos[index].construct_mtx);
+      *bo = &dev->scratch_bos[index].bo;
+      return VK_SUCCESS;
+   }
+
+   unsigned bo_size = 1ull << size_log2;
+   VkResult result = tu_bo_init_new(dev, &dev->scratch_bos[index].bo, bo_size);
+   if (result != VK_SUCCESS) {
+      mtx_unlock(&dev->scratch_bos[index].construct_mtx);
+      return result;
+   }
+
+   p_atomic_set(&dev->scratch_bos[index].initialized, true);
+
+   mtx_unlock(&dev->scratch_bos[index].construct_mtx);
+
+   *bo = &dev->scratch_bos[index].bo;
+   return VK_SUCCESS;
+}
+
 VkResult
 tu_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount,
                                     VkLayerProperties *pProperties)
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 99da10a568b..1c32e60000a 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -509,6 +509,17 @@ struct tu_device
    uint32_t vsc_draw_strm_pitch;
    uint32_t vsc_prim_strm_pitch;
 
+#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
+
+   /* Currently the kernel driver uses a 32-bit GPU address space, but it
+    * should be impossible to go beyond 48 bits.
+    */
+   struct {
+      struct tu_bo bo;
+      mtx_t construct_mtx;
+      bool initialized;
+   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
+
    struct tu_bo border_color;
 
    struct list_head shader_slabs;
@@ -531,6 +542,15 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
 VkResult
 tu_bo_map(struct tu_device *dev, struct tu_bo *bo);
 
+/* Get a scratch bo for use inside a command buffer. This will always return
+ * the same bo given the same size or similar sizes, so only one scratch bo
+ * can be used at the same time. It's meant for short-lived things where we
+ * need to write to some piece of memory, read from it, and then immediately
+ * discard it.
+ */
+VkResult
+tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
+
 struct tu_cs_entry
 {
    /* No ownership */