Mesa (main): turnip: Move autotune buffers to suballoc.

Tue Apr 12 01:26:35 UTC 2022

Module: Mesa
Branch: main
Commit: 835704e669a8a7401b9cfe2fb92fd249ad3deb14
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=835704e669a8a7401b9cfe2fb92fd249ad3deb14

Author: Emma Anholt <emma at anholt.net>
Date:   Fri Mar 18 10:31:12 2022 -0700

turnip: Move autotune buffers to suballoc.

With this change, the ANGLE trex_200 trace replay performs a single BO
allocation at startup for autotune results, instead of one BO allocation
per frame (~350 over the whole replay).

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15038>

---

 src/freedreno/vulkan/tu_autotune.c   | 162 ++++++++++-------------------------
 src/freedreno/vulkan/tu_autotune.h   |  25 +-----
 src/freedreno/vulkan/tu_cmd_buffer.c |  15 +---
 src/freedreno/vulkan/tu_device.c     |   4 +
 src/freedreno/vulkan/tu_private.h    |  27 ++++++
 5 files changed, 82 insertions(+), 151 deletions(-)

diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c
index 0874fafa548..4476e50b7a7 100644
--- a/src/freedreno/vulkan/tu_autotune.c
+++ b/src/freedreno/vulkan/tu_autotune.c
@@ -57,6 +57,9 @@
  * time, so in most cases there will be no locking.
  */
 
+void
+tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);
+
 #define TU_AUTOTUNE_DEBUG_LOG 0
 /* Dump history entries on autotuner finish,
  * could be used to gather data from traces.
@@ -68,7 +71,6 @@
 /* For how many submissions we store renderpass stats. */
 #define MAX_HISTORY_LIFETIME 128
 
-#define TU_AUTOTUNE_RP_BO_SIZE 4096
 
 /**
  * Tracks results for a given renderpass key
@@ -88,62 +90,12 @@ struct tu_renderpass_history {
    uint32_t avg_samples;
 };
 
-struct tu_autotune_results_buffer
-{
-   int32_t ref_cnt;
-
-   struct tu_device *device;
-
-   /* TODO: It would be better to suballocate the space from
-    * a memory pool which would create less BOs and waste less space.
-    */
-   struct tu_bo **bos;
-   uint32_t num_bos;
-   uint32_t results_written;
-};
-
-static struct tu_autotune_results_buffer*
-tu_autotune_results_buffer_create(struct tu_device *dev)
-{
-   struct tu_autotune_results_buffer* buffer =
-      malloc(sizeof(struct tu_autotune_results_buffer));
-
-   buffer->ref_cnt = 1;
-   buffer->device = dev;
-   buffer->results_written = 0;
-   buffer->num_bos = 0;
-   buffer->bos = NULL;
-
-   return buffer;
-}
-
-void
-tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer)
-{
-   assert(buffer && buffer->ref_cnt >= 1);
-   p_atomic_inc(&buffer->ref_cnt);
-}
-
-void
-tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer)
-{
-   assert(buffer && buffer->ref_cnt >= 1);
-   if (p_atomic_dec_zero(&buffer->ref_cnt)) {
-      for (int i = 0; i < buffer->num_bos; i++)
-         tu_bo_finish(buffer->device, buffer->bos[i]);
-
-      ralloc_free(buffer->bos);
-      free(buffer);
-   }
-}
-
 /* Holds per-submission cs which writes the fence. */
 struct tu_submission_data {
    struct list_head node;
    uint32_t fence;
 
    struct tu_cs fence_cs;
-   struct tu_autotune_results_buffer **buffers;
    uint32_t buffers_count;
 };
 
@@ -175,11 +127,7 @@ free_submission_data(struct tu_submission_data *data)
 {
    list_del(&data->node);
    tu_cs_finish(&data->fence_cs);
-   for (uint32_t i = 0; i < data->buffers_count; i++) {
-      tu_autotune_results_buffer_unref(data->buffers[i]);
-   }
 
-   free(data->buffers);
    free(data);
 }
 
@@ -220,16 +168,17 @@ hash_renderpass_instance(const struct tu_render_pass *pass,
 }
 
 static void
-free_result(struct tu_renderpass_result *result)
+free_result(struct tu_device *dev, struct tu_renderpass_result *result)
 {
+   tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
    list_del(&result->node);
    free(result);
 }
 
 static void
-free_history(struct tu_renderpass_history *history)
+free_history(struct tu_device *dev, struct tu_renderpass_history *history)
 {
-   tu_autotune_free_results(&history->results);
+   tu_autotune_free_results_locked(dev, &history->results);
    free(history);
 }
 
@@ -266,7 +215,7 @@ create_history_result(struct tu_autotune *at, uint64_t rp_key)
 }
 
 static void
-history_add_result(struct tu_renderpass_history *history,
+history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
                       struct tu_renderpass_result *result)
 {
    list_delinit(&result->node);
@@ -280,7 +229,9 @@ history_add_result(struct tu_renderpass_history *history,
        */
       struct tu_renderpass_result *old_result =
          list_last_entry(&history->results, struct tu_renderpass_result, node);
-      free_result(old_result);
+      mtx_lock(&dev->autotune_mutex);
+      free_result(dev, old_result);
+      mtx_unlock(&dev->autotune_mutex);
    }
 
    /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
@@ -297,7 +248,8 @@ history_add_result(struct tu_renderpass_history *history,
 static void
 process_results(struct tu_autotune *at)
 {
-   struct tu6_global *global = at->device->global_bo->map;
+   struct tu_device *dev = at->device;
+   struct tu6_global *global = dev->global_bo->map;
    uint32_t current_fence = global->autotune_fence;
 
    list_for_each_entry_safe(struct tu_renderpass_result, result,
@@ -309,7 +261,7 @@ process_results(struct tu_autotune *at)
       result->samples_passed =
          result->samples->samples_end - result->samples->samples_start;
 
-      history_add_result(history, result);
+      history_add_result(dev, history, result);
    }
 
    list_for_each_entry_safe(struct tu_submission_data, submission_data,
@@ -338,6 +290,7 @@ queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
          /* TODO: copying each result isn't nice */
          struct tu_renderpass_result *copy = malloc(sizeof(*result));
          *copy = *result;
+         tu_bo_get_ref(copy->bo.bo);
          list_addtail(&copy->node, &at->pending_results);
       }
    }
@@ -393,19 +346,13 @@ tu_autotune_on_submit(struct tu_device *dev,
    struct tu_submission_data *submission_data =
       create_submission_data(dev, at);
    submission_data->buffers_count = result_buffers;
-   submission_data->buffers =
-      malloc(sizeof(struct tu_autotune_results_buffer *) * result_buffers);
 
-   uint32_t buffer_idx = 0;
    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
       struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
       if (list_is_empty(&cmdbuf->renderpass_autotune_results))
          continue;
 
       queue_pending_results(at, cmdbuf);
-
-      submission_data->buffers[buffer_idx++] = cmdbuf->autotune_buffer;
-      tu_autotune_results_buffer_ref(cmdbuf->autotune_buffer);
    }
 
 #if TU_AUTOTUNE_DEBUG_LOG != 0
@@ -430,7 +377,9 @@ tu_autotune_on_submit(struct tu_device *dev,
       _mesa_hash_table_remove_key(at->ht, &history->key);
       u_rwlock_wrunlock(&at->ht_lock);
 
-      free_history(history);
+      mtx_lock(&dev->autotune_mutex);
+      free_history(dev, history);
+      mtx_unlock(&dev->autotune_mutex);
    }
 
    return &submission_data->fence_cs;
@@ -480,12 +429,14 @@ tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
    }
 #endif
 
-   tu_autotune_free_results(&at->pending_results);
+   tu_autotune_free_results(dev, &at->pending_results);
 
+   mtx_lock(&dev->autotune_mutex);
    hash_table_foreach(at->ht, entry) {
       struct tu_renderpass_history *history = entry->data;
-      free_history(history);
+      free_history(dev, history);
    }
+   mtx_unlock(&dev->autotune_mutex);
 
    list_for_each_entry_safe(struct tu_submission_data, submission_data,
                             &at->pending_submission_data, node) {
@@ -510,14 +461,22 @@ tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
 }
 
 void
-tu_autotune_free_results(struct list_head *results)
+tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
 {
    list_for_each_entry_safe(struct tu_renderpass_result, result,
                             results, node) {
-      free_result(result);
+      free_result(dev, result);
    }
 }
 
+void
+tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
+{
+   mtx_lock(&dev->autotune_mutex);
+   tu_autotune_free_results_locked(dev, results);
+   mtx_unlock(&dev->autotune_mutex);
+}
+
 static bool
 fallback_use_bypass(const struct tu_render_pass *pass,
                     const struct tu_framebuffer *framebuffer,
@@ -624,32 +583,6 @@ tu_autotune_use_bypass(struct tu_autotune *at,
    return fallback_use_bypass(pass, framebuffer, cmd_buffer);
 }
 
-static uint32_t
-get_offset_for_renderpass(struct tu_autotune_results_buffer *buffer)
-{
-   uint32_t results_per_bo =
-      TU_AUTOTUNE_RP_BO_SIZE / sizeof(struct tu_renderpass_samples);
-   return (buffer->results_written % results_per_bo) *
-          sizeof(struct tu_renderpass_samples);
-}
-
-static struct tu_bo *
-get_bo_for_renderpass(struct tu_autotune_results_buffer *buffer)
-{
-   if (get_offset_for_renderpass(buffer) == 0) {
-      buffer->num_bos++;
-      buffer->bos =
-         reralloc(NULL, buffer->bos, struct tu_bo *, buffer->num_bos);
-      struct tu_bo **new_bo = &buffer->bos[buffer->num_bos - 1];
-
-      tu_bo_init_new(buffer->device, new_bo, TU_AUTOTUNE_RP_BO_SIZE,
-                     TU_BO_ALLOC_NO_FLAGS);
-      tu_bo_map(buffer->device, *new_bo);
-   }
-
-   return buffer->bos[buffer->num_bos - 1];
-}
-
 void
 tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
@@ -658,21 +591,21 @@ tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
    if (!autotune_result)
       return;
 
-   /* Lazily allocate memory for renderpass results.
-    * Secondary command buffers do not support renderpasses.
-    */
-   assert(cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-   if (!cmd->autotune_buffer) {
-      cmd->autotune_buffer = tu_autotune_results_buffer_create(cmd->device);
-   }
+   struct tu_device *dev = cmd->device;
 
-   uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
-   struct tu_bo *bo = get_bo_for_renderpass(cmd->autotune_buffer);
+   static const uint32_t size = sizeof(struct tu_renderpass_samples);
 
-   uint64_t result_iova = bo->iova + bo_offset;
+   mtx_lock(&dev->autotune_mutex);
+   VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
+   mtx_unlock(&dev->autotune_mutex);
+   if (ret != VK_SUCCESS) {
+      autotune_result->bo.iova = 0;
+      return;
+   }
 
-   autotune_result->samples =
-      (struct tu_renderpass_samples *) (bo->map + bo_offset);
+   uint64_t result_iova = autotune_result->bo.iova;
+
+   autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo);
 
    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
 
@@ -689,11 +622,10 @@ void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
    if (!autotune_result)
       return;
 
-   uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
-   struct tu_bo *bo = cmd->autotune_buffer->bos[cmd->autotune_buffer->num_bos - 1];
-   cmd->autotune_buffer->results_written += 1;
+   if (!autotune_result->bo.iova)
+      return;
 
-   uint64_t result_iova = bo->iova + bo_offset +
+   uint64_t result_iova = autotune_result->bo.iova +
                           offsetof(struct tu_renderpass_samples, samples_end);
 
    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h
index 6d3fba012a6..633e5ba5e89 100644
--- a/src/freedreno/vulkan/tu_autotune.h
+++ b/src/freedreno/vulkan/tu_autotune.h
@@ -32,6 +32,7 @@ struct tu_device;
 struct tu_cmd_buffer;
 
 struct tu_renderpass_history;
+struct tu_renderpass_result;
 
 /**
  * "autotune" our decisions about bypass vs GMEM rendering, based on historical
@@ -111,32 +112,13 @@ struct tu_renderpass_samples {
    uint64_t __pad1;
 };
 
-/**
- * Tracks the results from an individual renderpass. Initially created
- * per renderpass, and appended to the tail of at->pending_results. At a later
- * time, when the GPU has finished writing the results, we fill samples_passed.
- */
-struct tu_renderpass_result {
-   /* Points into GPU memory */
-   struct tu_renderpass_samples* samples;
-
-   /*
-    * Below here, only used internally within autotune
-    */
-   uint64_t rp_key;
-   struct tu_renderpass_history *history;
-   struct list_head node;
-   uint32_t fence;
-   uint64_t samples_passed;
-};
-
 VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
 void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
 
 bool tu_autotune_use_bypass(struct tu_autotune *at,
                             struct tu_cmd_buffer *cmd_buffer,
                             struct tu_renderpass_result **autotune_result);
-void tu_autotune_free_results(struct list_head *results);
+void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
 
 bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                        uint32_t cmd_buffer_count);
@@ -152,9 +134,6 @@ struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
 
 struct tu_autotune_results_buffer;
 
-void tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer);
-void tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer);
-
 void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   struct tu_renderpass_result *autotune_result);
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 40bef6bdf69..05c98f0d9d4 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -1514,9 +1514,7 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
 
    u_trace_fini(&cmd_buffer->trace);
 
-   if (cmd_buffer->autotune_buffer)
-      tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
-   tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
+   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
 
    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
       if (cmd_buffer->descriptors[i].push_set.layout)
@@ -1542,16 +1540,7 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
    tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
    tu_cs_reset(&cmd_buffer->sub_cs);
 
-   /* We can't just reset the autotune_buffer's contents, because it is also
-    * referenced by the submission_data if the command buffer was submitted
-    * and we may be accessing it after cmdbuf reset/free.
-    */
-   if (cmd_buffer->autotune_buffer) {
-      tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
-      cmd_buffer->autotune_buffer = NULL;
-   }
-
-   tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
+   tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
 
    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
       memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index e2d22892fe2..351affb259c 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -1729,6 +1729,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 
    mtx_init(&device->bo_mutex, mtx_plain);
    mtx_init(&device->pipeline_mutex, mtx_plain);
+   mtx_init(&device->autotune_mutex, mtx_plain);
    u_rwlock_init(&device->dma_bo_lock);
    pthread_mutex_init(&device->submit_mutex, NULL);
 
@@ -1789,6 +1790,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 
    tu_bo_suballocator_init(&device->pipeline_suballoc, device,
                            128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP);
+   tu_bo_suballocator_init(&device->autotune_suballoc, device,
+                           128 * 1024, 0);
 
    result = tu_bo_init_new(device, &device->global_bo, global_size,
                            TU_BO_ALLOC_ALLOW_DUMP);
@@ -1992,6 +1995,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    tu_autotune_fini(&device->autotune, device);
 
    tu_bo_suballocator_finish(&device->pipeline_suballoc);
+   tu_bo_suballocator_finish(&device->autotune_suballoc);
 
    util_sparse_array_finish(&device->bo_map);
    u_rwlock_destroy(&device->dma_bo_lock);
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index b992c381cfd..28a9c5ea1c8 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -438,6 +438,27 @@ enum global_shader {
    GLOBAL_SH_COUNT,
 };
 
+/**
+ * Tracks the results from an individual renderpass. Initially created
+ * per renderpass, and appended to the tail of at->pending_results. At a later
+ * time, when the GPU has finished writing the results, we fill samples_passed.
+ */
+struct tu_renderpass_result {
+   /* Points into GPU memory */
+   struct tu_renderpass_samples* samples;
+
+   struct tu_suballoc_bo bo;
+
+   /*
+    * Below here, only used internally within autotune
+    */
+   uint64_t rp_key;
+   struct tu_renderpass_history *history;
+   struct list_head node;
+   uint32_t fence;
+   uint64_t samples_passed;
+};
+
 #define TU_BORDER_COLOR_COUNT 4096
 #define TU_BORDER_COLOR_BUILTIN 6
 
@@ -514,6 +535,12 @@ struct tu_device
    struct tu_suballocator pipeline_suballoc;
    mtx_t pipeline_mutex;
 
+   /* Device-global BO suballocator for reducing BO management for small
+    * gmem/sysmem autotune result buffers.  Synchronized by autotune_mutex.
+    */
+   struct tu_suballocator autotune_suballoc;
+   mtx_t autotune_mutex;
+
    /* the blob seems to always use 8K factor and 128K param sizes, copy them */
 #define TU_TESS_FACTOR_SIZE (8 * 1024)
 #define TU_TESS_PARAM_SIZE (128 * 1024)