Mesa (main): turnip: Move autotune buffers to suballoc.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Tue Apr 12 01:26:35 UTC 2022
Module: Mesa
Branch: main
Commit: 835704e669a8a7401b9cfe2fb92fd249ad3deb14
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=835704e669a8a7401b9cfe2fb92fd249ad3deb14
Author: Emma Anholt <emma at anholt.net>
Date: Fri Mar 18 10:31:12 2022 -0700
turnip: Move autotune buffers to suballoc.
Now the ANGLE trex_200 trace replay does a single BO allocation at startup
for autotune results instead of one per frame (~350 for the whole replay).
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15038>
---
src/freedreno/vulkan/tu_autotune.c | 162 ++++++++++-------------------------
src/freedreno/vulkan/tu_autotune.h | 25 +-----
src/freedreno/vulkan/tu_cmd_buffer.c | 15 +---
src/freedreno/vulkan/tu_device.c | 4 +
src/freedreno/vulkan/tu_private.h | 27 ++++++
5 files changed, 82 insertions(+), 151 deletions(-)
diff --git a/src/freedreno/vulkan/tu_autotune.c b/src/freedreno/vulkan/tu_autotune.c
index 0874fafa548..4476e50b7a7 100644
--- a/src/freedreno/vulkan/tu_autotune.c
+++ b/src/freedreno/vulkan/tu_autotune.c
@@ -57,6 +57,9 @@
* time, so in most cases there will be no locking.
*/
+void
+tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);
+
#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries on autotuner finish,
* could be used to gather data from traces.
@@ -68,7 +71,6 @@
/* For how many submissions we store renderpass stats. */
#define MAX_HISTORY_LIFETIME 128
-#define TU_AUTOTUNE_RP_BO_SIZE 4096
/**
* Tracks results for a given renderpass key
@@ -88,62 +90,12 @@ struct tu_renderpass_history {
uint32_t avg_samples;
};
-struct tu_autotune_results_buffer
-{
- int32_t ref_cnt;
-
- struct tu_device *device;
-
- /* TODO: It would be better to suballocate the space from
- * a memory pool which would create less BOs and waste less space.
- */
- struct tu_bo **bos;
- uint32_t num_bos;
- uint32_t results_written;
-};
-
-static struct tu_autotune_results_buffer*
-tu_autotune_results_buffer_create(struct tu_device *dev)
-{
- struct tu_autotune_results_buffer* buffer =
- malloc(sizeof(struct tu_autotune_results_buffer));
-
- buffer->ref_cnt = 1;
- buffer->device = dev;
- buffer->results_written = 0;
- buffer->num_bos = 0;
- buffer->bos = NULL;
-
- return buffer;
-}
-
-void
-tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer)
-{
- assert(buffer && buffer->ref_cnt >= 1);
- p_atomic_inc(&buffer->ref_cnt);
-}
-
-void
-tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer)
-{
- assert(buffer && buffer->ref_cnt >= 1);
- if (p_atomic_dec_zero(&buffer->ref_cnt)) {
- for (int i = 0; i < buffer->num_bos; i++)
- tu_bo_finish(buffer->device, buffer->bos[i]);
-
- ralloc_free(buffer->bos);
- free(buffer);
- }
-}
-
/* Holds per-submission cs which writes the fence. */
struct tu_submission_data {
struct list_head node;
uint32_t fence;
struct tu_cs fence_cs;
- struct tu_autotune_results_buffer **buffers;
uint32_t buffers_count;
};
@@ -175,11 +127,7 @@ free_submission_data(struct tu_submission_data *data)
{
list_del(&data->node);
tu_cs_finish(&data->fence_cs);
- for (uint32_t i = 0; i < data->buffers_count; i++) {
- tu_autotune_results_buffer_unref(data->buffers[i]);
- }
- free(data->buffers);
free(data);
}
@@ -220,16 +168,17 @@ hash_renderpass_instance(const struct tu_render_pass *pass,
}
static void
-free_result(struct tu_renderpass_result *result)
+free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
+ tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
list_del(&result->node);
free(result);
}
static void
-free_history(struct tu_renderpass_history *history)
+free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
- tu_autotune_free_results(&history->results);
+ tu_autotune_free_results_locked(dev, &history->results);
free(history);
}
@@ -266,7 +215,7 @@ create_history_result(struct tu_autotune *at, uint64_t rp_key)
}
static void
-history_add_result(struct tu_renderpass_history *history,
+history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
struct tu_renderpass_result *result)
{
list_delinit(&result->node);
@@ -280,7 +229,9 @@ history_add_result(struct tu_renderpass_history *history,
*/
struct tu_renderpass_result *old_result =
list_last_entry(&history->results, struct tu_renderpass_result, node);
- free_result(old_result);
+ mtx_lock(&dev->autotune_mutex);
+ free_result(dev, old_result);
+ mtx_unlock(&dev->autotune_mutex);
}
/* Do calculations here to avoid locking history in tu_autotune_use_bypass */
@@ -297,7 +248,8 @@ history_add_result(struct tu_renderpass_history *history,
static void
process_results(struct tu_autotune *at)
{
- struct tu6_global *global = at->device->global_bo->map;
+ struct tu_device *dev = at->device;
+ struct tu6_global *global = dev->global_bo->map;
uint32_t current_fence = global->autotune_fence;
list_for_each_entry_safe(struct tu_renderpass_result, result,
@@ -309,7 +261,7 @@ process_results(struct tu_autotune *at)
result->samples_passed =
result->samples->samples_end - result->samples->samples_start;
- history_add_result(history, result);
+ history_add_result(dev, history, result);
}
list_for_each_entry_safe(struct tu_submission_data, submission_data,
@@ -338,6 +290,7 @@ queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
/* TODO: copying each result isn't nice */
struct tu_renderpass_result *copy = malloc(sizeof(*result));
*copy = *result;
+ tu_bo_get_ref(copy->bo.bo);
list_addtail(&copy->node, &at->pending_results);
}
}
@@ -393,19 +346,13 @@ tu_autotune_on_submit(struct tu_device *dev,
struct tu_submission_data *submission_data =
create_submission_data(dev, at);
submission_data->buffers_count = result_buffers;
- submission_data->buffers =
- malloc(sizeof(struct tu_autotune_results_buffer *) * result_buffers);
- uint32_t buffer_idx = 0;
for (uint32_t i = 0; i < cmd_buffer_count; i++) {
struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
if (list_is_empty(&cmdbuf->renderpass_autotune_results))
continue;
queue_pending_results(at, cmdbuf);
-
- submission_data->buffers[buffer_idx++] = cmdbuf->autotune_buffer;
- tu_autotune_results_buffer_ref(cmdbuf->autotune_buffer);
}
#if TU_AUTOTUNE_DEBUG_LOG != 0
@@ -430,7 +377,9 @@ tu_autotune_on_submit(struct tu_device *dev,
_mesa_hash_table_remove_key(at->ht, &history->key);
u_rwlock_wrunlock(&at->ht_lock);
- free_history(history);
+ mtx_lock(&dev->autotune_mutex);
+ free_history(dev, history);
+ mtx_unlock(&dev->autotune_mutex);
}
return &submission_data->fence_cs;
@@ -480,12 +429,14 @@ tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
}
#endif
- tu_autotune_free_results(&at->pending_results);
+ tu_autotune_free_results(dev, &at->pending_results);
+ mtx_lock(&dev->autotune_mutex);
hash_table_foreach(at->ht, entry) {
struct tu_renderpass_history *history = entry->data;
- free_history(history);
+ free_history(dev, history);
}
+ mtx_unlock(&dev->autotune_mutex);
list_for_each_entry_safe(struct tu_submission_data, submission_data,
&at->pending_submission_data, node) {
@@ -510,14 +461,22 @@ tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
}
void
-tu_autotune_free_results(struct list_head *results)
+tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
list_for_each_entry_safe(struct tu_renderpass_result, result,
results, node) {
- free_result(result);
+ free_result(dev, result);
}
}
+void
+tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
+{
+ mtx_lock(&dev->autotune_mutex);
+ tu_autotune_free_results_locked(dev, results);
+ mtx_unlock(&dev->autotune_mutex);
+}
+
static bool
fallback_use_bypass(const struct tu_render_pass *pass,
const struct tu_framebuffer *framebuffer,
@@ -624,32 +583,6 @@ tu_autotune_use_bypass(struct tu_autotune *at,
return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}
-static uint32_t
-get_offset_for_renderpass(struct tu_autotune_results_buffer *buffer)
-{
- uint32_t results_per_bo =
- TU_AUTOTUNE_RP_BO_SIZE / sizeof(struct tu_renderpass_samples);
- return (buffer->results_written % results_per_bo) *
- sizeof(struct tu_renderpass_samples);
-}
-
-static struct tu_bo *
-get_bo_for_renderpass(struct tu_autotune_results_buffer *buffer)
-{
- if (get_offset_for_renderpass(buffer) == 0) {
- buffer->num_bos++;
- buffer->bos =
- reralloc(NULL, buffer->bos, struct tu_bo *, buffer->num_bos);
- struct tu_bo **new_bo = &buffer->bos[buffer->num_bos - 1];
-
- tu_bo_init_new(buffer->device, new_bo, TU_AUTOTUNE_RP_BO_SIZE,
- TU_BO_ALLOC_NO_FLAGS);
- tu_bo_map(buffer->device, *new_bo);
- }
-
- return buffer->bos[buffer->num_bos - 1];
-}
-
void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
@@ -658,21 +591,21 @@ tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
if (!autotune_result)
return;
- /* Lazily allocate memory for renderpass results.
- * Secondary command buffers do not support renderpasses.
- */
- assert(cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
- if (!cmd->autotune_buffer) {
- cmd->autotune_buffer = tu_autotune_results_buffer_create(cmd->device);
- }
+ struct tu_device *dev = cmd->device;
- uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
- struct tu_bo *bo = get_bo_for_renderpass(cmd->autotune_buffer);
+ static const uint32_t size = sizeof(struct tu_renderpass_samples);
- uint64_t result_iova = bo->iova + bo_offset;
+ mtx_lock(&dev->autotune_mutex);
+ VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
+ mtx_unlock(&dev->autotune_mutex);
+ if (ret != VK_SUCCESS) {
+ autotune_result->bo.iova = 0;
+ return;
+ }
- autotune_result->samples =
- (struct tu_renderpass_samples *) (bo->map + bo_offset);
+ uint64_t result_iova = autotune_result->bo.iova;
+
+ autotune_result->samples = tu_suballoc_bo_map(&autotune_result->bo);
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
@@ -689,11 +622,10 @@ void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
if (!autotune_result)
return;
- uint32_t bo_offset = get_offset_for_renderpass(cmd->autotune_buffer);
- struct tu_bo *bo = cmd->autotune_buffer->bos[cmd->autotune_buffer->num_bos - 1];
- cmd->autotune_buffer->results_written += 1;
+ if (!autotune_result->bo.iova)
+ return;
- uint64_t result_iova = bo->iova + bo_offset +
+ uint64_t result_iova = autotune_result->bo.iova +
offsetof(struct tu_renderpass_samples, samples_end);
tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
diff --git a/src/freedreno/vulkan/tu_autotune.h b/src/freedreno/vulkan/tu_autotune.h
index 6d3fba012a6..633e5ba5e89 100644
--- a/src/freedreno/vulkan/tu_autotune.h
+++ b/src/freedreno/vulkan/tu_autotune.h
@@ -32,6 +32,7 @@ struct tu_device;
struct tu_cmd_buffer;
struct tu_renderpass_history;
+struct tu_renderpass_result;
/**
* "autotune" our decisions about bypass vs GMEM rendering, based on historical
@@ -111,32 +112,13 @@ struct tu_renderpass_samples {
uint64_t __pad1;
};
-/**
- * Tracks the results from an individual renderpass. Initially created
- * per renderpass, and appended to the tail of at->pending_results. At a later
- * time, when the GPU has finished writing the results, we fill samples_passed.
- */
-struct tu_renderpass_result {
- /* Points into GPU memory */
- struct tu_renderpass_samples* samples;
-
- /*
- * Below here, only used internally within autotune
- */
- uint64_t rp_key;
- struct tu_renderpass_history *history;
- struct list_head node;
- uint32_t fence;
- uint64_t samples_passed;
-};
-
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);
bool tu_autotune_use_bypass(struct tu_autotune *at,
struct tu_cmd_buffer *cmd_buffer,
struct tu_renderpass_result **autotune_result);
-void tu_autotune_free_results(struct list_head *results);
+void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);
bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count);
@@ -152,9 +134,6 @@ struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
struct tu_autotune_results_buffer;
-void tu_autotune_results_buffer_ref(struct tu_autotune_results_buffer *buffer);
-void tu_autotune_results_buffer_unref(struct tu_autotune_results_buffer *buffer);
-
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_renderpass_result *autotune_result);
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 40bef6bdf69..05c98f0d9d4 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -1514,9 +1514,7 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
u_trace_fini(&cmd_buffer->trace);
- if (cmd_buffer->autotune_buffer)
- tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
- tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
+ tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
if (cmd_buffer->descriptors[i].push_set.layout)
@@ -1542,16 +1540,7 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
tu_cs_reset(&cmd_buffer->sub_cs);
- /* We can't just reset the autotune_buffer's contents, because it is also
- * referenced by the submission_data if the command buffer was submitted
- * and we may be accessing it after cmdbuf reset/free.
- */
- if (cmd_buffer->autotune_buffer) {
- tu_autotune_results_buffer_unref(cmd_buffer->autotune_buffer);
- cmd_buffer->autotune_buffer = NULL;
- }
-
- tu_autotune_free_results(&cmd_buffer->renderpass_autotune_results);
+ tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results);
for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index e2d22892fe2..351affb259c 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -1729,6 +1729,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
mtx_init(&device->bo_mutex, mtx_plain);
mtx_init(&device->pipeline_mutex, mtx_plain);
+ mtx_init(&device->autotune_mutex, mtx_plain);
u_rwlock_init(&device->dma_bo_lock);
pthread_mutex_init(&device->submit_mutex, NULL);
@@ -1789,6 +1790,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
tu_bo_suballocator_init(&device->pipeline_suballoc, device,
128 * 1024, TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP);
+ tu_bo_suballocator_init(&device->autotune_suballoc, device,
+ 128 * 1024, 0);
result = tu_bo_init_new(device, &device->global_bo, global_size,
TU_BO_ALLOC_ALLOW_DUMP);
@@ -1992,6 +1995,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
tu_autotune_fini(&device->autotune, device);
tu_bo_suballocator_finish(&device->pipeline_suballoc);
+ tu_bo_suballocator_finish(&device->autotune_suballoc);
util_sparse_array_finish(&device->bo_map);
u_rwlock_destroy(&device->dma_bo_lock);
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index b992c381cfd..28a9c5ea1c8 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -438,6 +438,27 @@ enum global_shader {
GLOBAL_SH_COUNT,
};
+/**
+ * Tracks the results from an individual renderpass. Initially created
+ * per renderpass, and appended to the tail of at->pending_results. At a later
+ * time, when the GPU has finished writing the results, we fill samples_passed.
+ */
+struct tu_renderpass_result {
+ /* Points into GPU memory */
+ struct tu_renderpass_samples* samples;
+
+ struct tu_suballoc_bo bo;
+
+ /*
+ * Below here, only used internally within autotune
+ */
+ uint64_t rp_key;
+ struct tu_renderpass_history *history;
+ struct list_head node;
+ uint32_t fence;
+ uint64_t samples_passed;
+};
+
#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6
@@ -514,6 +535,12 @@ struct tu_device
struct tu_suballocator pipeline_suballoc;
mtx_t pipeline_mutex;
+ /* Device-global BO suballocator for reducing BO management for small
+ * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
+ */
+ struct tu_suballocator autotune_suballoc;
+ mtx_t autotune_mutex;
+
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
More information about the mesa-commit
mailing list