Mesa (main): radv: add segregated fit shader memory allocator

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Oct 13 05:33:39 UTC 2021


Module: Mesa
Branch: main
Commit: a1069b8bd4148e3cc924d598aa67e3522933b9c8
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=a1069b8bd4148e3cc924d598aa67e3522933b9c8

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Fri May 14 12:44:17 2021 +0100

radv: add segregated fit shader memory allocator

Way faster than the previous one, especially with a large number of
shaders.

This doesn't have much of an effect right now, but the previous allocator
was expensive compared to the cost of compiling vertex shader prologs.

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11717>

---

 src/amd/vulkan/radv_constants.h |   7 +
 src/amd/vulkan/radv_device.c    |   5 +-
 src/amd/vulkan/radv_private.h   |   8 +-
 src/amd/vulkan/radv_shader.c    | 312 +++++++++++++++++++++++++++++++---------
 src/amd/vulkan/radv_shader.h    |  37 +++--
 5 files changed, 284 insertions(+), 85 deletions(-)

diff --git a/src/amd/vulkan/radv_constants.h b/src/amd/vulkan/radv_constants.h
index eb66897a686..c3f057d8ff6 100644
--- a/src/amd/vulkan/radv_constants.h
+++ b/src/amd/vulkan/radv_constants.h
@@ -95,4 +95,11 @@
 
 #define RADV_MAX_HIT_ATTRIB_SIZE 32
 
+#define RADV_SHADER_ALLOC_ALIGNMENT      256
+#define RADV_SHADER_ALLOC_MIN_ARENA_SIZE (256 * 1024)
+#define RADV_SHADER_ALLOC_MIN_SIZE_CLASS 8
+#define RADV_SHADER_ALLOC_MAX_SIZE_CLASS 15
+#define RADV_SHADER_ALLOC_NUM_FREE_LISTS                                                           \
+   (RADV_SHADER_ALLOC_MAX_SIZE_CLASS - RADV_SHADER_ALLOC_MIN_SIZE_CLASS + 1)
+
 #endif /* RADV_CONSTANTS_H */
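
These constants size the allocator: allocations are 256-byte aligned, arenas start at 256 KiB, and holes are bucketed into RADV_SHADER_ALLOC_NUM_FREE_LISTS = 15 - 8 + 1 = 8 size classes, covering 2^8 = 256 bytes up to 2^15 = 32 KiB and beyond. A minimal sketch of the bucketing, with log2_floor() standing in for Mesa's util_logbase2() (illustrative names, not part of the patch):

   #include <stdio.h>

   #define MIN_SIZE_CLASS 8
   #define MAX_SIZE_CLASS 15
   #define NUM_FREE_LISTS (MAX_SIZE_CLASS - MIN_SIZE_CLASS + 1) /* = 8 */

   /* floor(log2(x)) for x > 0; stand-in for Mesa's util_logbase2(). */
   static unsigned log2_floor(unsigned x)
   {
      unsigned r = 0;
      while (x >>= 1)
         r++;
      return r;
   }

   int main(void)
   {
      /* Sizes below 2^8 land in class 0; 2^15 and up share the last class. */
      const unsigned sizes[] = {256, 1024, 4096, 40000, 1u << 20};
      for (int i = 0; i < 5; i++) {
         unsigned cls = log2_floor(sizes[i]);
         cls = cls < MIN_SIZE_CLASS ? 0 : cls - MIN_SIZE_CLASS;
         if (cls > NUM_FREE_LISTS - 1)
            cls = NUM_FREE_LISTS - 1;
         printf("size %7u -> free list %u\n", sizes[i], cls);
      }
      return 0;
   }
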
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 8cf934e35db..6cc96e00d9b 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -2904,8 +2904,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
 
    device->image_float32_atomics = image_float32_atomics;
 
-   mtx_init(&device->shader_slab_mutex, mtx_plain);
-   list_inithead(&device->shader_slabs);
+   radv_init_shader_arenas(device);
 
    device->overallocation_disallowed = overallocation_disallowed;
    mtx_init(&device->overallocation_mutex, mtx_plain);
@@ -3212,7 +3211,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    radv_trap_handler_finish(device);
    radv_finish_trace(device);
 
-   radv_destroy_shader_slabs(device);
+   radv_destroy_shader_arenas(device);
 
    u_cnd_monotonic_destroy(&device->timeline_cond);
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 1a64a439c5e..c65acb1cd64 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -770,8 +770,12 @@ struct radv_device {
     */
    uint32_t image_mrt_offset_counter;
    uint32_t fmask_mrt_offset_counter;
-   struct list_head shader_slabs;
-   mtx_t shader_slab_mutex;
+
+   struct list_head shader_arenas;
+   uint8_t shader_free_list_mask;
+   struct list_head shader_free_lists[RADV_SHADER_ALLOC_NUM_FREE_LISTS];
+   struct list_head shader_block_obj_pool;
+   mtx_t shader_arena_mutex;
 
    /* For detecting VM faults reported by dmesg. */
    uint64_t dmesg_timestamp;
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index d5a8986663b..defa4298a0d 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -1023,84 +1023,265 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
    }
 }
 
-static void *
-radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
+static unsigned
+get_size_class(unsigned size, bool round_up)
 {
-   mtx_lock(&device->shader_slab_mutex);
-   list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
-   {
-      uint64_t offset = 0;
+   size = round_up ? util_logbase2_ceil(size) : util_logbase2(size);
+   unsigned size_class =
+      MAX2(size, RADV_SHADER_ALLOC_MIN_SIZE_CLASS) - RADV_SHADER_ALLOC_MIN_SIZE_CLASS;
+   return MIN2(size_class, RADV_SHADER_ALLOC_NUM_FREE_LISTS - 1);
+}
 
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wshadow"
-#endif
-      list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list)
+static void
+remove_hole(struct radv_device *device, union radv_shader_arena_block *hole)
+{
+   unsigned size_class = get_size_class(hole->size, false);
+   list_del(&hole->freelist);
+   if (list_is_empty(&device->shader_free_lists[size_class]))
+      device->shader_free_list_mask &= ~(1u << size_class);
+}
+
+static void
+add_hole(struct radv_device *device, union radv_shader_arena_block *hole)
+{
+   unsigned size_class = get_size_class(hole->size, false);
+   list_addtail(&hole->freelist, &device->shader_free_lists[size_class]);
+   device->shader_free_list_mask |= 1u << size_class;
+}
+
+static union radv_shader_arena_block *
+alloc_block_obj(struct radv_device *device)
+{
+   if (!list_is_empty(&device->shader_block_obj_pool)) {
+      union radv_shader_arena_block *block =
+         list_first_entry(&device->shader_block_obj_pool, union radv_shader_arena_block, pool);
+      list_del(&block->pool);
+      return block;
+   }
+
+   return malloc(sizeof(union radv_shader_arena_block));
+}
+
+static void
+free_block_obj(struct radv_device *device, union radv_shader_arena_block *block)
+{
+   list_add(&block->pool, &device->shader_block_obj_pool);
+}
+
+/* Segregated fit allocator, implementing a good-fit allocation policy.
+ *
+ * This is a variation of sequential fit allocation with several lists of free blocks ("holes")
+ * instead of one. Each list of holes only contains holes of a certain range of sizes, so holes that
+ * are too small can easily be ignored while allocating. Because this also ignores holes that are
+ * larger than necessary (approximating best-fit allocation), this could be described as a
+ * "good-fit" allocator.
+ *
+ * Typically, shaders are allocated and only freed when the device is destroyed. For this pattern,
+ * this should allocate blocks for shaders fast and with no fragmentation, while still allowing
+ * freed memory to be re-used.
+ */
+static union radv_shader_arena_block *
+alloc_shader_memory(struct radv_device *device, uint32_t size, void *ptr)
+{
+   size = align(size, RADV_SHADER_ALLOC_ALIGNMENT);
+
+   mtx_lock(&device->shader_arena_mutex);
+
+   /* Try to use an existing hole. Unless the shader is very large, this should only have to look
+    * at the first one available.
+    */
+   unsigned free_list_mask = BITFIELD_MASK(RADV_SHADER_ALLOC_NUM_FREE_LISTS);
+   unsigned size_class =
+      ffs(device->shader_free_list_mask & (free_list_mask << get_size_class(size, true)));
+   if (size_class) {
+      size_class--;
+
+      list_for_each_entry(union radv_shader_arena_block, hole,
+                          &device->shader_free_lists[size_class], freelist)
       {
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-         if (s->bo_offset - offset >= shader->code_size) {
-            shader->bo = slab->bo;
-            shader->bo_offset = offset;
-            list_addtail(&shader->slab_list, &s->slab_list);
-            mtx_unlock(&device->shader_slab_mutex);
-            return slab->ptr + offset;
+         if (hole->size < size)
+            continue;
+
+         assert(hole->offset % RADV_SHADER_ALLOC_ALIGNMENT == 0);
+
+         if (size == hole->size) {
+            remove_hole(device, hole);
+            hole->freelist.next = ptr;
+            mtx_unlock(&device->shader_arena_mutex);
+            return hole;
+         } else {
+            union radv_shader_arena_block *alloc = alloc_block_obj(device);
+            if (!alloc) {
+               mtx_unlock(&device->shader_arena_mutex);
+               return NULL;
+            }
+            list_addtail(&alloc->list, &hole->list);
+            alloc->freelist.prev = NULL;
+            alloc->freelist.next = ptr;
+            alloc->arena = hole->arena;
+            alloc->offset = hole->offset;
+            alloc->size = size;
+
+            remove_hole(device, hole);
+            hole->offset += size;
+            hole->size -= size;
+            add_hole(device, hole);
+
+            mtx_unlock(&device->shader_arena_mutex);
+            return alloc;
          }
-         offset = align_u64(s->bo_offset + s->code_size, 256);
-      }
-      if (offset <= slab->size && slab->size - offset >= shader->code_size) {
-         shader->bo = slab->bo;
-         shader->bo_offset = offset;
-         list_addtail(&shader->slab_list, &slab->shaders);
-         mtx_unlock(&device->shader_slab_mutex);
-         return slab->ptr + offset;
       }
    }
 
-   mtx_unlock(&device->shader_slab_mutex);
-   struct radv_shader_slab *slab = calloc(1, sizeof(struct radv_shader_slab));
+   /* Allocate a new shader arena. */
+   struct radv_shader_arena *arena = calloc(1, sizeof(struct radv_shader_arena));
+   union radv_shader_arena_block *alloc = NULL, *hole = NULL;
+   if (!arena)
+      goto fail;
 
-   slab->size = MAX2(256 * 1024, shader->code_size);
+   unsigned arena_size = MAX2(RADV_SHADER_ALLOC_MIN_ARENA_SIZE, size);
    VkResult result = device->ws->buffer_create(
-      device->ws, slab->size, 256, RADEON_DOMAIN_VRAM,
+      device->ws, arena_size, RADV_SHADER_ALLOC_ALIGNMENT, RADEON_DOMAIN_VRAM,
       RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT |
          (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? 0
                                                                          : RADEON_FLAG_READ_ONLY),
-      RADV_BO_PRIORITY_SHADER, 0, &slab->bo);
-   if (result != VK_SUCCESS) {
-      free(slab);
-      return NULL;
+      RADV_BO_PRIORITY_SHADER, 0, &arena->bo);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   list_inithead(&arena->entries);
+
+   arena->ptr = (char *)device->ws->buffer_map(arena->bo);
+   if (!arena->ptr)
+      goto fail;
+
+   alloc = alloc_block_obj(device);
+   hole = arena_size - size > 0 ? alloc_block_obj(device) : alloc;
+   if (!alloc || !hole)
+      goto fail;
+   list_addtail(&alloc->list, &arena->entries);
+   alloc->freelist.prev = NULL;
+   alloc->freelist.next = ptr;
+   alloc->arena = arena;
+   alloc->offset = 0;
+   alloc->size = size;
+
+   if (hole != alloc) {
+      hole->arena = arena;
+      hole->offset = size;
+      hole->size = arena_size - size;
+
+      list_addtail(&hole->list, &arena->entries);
+      add_hole(device, hole);
    }
 
-   slab->ptr = (char *)device->ws->buffer_map(slab->bo);
-   if (!slab->ptr) {
-      device->ws->buffer_destroy(device->ws, slab->bo);
-      free(slab);
+   list_addtail(&arena->list, &device->shader_arenas);
+
+   mtx_unlock(&device->shader_arena_mutex);
+   return alloc;
+
+fail:
+   mtx_unlock(&device->shader_arena_mutex);
+   free(alloc);
+   free(hole);
+   if (arena && arena->bo)
+      device->ws->buffer_destroy(device->ws, arena->bo);
+   free(arena);
+   return NULL;
+}
+
+static union radv_shader_arena_block *
+get_hole(struct radv_shader_arena *arena, struct list_head *head)
+{
+   if (head == &arena->entries)
       return NULL;
+
+   union radv_shader_arena_block *hole = LIST_ENTRY(union radv_shader_arena_block, head, list);
+   return hole->freelist.prev ? hole : NULL;
+}
+
+static void
+free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc)
+{
+   mtx_lock(&device->shader_arena_mutex);
+
+   union radv_shader_arena_block *hole_prev = get_hole(alloc->arena, alloc->list.prev);
+   union radv_shader_arena_block *hole_next = get_hole(alloc->arena, alloc->list.next);
+
+   union radv_shader_arena_block *hole = alloc;
+
+   /* merge with previous hole */
+   if (hole_prev) {
+      remove_hole(device, hole_prev);
+
+      hole_prev->size += hole->size;
+      list_del(&hole->list);
+      free_block_obj(device, hole);
+
+      hole = hole_prev;
+   }
+
+   /* merge with next hole */
+   if (hole_next) {
+      remove_hole(device, hole_next);
+
+      hole_next->offset -= hole->size;
+      hole_next->size += hole->size;
+      list_del(&hole->list);
+      free_block_obj(device, hole);
+
+      hole = hole_next;
    }
 
-   list_inithead(&slab->shaders);
+   if (list_is_singular(&hole->list)) {
+      struct radv_shader_arena *arena = hole->arena;
+      free_block_obj(device, hole);
+
+      device->ws->buffer_destroy(device->ws, arena->bo);
+      list_del(&arena->list);
+      free(arena);
+   } else {
+      add_hole(device, hole);
+   }
+
+   mtx_unlock(&device->shader_arena_mutex);
+}
+
+static void *
+radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
+{
+   shader->alloc = alloc_shader_memory(device, shader->code_size, shader);
+   if (!shader->alloc)
+      return NULL;
+   shader->bo = shader->alloc->arena->bo;
+   return shader->alloc->arena->ptr + shader->alloc->offset;
+}
 
-   mtx_lock(&device->shader_slab_mutex);
-   list_add(&slab->slabs, &device->shader_slabs);
+void
+radv_init_shader_arenas(struct radv_device *device)
+{
+   mtx_init(&device->shader_arena_mutex, mtx_plain);
+
+   device->shader_free_list_mask = 0;
 
-   shader->bo = slab->bo;
-   shader->bo_offset = 0;
-   list_add(&shader->slab_list, &slab->shaders);
-   mtx_unlock(&device->shader_slab_mutex);
-   return slab->ptr;
+   list_inithead(&device->shader_arenas);
+   list_inithead(&device->shader_block_obj_pool);
+   for (unsigned i = 0; i < RADV_SHADER_ALLOC_NUM_FREE_LISTS; i++)
+      list_inithead(&device->shader_free_lists[i]);
 }
 
 void
-radv_destroy_shader_slabs(struct radv_device *device)
+radv_destroy_shader_arenas(struct radv_device *device)
 {
-   list_for_each_entry_safe(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
+   list_for_each_entry_safe(union radv_shader_arena_block, block, &device->shader_block_obj_pool,
+                            pool) free(block);
+
+   list_for_each_entry_safe(struct radv_shader_arena, arena, &device->shader_arenas, list)
    {
-      device->ws->buffer_destroy(device->ws, slab->bo);
-      free(slab);
+      device->ws->buffer_destroy(device->ws, arena->bo);
+      free(arena);
    }
-   mtx_destroy(&device->shader_slab_mutex);
+   mtx_destroy(&device->shader_arena_mutex);
 }
 
 /* For the UMR disassembler. */
@@ -1735,9 +1916,7 @@ radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_varia
    if (!p_atomic_dec_zero(&variant->ref_count))
       return;
 
-   mtx_lock(&device->shader_slab_mutex);
-   list_del(&variant->slab_list);
-   mtx_unlock(&device->shader_slab_mutex);
+   free_shader_memory(device, variant->alloc);
 
    free(variant->spirv);
    free(variant->nir_string);
@@ -1750,36 +1929,33 @@ radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_varia
 uint64_t
 radv_shader_variant_get_va(const struct radv_shader_variant *variant)
 {
-   return radv_buffer_get_va(variant->bo) + variant->bo_offset;
+   return radv_buffer_get_va(variant->bo) + variant->alloc->offset;
 }
 
 struct radv_shader_variant *
 radv_find_shader_variant(struct radv_device *device, uint64_t pc)
 {
-   mtx_lock(&device->shader_slab_mutex);
-
-   list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs)
+   mtx_lock(&device->shader_arena_mutex);
+   list_for_each_entry(struct radv_shader_arena, arena, &device->shader_arenas, list)
    {
 #ifdef __GNUC__
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wshadow"
 #endif
-      list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list)
+      list_for_each_entry(union radv_shader_arena_block, block, &arena->entries, list)
       {
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif
-         uint64_t offset = align_u64(s->bo_offset + s->code_size, 256);
-         uint64_t va = radv_buffer_get_va(s->bo);
-
-         if (pc >= va + s->bo_offset && pc < va + offset) {
-            mtx_unlock(&device->shader_slab_mutex);
-            return s;
+         uint64_t start = radv_buffer_get_va(block->arena->bo) + block->offset;
+         if (!block->freelist.prev && pc >= start && pc < start + block->size) {
+            mtx_unlock(&device->shader_arena_mutex);
+            return (struct radv_shader_variant *)block->freelist.next;
          }
       }
    }
-   mtx_unlock(&device->shader_slab_mutex);
 
+   mtx_unlock(&device->shader_arena_mutex);
    return NULL;
 }
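
A note on the hole lookup in alloc_shader_memory() above: device->shader_free_list_mask keeps one bit per size class, set while that class's free list is non-empty, so a single ffs() over the masked value finds the smallest class that can satisfy the request without scanning empty buckets. A reduced, self-contained sketch of the mask trick (illustrative names, not RADV's API):

   #include <stdint.h>
   #include <stdio.h>
   #include <strings.h> /* ffs() */

   /* Bit i is set exactly while free list i is non-empty. */
   static uint8_t free_list_mask;

   /* Smallest non-empty size class >= min_class, or -1 if none fits. */
   static int first_usable_class(unsigned min_class)
   {
      unsigned candidates = free_list_mask & ~((1u << min_class) - 1);
      return ffs(candidates) - 1; /* ffs() returns 0 when no bit is set */
   }

   int main(void)
   {
      free_list_mask = (1u << 2) | (1u << 5); /* holes in classes 2 and 5 */
      printf("%d\n", first_usable_class(1)); /* 2 */
      printf("%d\n", first_usable_class(3)); /* 5: classes 3 and 4 are empty */
      printf("%d\n", first_usable_class(6)); /* -1: nothing large enough */
      return 0;
   }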
 
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 7fd12108440..c7fc550f49b 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -387,11 +387,33 @@ struct radv_shader_binary_rtld {
    uint8_t data[0];
 };
 
+struct radv_shader_arena {
+   struct list_head list;
+   struct list_head entries;
+   struct radeon_winsys_bo *bo;
+   char *ptr;
+};
+
+union radv_shader_arena_block {
+   struct list_head pool;
+   struct {
+      /* List of blocks in the arena, sorted by address. */
+      struct list_head list;
+      /* For holes, a list_head for the free-list. For allocations, freelist.prev=NULL and
+       * freelist.next is a pointer associated with the allocation.
+       */
+      struct list_head freelist;
+      struct radv_shader_arena *arena;
+      uint32_t offset;
+      uint32_t size;
+   };
+};
+
 struct radv_shader_variant {
    uint32_t ref_count;
 
    struct radeon_winsys_bo *bo;
-   uint64_t bo_offset;
+   union radv_shader_arena_block *alloc;
    struct ac_shader_config config;
    uint8_t *code_ptr;
    uint32_t code_size;
@@ -405,16 +427,6 @@ struct radv_shader_variant {
    char *disasm_string;
    char *ir_string;
    uint32_t *statistics;
-
-   struct list_head slab_list;
-};
-
-struct radv_shader_slab {
-   struct list_head slabs;
-   struct list_head shaders;
-   struct radeon_winsys_bo *bo;
-   uint64_t size;
-   char *ptr;
 };
 
 void radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader,
@@ -428,7 +440,8 @@ nir_shader *radv_shader_compile_to_nir(struct radv_device *device, struct vk_sha
                                        const struct radv_pipeline_layout *layout,
                                        const struct radv_pipeline_key *key);
 
-void radv_destroy_shader_slabs(struct radv_device *device);
+void radv_init_shader_arenas(struct radv_device *device);
+void radv_destroy_shader_arenas(struct radv_device *device);
 
 VkResult radv_create_shaders(struct radv_pipeline *pipeline,
                              struct radv_pipeline_layout *pipeline_layout,
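
One detail of union radv_shader_arena_block worth spelling out: a block is either a hole on a free list or a live allocation, never both, so the freelist node is reused as storage once the block is handed out. freelist.prev == NULL tags the block as allocated, and freelist.next holds the caller-supplied pointer (the radv_shader_variant), which radv_find_shader_variant() reads back when resolving a faulting PC. A reduced illustration of the tagging idea (simplified list node, hypothetical names):

   #include <assert.h>
   #include <stddef.h>

   struct list_head {
      struct list_head *prev, *next;
   };

   struct block {
      struct list_head freelist; /* hole: live list node; alloc: tag + user ptr */
   };

   /* Mark the block as handed out and stash its owner in the dead node. */
   static void mark_allocated(struct block *b, void *owner)
   {
      b->freelist.prev = NULL;                      /* "not on any free list" tag */
      b->freelist.next = (struct list_head *)owner; /* reuse the pointer slot */
   }

   static int is_allocated(const struct block *b)
   {
      return b->freelist.prev == NULL;
   }

   int main(void)
   {
      struct block b;
      int shader = 42; /* stand-in for a radv_shader_variant */
      mark_allocated(&b, &shader);
      assert(is_allocated(&b));
      assert(*(int *)b.freelist.next == 42);
      return 0;
   }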


