[Mesa-dev] [PATCH 06/10] i965: Add virtual memory allocator infrastructure to brw_bufmgr.
Chris Wilson
chris at chris-wilson.co.uk
Sat May 5 07:25:20 UTC 2018
Quoting Kenneth Graunke (2018-05-04 02:12:36)
> This introduces a new fast virtual memory allocator integrated with our
> BO cache bucketing. For larger objects, it falls back to the simple
> free-list allocator (util_vma).
>
> This puts the allocators in place but doesn't enable softpin yet.
> ---
> src/mesa/drivers/dri/i965/brw_bufmgr.c | 291 ++++++++++++++++++++++++-
> src/mesa/drivers/dri/i965/brw_bufmgr.h | 2 +
> 2 files changed, 292 insertions(+), 1 deletion(-)
>
> I'm happy to write more comments here. It's a pretty simple system, but
> not necessarily the most intuitive. Feel free to ask questions.
>
> diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> index 66828f319be..07c0d2f7633 100644
> --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
> +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> @@ -60,6 +60,8 @@
> #include "util/macros.h"
> #include "util/hash_table.h"
> #include "util/list.h"
> +#include "util/u_dynarray.h"
> +#include "util/vma.h"
> #include "brw_bufmgr.h"
> #include "brw_context.h"
> #include "string.h"
> @@ -98,9 +100,40 @@ atomic_add_unless(int *v, int add, int unless)
> return c == unless;
> }
>
> +/**
> + * i965 fixed-size bucketing VMA allocator.
> + *
> + * The BO cache maintains "cache buckets" for buffers of various sizes.
> + * All buffers in a given bucket are identically sized - when allocating,
> + * we always round up to the bucket size. This means that virtually all
> + * allocations are fixed-size; only buffers which are too large to fit in
> + * a bucket can be variably-sized.
> + *
> + * We create an allocator for each bucket. Each contains a free-list, where
> + * each node contains a <starting address, 64-bit bitmap> pair. Each bit
> + * represents a bucket-sized block of memory. (At the first level, each
> + * bit corresponds to a page. For the second bucket, bits correspond to
> + * two pages, and so on.) 1 means a block is free, and 0 means it's in-use.
> + *
> + * This makes allocations cheap - any bit of any node will do. We can pick
> + * the head of the list and use ffs() to find a free block. If there are
> + * none, we allocate 64 blocks from a larger allocator - either a bigger
> + * bucketing allocator, or a fallback top-level allocator for large objects.
> + */
> +struct vma_bucket_node {
> + uint64_t start_address;
> + uint64_t bitmap;
> +};
> +
> struct bo_cache_bucket {
> + /** List of cached BOs. */
> struct list_head head;
> +
> + /** Size of this bucket, in bytes. */
> uint64_t size;
> +
> + /** List of vma_bucket_nodes. */
> + struct util_dynarray vma_list[BRW_MEMZONE_COUNT];
> };
>
> struct brw_bufmgr {
> @@ -116,6 +149,8 @@ struct brw_bufmgr {
> struct hash_table *name_table;
> struct hash_table *handle_table;
>
> + struct util_vma_heap vma_allocator[BRW_MEMZONE_COUNT];
> +
> bool has_llc:1;
> bool has_mmap_wc:1;
> bool bo_reuse:1;
> @@ -128,6 +163,10 @@ static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode,
>
> static void bo_free(struct brw_bo *bo);
>
> +static uint64_t __vma_alloc(struct brw_bufmgr *bufmgr,
> + enum brw_memory_zone memzone,
> + uint64_t size, uint64_t alignment);
> +
> static uint32_t
> key_hash_uint(const void *key)
> {
> @@ -222,6 +261,198 @@ bucket_for_size(struct brw_bufmgr *bufmgr, uint64_t size)
> &bufmgr->cache_bucket[index] : NULL;
> }
>
> +static enum brw_memory_zone
> +memzone_for_address(uint64_t address)
> +{
> + const uint64_t _4GB = 1ull << 32;
> +
> + if (address >= _4GB)
> + return BRW_MEMZONE_OTHER;
> +
> + return BRW_MEMZONE_LOW_4G;
> +}
> +
> +static uint64_t
> +bucket_vma_alloc(struct brw_bufmgr *bufmgr,
> + struct bo_cache_bucket *bucket,
> + enum brw_memory_zone memzone)
> +{
> + struct util_dynarray *vma_list = &bucket->vma_list[memzone];
> + struct vma_bucket_node *node;
> +
> + if (vma_list->size == 0) {
> + /* This bucket allocator is out of space - allocate a new block of
> + * memory for 64 blocks from a larger allocator (either a larger
> + * bucket or util_vma).
> + *
> + * We align the address to the node size (64 blocks) so that
> + * bucket_vma_free can easily compute the starting address of this
> + * block by rounding any address we return down to the node size.
> + *
> + * Set the first bit used, and return the start address.
> + */
> + uint64_t node_size = 64ull * bucket->size;
> + node = util_dynarray_grow(vma_list, sizeof(struct vma_bucket_node));
if (!node)
return 0;
> + node->start_address = __vma_alloc(bufmgr, memzone, node_size, node_size);
> + node->bitmap = ~1ull;
> + return node->start_address;
> + }
> +
> + /* Pick any bit from any node - they're all the right size and free. */
> + node = util_dynarray_top_ptr(vma_list, struct vma_bucket_node);
> + int bit = ffsll(node->bitmap) - 1;
> + assert(bit >= 0 && bit <= 63);
> +
> + /* Reserve the memory by clearing the bit. */
> + assert((node->bitmap & (1ull << bit)) != 0ull);
> + node->bitmap &= ~(1ull << bit);
> +
> + /* If this node is now completely full, remove it from the free list. */
> + if (node->bitmap == 0ull) {
> + (void) util_dynarray_pop(vma_list, struct vma_bucket_node);
> + }
> +
> + return node->start_address + bit * bucket->size;
> +}
> +
> +static void
> +bucket_vma_free(struct bo_cache_bucket *bucket, uint64_t address)
> +{
> + enum brw_memory_zone memzone = memzone_for_address(address);
> + struct util_dynarray *vma_list = &bucket->vma_list[memzone];
> + const uint64_t node_bytes = 64ull * bucket->size;
> + struct vma_bucket_node *node = NULL;
> +
> + /* bucket_vma_alloc allocates 64 blocks at a time, and aligns it to
> + * that 64 block size. So, we can round down to get the starting address.
> + */
> + uint64_t start = (address / node_bytes) * node_bytes;
> +
> + /* Dividing the offset from start by bucket size gives us the bit index. */
> + int bit = (address - start) / bucket->size;
> +
> + assert(start + bit * bucket->size == address);
> +
> + util_dynarray_foreach(vma_list, struct vma_bucket_node, cur) {
> + if (cur->start_address == start) {
> + node = cur;
> + break;
> + }
> + }
So I made a suggestion earlier that the vma allocator should avoid
malloc and instead fill in a struct provided by the caller. In this
case, you would want to keep all nodes on the list (even when full) and
would also need a small vma node embedded inside the bo. That gives you
room for a backpointer to the bucket's node when the bucket allocator
was used, as opposed to using the vma allocator directly.
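A rough sketch of that idea, reusing the vma_bucket_node above
(brw_bo_vma_ref and bucket_vma_free_fast are invented names, not code
from this series):

struct brw_bo_vma_ref {
   struct vma_bucket_node *node;   /* NULL if util_vma was used directly */
   uint64_t offset;
};

static void
bucket_vma_free_fast(struct brw_bo_vma_ref *ref, uint64_t bucket_size)
{
   if (ref->node == NULL)
      return;   /* the util_vma path frees through the heap instead */

   /* Freeing is a single bit flip; full nodes never have to be dropped
    * from, or re-added to, the free list.
    */
   int bit = (ref->offset - ref->node->start_address) / bucket_size;
   ref->node->bitmap |= 1ull << bit;
}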
> +
> + if (!node) {
> + /* No node - the whole group of 64 blocks must have been in-use. */
> + node = util_dynarray_grow(vma_list, sizeof(struct vma_bucket_node));
if (!node) {
   vma_free(start, 64 * bucket->size);
   return;
}
> + node->start_address = start;
> + node->bitmap = 0ull;
> + }
> +
> + /* Set the bit to return the memory. */
> + assert((node->bitmap & (1ull << bit)) == 0ull);
> + node->bitmap |= 1ull << bit;
> +
> + /* The block might be entirely free now, and if so, we could return it
> + * to the larger allocator. But we may as well hang on to it, in case
> + * we get more allocations at this block size.
> + */
Return it! Fear the fragmentation; it'll creep up over time, and it
might become a pressing issue for the partitioned memzones.
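For example, the tail of bucket_vma_free could do roughly this (a
sketch only; it assumes bufmgr gets plumbed into bucket_vma_free so
that the vma_free() from this patch is reachable):

   /* If every bit is set again, the whole 64-block range is free; hand
    * it back to the parent allocator instead of caching it here.
    */
   if (node->bitmap == ~0ull) {
      vma_free(bufmgr, node->start_address, node_bytes);

      /* util_dynarray has no remove(), so overwrite this node with the
       * last element and pop that one off instead.
       */
      *node = *util_dynarray_pop_ptr(vma_list, struct vma_bucket_node);
   }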
> +static struct bo_cache_bucket *
> +get_bucket_allocator(struct brw_bufmgr *bufmgr, uint64_t size)
get_vma_bucket()?
I think it's useful to keep the distinction between allocation of
physical pages and allocation of GTT offsets clear in the naming.
> +{
> + /* Skip using the bucket allocator for very large sizes, as it allocates
> + * 64 of them and this can balloon rather quickly.
> + */
> + if (size > 1024 * PAGE_SIZE)
> + return NULL;
Shouldn't this be taking memzone into account? Both for its limit, and
for the freelist.
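For the limit half, something along these lines (using the
get_vma_bucket name from above; vma_bucket_limit is a made-up field,
only there to show where the zone would plug in):

static struct bo_cache_bucket *
get_vma_bucket(struct brw_bufmgr *bufmgr,
               enum brw_memory_zone memzone,
               uint64_t size)
{
   /* A small zone (or a 2GiB GTT) can't afford to hand out 64-block
    * chunks as freely as the full 48b space, so make the cutoff depend
    * on the zone rather than a single global limit.
    */
   if (size > bufmgr->vma_bucket_limit[memzone])
      return NULL;

   struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size);

   return (bucket && bucket->size == size) ? bucket : NULL;
}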
> + struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size);
> +
> + if (bucket && bucket->size == size)
It can return the wrong bucket?
> + return bucket;
> +
> + return NULL;
> +}
> +
> +/** Like vma_alloc, but returns a non-canonicalized address. */
> +static uint64_t
> +__vma_alloc(struct brw_bufmgr *bufmgr,
> + enum brw_memory_zone memzone,
> + uint64_t size,
> + uint64_t alignment)
> +{
> + /* Without softpin support, we let the kernel assign addresses. */
> + assert(brw_using_softpin(bufmgr));
> +
> + struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size);
> + uint64_t addr;
> +
> + if (bucket) {
> + addr = bucket_vma_alloc(bufmgr, bucket, memzone);
> + } else {
> + addr = util_vma_heap_alloc(&bufmgr->vma_allocator[memzone], size,
> + alignment);
I still think you want a single vma allocator that supports allocating
from both ends: anything that must be below 4GB starts at the bottom,
anything else starts at the top. You can still reserve memzones within
that and use a second allocator, or freelists, but you get the
flexibility of having all memory tracked in the top-level manager,
avoiding the worst of the static preassignment of blocks.
But it's a 48b address space, so who cares? I'd still like to see this
work for as little as a 2GiB GTT (Haswell).
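As a sketch of the shape (a hypothetical two-ended heap; none of these
names exist in util_vma today):

static uint64_t
vma_alloc_two_ended(struct two_ended_heap *heap,
                    enum brw_memory_zone memzone,
                    uint64_t size, uint64_t alignment)
{
   /* Low-4G requests grow up from the bottom of the address space,
    * everything else grows down from the top, so nothing has to be
    * statically carved up ahead of time.
    */
   if (memzone == BRW_MEMZONE_LOW_4G)
      return heap_alloc_bottom_up(heap, size, alignment);

   return heap_alloc_top_down(heap, size, alignment);
}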
> @@ -360,7 +591,16 @@ retry:
> }
> }
>
> - if (!alloc_from_cache) {
> + if (alloc_from_cache) {
> + /* If the cache BO isn't in the right memory zone, free the old
> + * memory and assign it a new address.
> + */
> + if (brw_using_softpin(bufmgr) &&
> + memzone != memzone_for_address(bo->gtt_offset)) {
> + vma_free(bufmgr, bo->gtt_offset, bo->size);
> + bo->gtt_offset = 0ull;
Can we use bo->gtt_offset == -1ull for invalid? And assert later that
a bo with that value is never added to the validation objects. But
maybe you feel strongly about reserving offset 0 as a NULL identifier
(though NULL is a valid GTT address as well :)
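i.e. roughly (BRW_BO_NO_ADDRESS is an invented name, just for the
sketch):

#define BRW_BO_NO_ADDRESS (~0ull)

      if (brw_using_softpin(bufmgr) &&
          memzone != memzone_for_address(bo->gtt_offset)) {
         vma_free(bufmgr, bo->gtt_offset, bo->size);
         bo->gtt_offset = BRW_BO_NO_ADDRESS;
      }

      /* and when building the execbuf validation list: */
      assert(bo->gtt_offset != BRW_BO_NO_ADDRESS);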
> + }
> + } else {
> bo = calloc(1, sizeof(*bo));
> if (!bo)
> goto err;
> @@ -404,6 +644,13 @@ retry:
> goto err_free;
> }
>
> + if (brw_using_softpin(bufmgr) && bo->gtt_offset == 0ull) {
> + bo->gtt_offset = vma_alloc(bufmgr, memzone, bo->size, 1);
> +
> + if (bo->gtt_offset == 0ull)
> + goto err_free;
You could just disable softpin for the bo; the kernel will find room.
But if you plan to have special memzones, then yes I can understand
outright rejection if one memzone is full.
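i.e. something like this (assuming the later patch in the series sets
EXEC_OBJECT_PINNED in bo->kflags when softpin is in use):

   if (brw_using_softpin(bufmgr) && bo->gtt_offset == 0ull) {
      bo->gtt_offset = vma_alloc(bufmgr, memzone, bo->size, 1);

      if (bo->gtt_offset == 0ull) {
         /* No room in the requested zone: leave the offset unassigned
          * and let the kernel place this one bo at execbuf time.
          */
         bo->kflags &= ~EXEC_OBJECT_PINNED;
      }
   }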
-Chris