[Mesa-dev] [PATCH v2] anv: add support for allocating more than 1 block of memory

Mon Apr 3 12:02:05 UTC 2017

On Wed, 2017-03-29 at 12:06 -0700, Jason Ekstrand wrote:
> Looking over the patch, I think I've convinced myself that it's correct.  (I honestly wasn't expecting to come to that conclusion without more iteration.)  That said, this raises some interesting questions.  I added Kristian to the Cc in case he has any input.
> 
>  1. Should we do powers of two or linear.  I'm still a fan of powers of two.

In which specific part do you mean? In free block lists? or in the
allocator_new? 

> 
>  2. Should block pools even have a block size at all? We could just make every block pool allow any power-of-two size from 4 KiB up to. say, 1 MiB and then make the block size part of the state pool or stream that's allocating from it.  At the moment, I like this idea, but I've given it very little thought.
> 
So IIUC, the idea would be the block pool is just a flat chunk of
memory, where we later fetch blocks of memory from, as requested by
applications. Is that correct?

>  3. If we go with the idea in 2. should we still call it block_pool?  I think we can keep the name but it doesn't it as well as it once did.
> 
> Thanks for working on this!  I'm sorry it's taken so long to respond.  Every time I've looked at it, my brain hasn't been in the right state to think about lock-free code. :-/
> 
> On Wed, Mar 15, 2017 at 5:05 AM, Juan A. Suarez Romero <jasuarez at igalia.com> wrote:
> > Current Anv allocator assign memory in terms of a fixed block size.
> > 
> > But there can be cases where this block is not enough for a memory
> > request, and thus several blocks must be assigned in a row.
> > 
> > This commit adds support for specifying how many blocks of memory must
> > be assigned.
> > 
> > This fixes a number dEQP-VK.pipeline.render_to_image.* tests that crash.
> > 
> > v2: lock-free free-list is not handled correctly (Jason)
> > ---
> >  src/intel/vulkan/anv_allocator.c   | 81 +++++++++++++++++++++++++++-----------
> >  src/intel/vulkan/anv_batch_chain.c |  4 +-
> >  src/intel/vulkan/anv_private.h     |  7 +++-
> >  3 files changed, 66 insertions(+), 26 deletions(-)
> > 
> > diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
> > index 45c663b..3924551 100644
> > --- a/src/intel/vulkan/anv_allocator.c
> > +++ b/src/intel/vulkan/anv_allocator.c
> > @@ -257,7 +257,8 @@ anv_block_pool_init(struct anv_block_pool *pool,
> >     pool->device = device;
> >     anv_bo_init(&pool->bo, 0, 0);
> >     pool->block_size = block_size;
> > -   pool->free_list = ANV_FREE_LIST_EMPTY;
> > +   for (uint32_t i = 0; i < ANV_MAX_BLOCKS; i++)
> > +      pool->free_list[i] = ANV_FREE_LIST_EMPTY;
> >     pool->back_free_list = ANV_FREE_LIST_EMPTY;
> > 
> >     pool->fd = memfd_create("block pool", MFD_CLOEXEC);
> > @@ -500,30 +501,35 @@ fail:
> > 
> >  static uint32_t
> >  anv_block_pool_alloc_new(struct anv_block_pool *pool,
> > -                         struct anv_block_state *pool_state)
> > +                         struct anv_block_state *pool_state,
> > +                         uint32_t n_blocks)
> 
> Maybe have this take a size rather than n_blocks?  It's only ever called by stuff in the block pool so the caller can do the multiplication.  It would certainly make some of the math below easier.
>  
> >  {
> >     struct anv_block_state state, old, new;
> > 
> >     while (1) {
> > -      state.u64 = __sync_fetch_and_add(&pool_state->u64, pool->block_size);
> > -      if (state.next < state.end) {
> > +      state.u64 = __sync_fetch_and_add(&pool_state->u64, n_blocks * pool->block_size);
> > +      if (state.next > state.end) {
> > +         futex_wait(&pool_state->end, state.end);
> > +         continue;
> > +      } else if ((state.next + (n_blocks - 1) * pool->block_size) < state.end) {
> 
> First off, please keep the if's in the same order unless we have a reason to re-arrange them.  It would make this way easier to review. :-)
> 
> Second, I think this would be much easier to read as:
> 
> if (state.next + size <= state.end) {
>    /* Success */
> } else if (state.next <= state.end) {
>    /* Our block is the one that crosses the line */
> } else {
>    /* Wait like everyone else */
> }
>  
> >           assert(pool->map);
> >           return state.next;
> > -      } else if (state.next == state.end) {
> > -         /* We allocated the first block outside the pool, we have to grow it.
> > -          * pool_state->next acts a mutex: threads who try to allocate now will
> > -          * get block indexes above the current limit and hit futex_wait
> > -          * below. */
> > -         new.next = state.next + pool->block_size;
> > +      } else {
> > +         /* We allocated the firsts blocks outside the pool, we have to grow
> > +          * it. pool_state->next acts a mutex: threads who try to allocate
> > +          * now will get block indexes above the current limit and hit
> > +          * futex_wait below.
> > +          */
> > +         new.next = state.next + n_blocks * pool->block_size;
> >           new.end = anv_block_pool_grow(pool, pool_state);
> > +         /* We assume that just growing once the pool is enough to fulfil the
> > +          * memory requirements
> > +          */
> 
> I think this is probably a reasonable assumption.  That said, it wouldn't hurt to add a size parameter to block_pool_grow but I don't know that it's needed.
>  
> >           assert(new.end >= new.next && new.end % pool->block_size == 0);
> >           old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
> >           if (old.next != state.next)
> >              futex_wake(&pool_state->end, INT_MAX);
> >           return state.next;
> > -      } else {
> > -         futex_wait(&pool_state->end, state.end);
> > -         continue;
> >        }
> >     }
> >  }
> > @@ -531,16 +537,38 @@ anv_block_pool_alloc_new(struct anv_block_pool *pool,
> >  int32_t
> >  anv_block_pool_alloc(struct anv_block_pool *pool)
> >  {
> > +   return anv_block_pool_alloc_n(pool, 1);
> > +}
> > +
> > +int32_t
> > +anv_block_pool_alloc_n(struct anv_block_pool *pool, uint32_t n_blocks)
> > +{
> >     int32_t offset;
> > 
> > +   assert(n_blocks >= 1 && n_blocks <= ANV_MAX_BLOCKS);
> 
> The more I look at this, the more I want it to be in powers of 2.
>  
> > +
> >     /* Try free list first. */
> > -   if (anv_free_list_pop(&pool->free_list, &pool->map, &offset)) {
> > +   if (anv_free_list_pop(&(pool->free_list[n_blocks - 1]), &pool->map, &offset)) {
> >        assert(offset >= 0);
> >        assert(pool->map);
> >        return offset;
> >     }
> > 
> > -   return anv_block_pool_alloc_new(pool, &pool->state);
> > +   /* Try to steal them. */
> > +   for (unsigned int i = n_blocks; i < ANV_MAX_BLOCKS; i++) {
> > +      if (anv_free_list_pop (&(pool->free_list[i]), &pool->map, &offset)) {
> > +         assert(offset >= 0);
> > +         assert(pool->map);
> > +         /* Just return the blocks we do not require */
> > +         int32_t needless_blocks = i + 1 - n_blocks;
> > +         int32_t needless_offset = offset + n_blocks * pool->block_size;
> > +         anv_free_list_push(&(pool->free_list[needless_blocks - 1]), pool->map, needless_offset);
> 
> I really like this.  That way one-shot giant blocks don't stay around forever when we need piles of little ones.  We have no path for defragmentation, but I think that's ok.
>  
> > +         return offset;
> > +      }
> > +   }
> > +
> > +   return anv_block_pool_alloc_new(pool, &pool->state, n_blocks);
> >  }
> > 
> >  /* Allocates a block out of the back of the block pool.
> > @@ -564,7 +592,7 @@ anv_block_pool_alloc_back(struct anv_block_pool *pool)
> >        return offset;
> >     }
> > 
> > -   offset = anv_block_pool_alloc_new(pool, &pool->back_state);
> > +   offset = anv_block_pool_alloc_new(pool, &pool->back_state, 1);
> > 
> >     /* The offset we get out of anv_block_pool_alloc_new() is actually the
> >      * number of bytes downwards from the middle to the end of the block.
> > @@ -576,12 +604,14 @@ anv_block_pool_alloc_back(struct anv_block_pool *pool)
> >  }
> > 
> >  void
> > -anv_block_pool_free(struct anv_block_pool *pool, int32_t offset)
> > +anv_block_pool_free(struct anv_block_pool *pool, int32_t offset, uint32_t n_blocks)
> >  {
> > +   assert(n_blocks >= 1 && n_blocks <= ANV_MAX_BLOCKS);
> > +
> >     if (offset < 0) {
> >        anv_free_list_push(&pool->back_free_list, pool->map, offset);
> >     } else {
> > -      anv_free_list_push(&pool->free_list, pool->map, offset);
> > +      anv_free_list_push(&(pool->free_list[n_blocks - 1]), pool->map, offset);
> >     }
> >  }
> > 
> > @@ -698,6 +728,9 @@ struct anv_state_stream_block {
> >     /* The offset into the block pool at which this block starts */
> >     uint32_t offset;
> > 
> > +   /* Blocks allocated */
> > +   uint32_t n_blocks;
> > +
> >  #ifdef HAVE_VALGRIND
> >     /* A pointer to the first user-allocated thing in this block.  This is
> >      * what valgrind sees as the start of the block.
> > @@ -736,7 +769,7 @@ anv_state_stream_finish(struct anv_state_stream *stream)
> >        struct anv_state_stream_block sb = VG_NOACCESS_READ(next);
> >        VG(VALGRIND_MEMPOOL_FREE(stream, sb._vg_ptr));
> >        VG(VALGRIND_MAKE_MEM_UNDEFINED(next, block_size));
> > -      anv_block_pool_free(stream->block_pool, sb.offset);
> > +      anv_block_pool_free(stream->block_pool, sb.offset, sb.n_blocks);
> >        next = sb.next;
> >     }
> > 
> > @@ -753,19 +786,23 @@ anv_state_stream_alloc(struct anv_state_stream *stream,
> > 
> >     state.offset = align_u32(stream->next, alignment);
> >     if (state.offset + size > stream->end) {
> > -      uint32_t block = anv_block_pool_alloc(stream->block_pool);
> > +      uint32_t n_blocks =
> > +         DIV_ROUND_UP(state.offset - stream->next + size, stream->block_pool->block_size);
> > +      uint32_t block = anv_block_pool_alloc_n(stream->block_pool, n_blocks);
> > +
> >        sb = stream->block_pool->map + block;
> > 
> >        VG(VALGRIND_MAKE_MEM_UNDEFINED(sb, sizeof(*sb)));
> >        sb->next = stream->block;
> >        sb->offset = block;
> > +      sb->n_blocks = n_blocks;
> >        VG(sb->_vg_ptr = NULL);
> > -      VG(VALGRIND_MAKE_MEM_NOACCESS(sb, stream->block_pool->block_size));
> > +      VG(VALGRIND_MAKE_MEM_NOACCESS(sb, n_blocks * stream->block_pool->block_size));
> > 
> >        stream->block = sb;
> >        stream->start = block;
> >        stream->next = block + sizeof(*sb);
> > -      stream->end = block + stream->block_pool->block_size;
> > +      stream->end = block + n_blocks * stream->block_pool->block_size;
> > 
> >        state.offset = align_u32(stream->next, alignment);
> >        assert(state.offset + size <= stream->end);
> > diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
> > index 3f6039e..cc9d9d7 100644
> > --- a/src/intel/vulkan/anv_batch_chain.c
> > +++ b/src/intel/vulkan/anv_batch_chain.c
> > @@ -716,7 +716,7 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
> >     int32_t *bt_block;
> >     u_vector_foreach(bt_block, &cmd_buffer->bt_blocks) {
> >        anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
> > -                          *bt_block);
> > +                          *bt_block, 1);
> >     }
> >     u_vector_finish(&cmd_buffer->bt_blocks);
> > 
> > @@ -750,7 +750,7 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
> >     while (u_vector_length(&cmd_buffer->bt_blocks) > 1) {
> >        int32_t *bt_block = u_vector_remove(&cmd_buffer->bt_blocks);
> >        anv_block_pool_free(&cmd_buffer->device->surface_state_block_pool,
> > -                          *bt_block);
> > +                          *bt_block, 1);
> >     }
> >     assert(u_vector_length(&cmd_buffer->bt_blocks) == 1);
> >     cmd_buffer->bt_next = 0;
> > diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> > index 7682bfc..bf92d64 100644
> > --- a/src/intel/vulkan/anv_private.h
> > +++ b/src/intel/vulkan/anv_private.h
> > @@ -339,6 +339,8 @@ struct anv_block_state {
> >     };
> >  };
> > 
> > +#define ANV_MAX_BLOCKS 256
> > +
> >  struct anv_block_pool {
> >     struct anv_device *device;
> > 
> > @@ -370,7 +372,7 @@ struct anv_block_pool {
> > 
> >     uint32_t block_size;
> > 
> > -   union anv_free_list free_list;
> > +   union anv_free_list free_list[ANV_MAX_BLOCKS];
> >     struct anv_block_state state;
> > 
> >     union anv_free_list back_free_list;
> > @@ -462,8 +464,9 @@ VkResult anv_block_pool_init(struct anv_block_pool *pool,
> >                               struct anv_device *device, uint32_t block_size);
> >  void anv_block_pool_finish(struct anv_block_pool *pool);
> >  int32_t anv_block_pool_alloc(struct anv_block_pool *pool);
> > +int32_t anv_block_pool_alloc_n(struct anv_block_pool *pool, uint32_t n_blocks);
> >  int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool);
> > -void anv_block_pool_free(struct anv_block_pool *pool, int32_t offset);
> > +void anv_block_pool_free(struct anv_block_pool *pool, int32_t offset, uint32_t n_blocks);
> >  void anv_state_pool_init(struct anv_state_pool *pool,
> >                           struct anv_block_pool *block_pool);
> >  void anv_state_pool_finish(struct anv_state_pool *pool);
> > --
> > 2.9.3
> > 
> > 
> 
>