[Mesa-dev] [PATCH 07/18] anv/allocator: Pull the userptr part of block_pool_grow into a helper

Thu Apr 27 16:27:07 UTC 2017

On Wed, 2017-04-26 at 07:35 -0700, Jason Ekstrand wrote:
> ---
>  src/intel/vulkan/anv_allocator.c | 195 +++++++++++++++++++++------------------
>  1 file changed, 104 insertions(+), 91 deletions(-)
> 
> diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
> index a168af5..183d2cb 100644
> --- a/src/intel/vulkan/anv_allocator.c
> +++ b/src/intel/vulkan/anv_allocator.c
> @@ -315,6 +315,96 @@ anv_block_pool_finish(struct anv_block_pool *pool)
>  
>  #define PAGE_SIZE 4096
>  
> +static VkResult
> +anv_block_pool_expand_range(struct anv_block_pool *pool,
> +                            uint32_t center_bo_offset, uint32_t size)
> +{
> +   void *map;
> +   uint32_t gem_handle;
> +   struct anv_mmap_cleanup *cleanup;
> +
> +   /* Assert that we only ever grow the pool */
> +   assert(center_bo_offset >= pool->back_state.end);
> +   assert(size - center_bo_offset >= pool->state.end);
> +
> +   cleanup = u_vector_add(&pool->mmap_cleanups);
> +   if (!cleanup)
> +      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
> +
> +   *cleanup = ANV_MMAP_CLEANUP_INIT;
> +
> +   /* Just leak the old map until we destroy the pool.  We can't munmap it
> +    * without races or imposing locking on the block allocate fast path. On
> +    * the whole the leaked maps adds up to less than the size of the
> +    * current map.  MAP_POPULATE seems like the right thing to do, but we
> +    * should try to get some numbers.
> +    */
> +   map = mmap(NULL, size, PROT_READ | PROT_WRITE,
> +              MAP_SHARED | MAP_POPULATE, pool->fd,
> +              BLOCK_POOL_MEMFD_CENTER - center_bo_offset);
> +   if (map == MAP_FAILED)
> +      return vk_errorf(VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
> +
> +   gem_handle = anv_gem_userptr(pool->device, map, size);
> +   if (gem_handle == 0) {
> +      munmap(map, size);
> +      return vk_errorf(VK_ERROR_TOO_MANY_OBJECTS, "userptr failed: %m");
> +   }
> +
> +   cleanup->map = map;
> +   cleanup->size = size;
> +   cleanup->gem_handle = gem_handle;
> +
> +#if 0
> +   /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
> +    * I915_CACHING_NONE on non-LLC platforms. However, userptr objects are
> +    * always created as I915_CACHING_CACHED, which on non-LLC means
> +    * snooped. That can be useful but comes with a bit of overheard.  Since
> +    * we're eplicitly clflushing and don't want the overhead we need to turn
> +    * it off. */
> +   if (!pool->device->info.has_llc) {
> +      anv_gem_set_caching(pool->device, gem_handle, I915_CACHING_NONE);
> +      anv_gem_set_domain(pool->device, gem_handle,
> +                         I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
> +   }
> +#endif
> +
> +   /* Now that we successfull allocated everything, we can write the new
> +    * values back into pool. */
> +   pool->map = map + center_bo_offset;
> +   pool->center_bo_offset = center_bo_offset;
> +
> +   /* For block pool BOs we have to be a bit careful about where we place them
> +    * in the GTT.  There are two documented workarounds for state base address
> +    * placement : Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset
> +    * which state that those two base addresses do not support 48-bit
> +    * addresses and need to be placed in the bottom 32-bit range.
> +    * Unfortunately, this is not quite accurate.
> +    *
> +    * The real problem is that we always set the size of our state pools in
> +    * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most
> +    * likely significantly smaller.  We do this because we do not no at the
> +    * time we emit STATE_BASE_ADDRESS whether or not we will need to expand
> +    * the pool during command buffer building so we don't actually have a
> +    * valid final size.  If the address + size, as seen by STATE_BASE_ADDRESS
> +    * overflows 48 bits, the GPU appears to treat all accesses to the buffer
> +    * as being out of bounds and returns zero.  For dynamic state, this
> +    * usually just leads to rendering corruptions, but shaders that are all
> +    * zero hang the GPU immediately.
> +    *
> +    * The easiest solution to do is exactly what the bogus workarounds say to
> +    * do: restrict these buffers to 32-bit addresses.  We could also pin the
> +    * BO to some particular location of our choosing, but that's significantly
> +    * more work than just not setting a flag.  So, we explicitly DO NOT set
> +    * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the
> +    * hard work for us.
> +    */
> +   anv_bo_init(&pool->bo, gem_handle, size);
> +   pool->bo.map = map;
> +
> +   return VK_SUCCESS;
> +}
> +
>  /** Grows and re-centers the block pool.
>   *
>   * We grow the block pool in one or both directions in such a way that the
> @@ -343,9 +433,7 @@ static uint32_t
>  anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state)
>  {
>     uint32_t size;
> -   void *map;
> -   uint32_t gem_handle;
> -   struct anv_mmap_cleanup *cleanup;
> +   VkResult result = VK_SUCCESS;
>  
>     pthread_mutex_lock(&pool->device->mutex);
>  
> @@ -428,100 +516,25 @@ anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state)
>     assert(center_bo_offset % pool->block_size == 0);
>     assert(center_bo_offset % PAGE_SIZE == 0);
>  
> -   /* Assert that we only ever grow the pool */
> -   assert(center_bo_offset >= pool->back_state.end);
> -   assert(size - center_bo_offset >= pool->state.end);
> -
> -   cleanup = u_vector_add(&pool->mmap_cleanups);
> -   if (!cleanup)
> -      goto fail;
> -   *cleanup = ANV_MMAP_CLEANUP_INIT;
> -
> -   /* Just leak the old map until we destroy the pool.  We can't munmap it
> -    * without races or imposing locking on the block allocate fast path. On
> -    * the whole the leaked maps adds up to less than the size of the
> -    * current map.  MAP_POPULATE seems like the right thing to do, but we
> -    * should try to get some numbers.
> -    */
> -   map = mmap(NULL, size, PROT_READ | PROT_WRITE,
> -              MAP_SHARED | MAP_POPULATE, pool->fd,
> -              BLOCK_POOL_MEMFD_CENTER - center_bo_offset);
> -   cleanup->map = map;
> -   cleanup->size = size;
> -
> -   if (map == MAP_FAILED)
> -      goto fail;
> -
> -   gem_handle = anv_gem_userptr(pool->device, map, size);
> -   if (gem_handle == 0)
> -      goto fail;
> -   cleanup->gem_handle = gem_handle;
> -
> -#if 0
> -   /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
> -    * I915_CACHING_NONE on non-LLC platforms. However, userptr objects are
> -    * always created as I915_CACHING_CACHED, which on non-LLC means
> -    * snooped. That can be useful but comes with a bit of overheard.  Since
> -    * we're eplicitly clflushing and don't want the overhead we need to turn
> -    * it off. */
> -   if (!pool->device->info.has_llc) {
> -      anv_gem_set_caching(pool->device, gem_handle, I915_CACHING_NONE);
> -      anv_gem_set_domain(pool->device, gem_handle,
> -                         I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
> -   }
> -#endif
> -
> -   /* Now that we successfull allocated everything, we can write the new
> -    * values back into pool. */
> -   pool->map = map + center_bo_offset;
> -   pool->center_bo_offset = center_bo_offset;
> -
> -   /* For block pool BOs we have to be a bit careful about where we place them
> -    * in the GTT.  There are two documented workarounds for state base address
> -    * placement : Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset
> -    * which state that those two base addresses do not support 48-bit
> -    * addresses and need to be placed in the bottom 32-bit range.
> -    * Unfortunately, this is not quite accurate.
> -    *
> -    * The real problem is that we always set the size of our state pools in
> -    * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most
> -    * likely significantly smaller.  We do this because we do not no at the
> -    * time we emit STATE_BASE_ADDRESS whether or not we will need to expand
> -    * the pool during command buffer building so we don't actually have a
> -    * valid final size.  If the address + size, as seen by STATE_BASE_ADDRESS
> -    * overflows 48 bits, the GPU appears to treat all accesses to the buffer
> -    * as being out of bounds and returns zero.  For dynamic state, this
> -    * usually just leads to rendering corruptions, but shaders that are all
> -    * zero hang the GPU immediately.
> -    *
> -    * The easiest solution to do is exactly what the bogus workarounds say to
> -    * do: restrict these buffers to 32-bit addresses.  We could also pin the
> -    * BO to some particular location of our choosing, but that's significantly
> -    * more work than just not setting a flag.  So, we explicitly DO NOT set
> -    * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the
> -    * hard work for us.
> -    */
> -   anv_bo_init(&pool->bo, gem_handle, size);
> -   pool->bo.map = map;
> +   result = anv_block_pool_expand_range(pool, center_bo_offset, size);
>  
>  done:
>     pthread_mutex_unlock(&pool->device->mutex);
>  
> -   /* Return the appropreate new size.  This function never actually
> -    * updates state->next.  Instead, we let the caller do that because it
> -    * needs to do so in order to maintain its concurrency model.
> -    */
> -   if (state == &pool->state) {
> -      return pool->bo.size - pool->center_bo_offset;
> +   if (result == VK_SUCCESS) {
> +      /* Return the appropreate new size.  This function never actually
                       ^^^^^^^^^^
                         typo

Reviewed-by: Juan A. Suarez Romero <jasuarez at igalia.com>

> +       * updates state->next.  Instead, we let the caller do that because it
> +       * needs to do so in order to maintain its concurrency model.
> +       */
> +      if (state == &pool->state) {
> +         return pool->bo.size - pool->center_bo_offset;
> +      } else {
> +         assert(pool->center_bo_offset > 0);
> +         return pool->center_bo_offset;
> +      }
>     } else {
> -      assert(pool->center_bo_offset > 0);
> -      return pool->center_bo_offset;
> +      return 0;
>     }
> -
> -fail:
> -   pthread_mutex_unlock(&pool->device->mutex);
> -
> -   return 0;
>  }
>  
>  static uint32_t