[Mesa-dev] [PATCH 08/14] winsys/amdgpu: enable buffer allocation from slabs
Marek Olšák
maraeo at gmail.com
Sun Sep 18 13:17:09 UTC 2016
Apart from some minor comments, patches 1-8 are:
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
BTW, I think queries can start using smaller buffers too. If I
understand it correctly, the minimum allocation size is 512 bytes,
right? radeon_info::min_alloc_size can be added for the queries.
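Roughly what I have in mind (just a sketch, the field name and where it
gets set are only suggestions):

    /* gallium/drivers/radeon/radeon_winsys.h (sketch) */
    struct radeon_info {
       /* ... existing fields ... */

       /* Smallest buffer size the winsys can allocate without wasting
        * memory, e.g. one slab entry. Queries could use this instead of
        * hardcoding their buffer size. */
       uint32_t min_alloc_size;
    };

    /* e.g. in the amdgpu winsys, somewhere like do_winsys_init: */
    ws->info.min_alloc_size = 1 << AMDGPU_SLAB_MIN_SIZE_LOG2; /* 512 bytes */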
Marek
On Sun, Sep 18, 2016 at 3:03 PM, Marek Olšák <maraeo at gmail.com> wrote:
> On Tue, Sep 13, 2016 at 11:56 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>>
>> ---
>> src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 165 ++++++++++++++++++++++++++
>> src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 18 +++
>> src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 13 ++
>> src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h | 5 +
>> 4 files changed, 201 insertions(+)
>>
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> index f581d9b..6a61b30 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
>> @@ -31,20 +31,27 @@
>>
>> #include "amdgpu_cs.h"
>>
>> #include "os/os_time.h"
>> #include "state_tracker/drm_driver.h"
>> #include <amdgpu_drm.h>
>> #include <xf86drm.h>
>> #include <stdio.h>
>> #include <inttypes.h>
>>
>> +static struct pb_buffer *
>> +amdgpu_bo_create(struct radeon_winsys *rws,
>> + uint64_t size,
>> + unsigned alignment,
>> + enum radeon_bo_domain domain,
>> + enum radeon_bo_flag flags);
>> +
>> static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
>> enum radeon_bo_usage usage)
>> {
>> struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
>> struct amdgpu_winsys *ws = bo->ws;
>> int64_t abs_timeout;
>>
>> if (timeout == 0) {
>> if (p_atomic_read(&bo->num_active_ioctls))
>> return false;
>> @@ -436,20 +443,130 @@ bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
>> {
>> struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
>>
>> if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
>> return false;
>> }
>>
>> return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
>> }
>>
>> +bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
>> +{
>> + struct amdgpu_winsys_bo *bo = NULL; /* fix container_of */
>> + bo = container_of(entry, bo, u.slab.entry);
>> +
>> + return amdgpu_bo_can_reclaim(&bo->base);
>> +}
>> +
>> +static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
>> +{
>> + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
>> +
>> + assert(!bo->bo);
>> +
>> + pb_slab_free(&bo->ws->bo_slabs, &bo->u.slab.entry);
>> +}
>> +
>> +static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
>> + amdgpu_bo_slab_destroy
>> + /* other functions are never called */
>> +};
>> +
>> +struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
>> + unsigned entry_size,
>> + unsigned group_index)
>> +{
>> + struct amdgpu_winsys *ws = priv;
>> + struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
>> + enum radeon_bo_domain domains;
>> + enum radeon_bo_flag flags = 0;
>> + uint32_t base_id;
>> +
>> + if (!slab)
>> + return NULL;
>> +
>> + if (heap & 1)
>> + flags |= RADEON_FLAG_GTT_WC;
>> + if (heap & 2)
>> + flags |= RADEON_FLAG_CPU_ACCESS;
>> +
>> + switch (heap >> 2) {
>> + case 0:
>> + domains = RADEON_DOMAIN_VRAM;
>> + break;
>> + default:
>> + case 1:
>> + domains = RADEON_DOMAIN_VRAM_GTT;
>> + break;
>> + case 2:
>> + domains = RADEON_DOMAIN_GTT;
>> + break;
>> + }
>> +
>> + slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
>> + 64 * 1024, 64 * 1024,
>> + domains, flags));
>> + if (!slab->buffer)
>> + goto fail;
>> +
>> + assert(slab->buffer->bo);
>> +
>> + slab->base.num_entries = slab->buffer->base.size / entry_size;
>> + slab->base.num_free = slab->base.num_entries;
>> + slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
>> + if (!slab->entries)
>> + goto fail_buffer;
>> +
>> + LIST_INITHEAD(&slab->base.free);
>> +
>> + base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
>> +
>> + for (unsigned i = 0; i < slab->base.num_entries; ++i) {
>> + struct amdgpu_winsys_bo *bo = &slab->entries[i];
>> +
>> + bo->base.alignment = entry_size;
>> + bo->base.usage = slab->buffer->base.usage;
>> + bo->base.size = entry_size;
>> + bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
>> + bo->ws = ws;
>> + bo->va = slab->buffer->va + i * entry_size;
>> + bo->initial_domain = domains;
>> + bo->unique_id = base_id + i;
>> + bo->u.slab.entry.slab = &slab->base;
>> + bo->u.slab.entry.group_index = group_index;
>> + bo->u.slab.real = slab->buffer;
>> +
>> + LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
>> + }
>> +
>> + return &slab->base;
>> +
>> +fail_buffer:
>> + amdgpu_winsys_bo_reference(&slab->buffer, NULL);
>> +fail:
>> + FREE(slab);
>> + return NULL;
>> +}
>> +
>> +void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
>> +{
>> + struct amdgpu_slab *slab = amdgpu_slab(pslab);
>> +
>> + for (unsigned i = 0; i < slab->base.num_entries; ++i)
>> + amdgpu_bo_remove_fences(&slab->entries[i]);
>> +
>> + FREE(slab->entries);
>> + amdgpu_winsys_bo_reference(&slab->buffer, NULL);
>> + FREE(slab);
>> +}
>> +
>> static unsigned eg_tile_split(unsigned tile_split)
>> {
>> switch (tile_split) {
>> case 0: tile_split = 64; break;
>> case 1: tile_split = 128; break;
>> case 2: tile_split = 256; break;
>> case 3: tile_split = 512; break;
>> default:
>> case 4: tile_split = 1024; break;
>> case 5: tile_split = 2048; break;
>> @@ -548,20 +665,67 @@ static struct pb_buffer *
>> amdgpu_bo_create(struct radeon_winsys *rws,
>> uint64_t size,
>> unsigned alignment,
>> enum radeon_bo_domain domain,
>> enum radeon_bo_flag flags)
>> {
>> struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>> struct amdgpu_winsys_bo *bo;
>> unsigned usage = 0, pb_cache_bucket;
>>
>> + /* Sub-allocate small buffers from slabs. */
>> + if (!(flags & RADEON_FLAG_HANDLE) &&
>> + size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
>> + alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
>> + struct pb_slab_entry *entry;
>> + unsigned heap = 0;
>> +
>
> A comment would be useful here to say that the following code filters out
> textures (i.e. HANDLE = textures, INVISIBLE VRAM = tiled textures).
>
> Marek
>
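For example, something along these lines above the flag checks would be
enough (the wording is just a suggestion, not part of the patch):

    /* Don't sub-allocate textures from slabs: RADEON_FLAG_HANDLE is only
     * set for textures, and tiled textures additionally request
     * CPU-invisible VRAM, so any flag other than GTT_WC/CPU_ACCESS falls
     * through to the normal allocation path below. */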
>> + if (flags & RADEON_FLAG_GTT_WC)
>> + heap |= 1;
>> + if (flags & RADEON_FLAG_CPU_ACCESS)
>> + heap |= 2;
>> + if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS))
>> + goto no_slab;
>> +
>> + switch (domain) {
>> + case RADEON_DOMAIN_VRAM:
>> + heap |= 0 * 4;
>> + break;
>> + case RADEON_DOMAIN_VRAM_GTT:
>> + heap |= 1 * 4;
>> + break;
>> + case RADEON_DOMAIN_GTT:
>> + heap |= 2 * 4;
>> + break;
>> + default:
>> + goto no_slab;
>> + }
>> +
>> + entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
>> + if (!entry) {
>> + /* Clear the cache and try again. */
>> + pb_cache_release_all_buffers(&ws->bo_cache);
>> +
>> + entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
>> + }
>> + if (!entry)
>> + return NULL;
>> +
>> + bo = NULL;
>> + bo = container_of(entry, bo, u.slab.entry);
>> +
>> + pipe_reference_init(&bo->base.reference, 1);
>> +
>> + return &bo->base;
>> + }
>> +no_slab:
>> +
>> /* This flag is irrelevant for the cache. */
>> flags &= ~RADEON_FLAG_HANDLE;
>>
>> /* Align size to page size. This is the minimum alignment for normal
>> * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
>> * like constant/uniform buffers, can benefit from better and more reuse.
>> */
>> size = align64(size, ws->info.gart_page_size);
>> alignment = align(alignment, ws->info.gart_page_size);
>>
>> @@ -590,20 +754,21 @@ amdgpu_bo_create(struct radeon_winsys *rws,
>> pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
>> pb_cache_bucket);
>> if (bo)
>> return &bo->base;
>>
>> /* Create a new one. */
>> bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>> pb_cache_bucket);
>> if (!bo) {
>> /* Clear the cache and try again. */
>> + pb_slabs_reclaim(&ws->bo_slabs);
>> pb_cache_release_all_buffers(&ws->bo_cache);
>> bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
>> pb_cache_bucket);
>> if (!bo)
>> return NULL;
>> }
>>
>> bo->u.real.use_reusable_pool = true;
>> return &bo->base;
>> }
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
>> index e5b5cf5..1e25897 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
>> @@ -74,28 +74,46 @@ struct amdgpu_winsys_bo {
>> * it can only transition from false to true
>> */
>> volatile int is_shared; /* bool (int for atomicity) */
>>
>> /* Fences for buffer synchronization. */
>> unsigned num_fences;
>> unsigned max_fences;
>> struct pipe_fence_handle **fences;
>> };
>>
>> +struct amdgpu_slab {
>> + struct pb_slab base;
>> + struct amdgpu_winsys_bo *buffer;
>> + struct amdgpu_winsys_bo *entries;
>> +};
>> +
>> bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf);
>> void amdgpu_bo_destroy(struct pb_buffer *_buf);
>> void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
>>
>> +bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry);
>> +struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
>> + unsigned entry_size,
>> + unsigned group_index);
>> +void amdgpu_bo_slab_free(void *priv, struct pb_slab *slab);
>> +
>> static inline
>> struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
>> {
>> return (struct amdgpu_winsys_bo *)bo;
>> }
>>
>> static inline
>> +struct amdgpu_slab *amdgpu_slab(struct pb_slab *slab)
>> +{
>> + return (struct amdgpu_slab *)slab;
>> +}
>> +
>> +static inline
>> void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst,
>> struct amdgpu_winsys_bo *src)
>> {
>> pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
>> }
>>
>> #endif
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
>> index 3961ee3..c83489d 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
>> @@ -375,20 +375,21 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
>> }
>>
>> static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
>> {
>> struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
>>
>> if (util_queue_is_initialized(&ws->cs_queue))
>> util_queue_destroy(&ws->cs_queue);
>>
>> pipe_mutex_destroy(ws->bo_fence_lock);
>> + pb_slabs_deinit(&ws->bo_slabs);
>> pb_cache_deinit(&ws->bo_cache);
>> pipe_mutex_destroy(ws->global_bo_list_lock);
>> do_winsys_deinit(ws);
>> FREE(rws);
>> }
>>
>> static void amdgpu_winsys_query_info(struct radeon_winsys *rws,
>> struct radeon_info *info)
>> {
>> *info = ((struct amdgpu_winsys *)rws)->info;
>> @@ -540,20 +541,29 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
>> ws->info.drm_minor = drm_minor;
>>
>> if (!do_winsys_init(ws, fd))
>> goto fail_alloc;
>>
>> /* Create managers. */
>> pb_cache_init(&ws->bo_cache, 500000, ws->check_vm ? 1.0f : 2.0f, 0,
>> (ws->info.vram_size + ws->info.gart_size) / 8,
>> amdgpu_bo_destroy, amdgpu_bo_can_reclaim);
>>
>> + if (!pb_slabs_init(&ws->bo_slabs,
>> + AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2,
>> + 12, /* number of heaps (domain/flags combinations) */
>> + ws,
>> + amdgpu_bo_can_reclaim_slab,
>> + amdgpu_bo_slab_alloc,
>> + amdgpu_bo_slab_free))
>> + goto fail_cache;
>> +
>> /* init reference */
>> pipe_reference_init(&ws->reference, 1);
>>
>> /* Set functions. */
>> ws->base.unref = amdgpu_winsys_unref;
>> ws->base.destroy = amdgpu_winsys_destroy;
>> ws->base.query_info = amdgpu_winsys_query_info;
>> ws->base.cs_request_feature = amdgpu_cs_request_feature;
>> ws->base.query_value = amdgpu_query_value;
>> ws->base.read_registers = amdgpu_read_registers;
>> @@ -583,16 +593,19 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
>>
>> util_hash_table_set(dev_tab, dev, ws);
>>
>> /* We must unlock the mutex once the winsys is fully initialized, so that
>> * other threads attempting to create the winsys from the same fd will
>> * get a fully initialized winsys and not just half-way initialized. */
>> pipe_mutex_unlock(dev_tab_mutex);
>>
>> return &ws->base;
>>
>> +fail_cache:
>> + pb_cache_deinit(&ws->bo_cache);
>> + do_winsys_deinit(ws);
>> fail_alloc:
>> FREE(ws);
>> fail:
>> pipe_mutex_unlock(dev_tab_mutex);
>> return NULL;
>> }
>> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
>> index 96d4e6d..69c6638 100644
>> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
>> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
>> @@ -26,31 +26,36 @@
>> */
>> /*
>> * Authors:
>> * Marek Olšák <maraeo at gmail.com>
>> */
>>
>> #ifndef AMDGPU_WINSYS_H
>> #define AMDGPU_WINSYS_H
>>
>> #include "pipebuffer/pb_cache.h"
>> +#include "pipebuffer/pb_slab.h"
>> #include "gallium/drivers/radeon/radeon_winsys.h"
>> #include "addrlib/addrinterface.h"
>> #include "util/u_queue.h"
>> #include <amdgpu.h>
>>
>> struct amdgpu_cs;
>>
>> +#define AMDGPU_SLAB_MIN_SIZE_LOG2 9
>> +#define AMDGPU_SLAB_MAX_SIZE_LOG2 14
>> +
>> struct amdgpu_winsys {
>> struct radeon_winsys base;
>> struct pipe_reference reference;
>> struct pb_cache bo_cache;
>> + struct pb_slabs bo_slabs;
>>
>> amdgpu_device_handle dev;
>>
>> pipe_mutex bo_fence_lock;
>>
>> int num_cs; /* The number of command streams created. */
>> uint32_t next_bo_unique_id;
>> uint64_t allocated_vram;
>> uint64_t allocated_gtt;
>> uint64_t mapped_vram;
>> --
>> 2.7.4
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev