[Mesa-dev] [PATCH 08/14] winsys/amdgpu: enable buffer allocation from slabs
Nicolai Hähnle
nhaehnle at gmail.com
Tue Sep 13 09:56:19 UTC 2016
From: Nicolai Hähnle <nicolai.haehnle at amd.com>
---
src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 165 ++++++++++++++++++++++++++
src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 18 +++
src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 13 ++
src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h | 5 +
4 files changed, 201 insertions(+)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index f581d9b..6a61b30 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -31,20 +31,27 @@
#include "amdgpu_cs.h"
#include "os/os_time.h"
#include "state_tracker/drm_driver.h"
#include <amdgpu_drm.h>
#include <xf86drm.h>
#include <stdio.h>
#include <inttypes.h>
+static struct pb_buffer *
+amdgpu_bo_create(struct radeon_winsys *rws,
+ uint64_t size,
+ unsigned alignment,
+ enum radeon_bo_domain domain,
+ enum radeon_bo_flag flags);
+
static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
enum radeon_bo_usage usage)
{
struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
struct amdgpu_winsys *ws = bo->ws;
int64_t abs_timeout;
if (timeout == 0) {
if (p_atomic_read(&bo->num_active_ioctls))
return false;
@@ -436,20 +443,130 @@ bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
{
struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
return false;
}
return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
}
+bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
+{
+ struct amdgpu_winsys_bo *bo = NULL; /* fix container_of */
+ bo = container_of(entry, bo, u.slab.entry);
+
+ return amdgpu_bo_can_reclaim(&bo->base);
+}
+
+static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
+{
+ struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+
+ assert(!bo->bo);
+
+ pb_slab_free(&bo->ws->bo_slabs, &bo->u.slab.entry);
+}
+
+static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
+ amdgpu_bo_slab_destroy
+ /* other functions are never called */
+};
+
+struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
+ unsigned entry_size,
+ unsigned group_index)
+{
+ struct amdgpu_winsys *ws = priv;
+ struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
+ enum radeon_bo_domain domains;
+ enum radeon_bo_flag flags = 0;
+ uint32_t base_id;
+
+ if (!slab)
+ return NULL;
+
+ if (heap & 1)
+ flags |= RADEON_FLAG_GTT_WC;
+ if (heap & 2)
+ flags |= RADEON_FLAG_CPU_ACCESS;
+
+ switch (heap >> 2) {
+ case 0:
+ domains = RADEON_DOMAIN_VRAM;
+ break;
+ default:
+ case 1:
+ domains = RADEON_DOMAIN_VRAM_GTT;
+ break;
+ case 2:
+ domains = RADEON_DOMAIN_GTT;
+ break;
+ }
+
+ slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
+ 64 * 1024, 64 * 1024,
+ domains, flags));
+ if (!slab->buffer)
+ goto fail;
+
+ assert(slab->buffer->bo);
+
+ slab->base.num_entries = slab->buffer->base.size / entry_size;
+ slab->base.num_free = slab->base.num_entries;
+ slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
+ if (!slab->entries)
+ goto fail_buffer;
+
+ LIST_INITHEAD(&slab->base.free);
+
+ base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
+
+ for (unsigned i = 0; i < slab->base.num_entries; ++i) {
+ struct amdgpu_winsys_bo *bo = &slab->entries[i];
+
+ bo->base.alignment = entry_size;
+ bo->base.usage = slab->buffer->base.usage;
+ bo->base.size = entry_size;
+ bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
+ bo->ws = ws;
+ bo->va = slab->buffer->va + i * entry_size;
+ bo->initial_domain = domains;
+ bo->unique_id = base_id + i;
+ bo->u.slab.entry.slab = &slab->base;
+ bo->u.slab.entry.group_index = group_index;
+ bo->u.slab.real = slab->buffer;
+
+ LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
+ }
+
+ return &slab->base;
+
+fail_buffer:
+ amdgpu_winsys_bo_reference(&slab->buffer, NULL);
+fail:
+ FREE(slab);
+ return NULL;
+}
+
+void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
+{
+ struct amdgpu_slab *slab = amdgpu_slab(pslab);
+
+ for (unsigned i = 0; i < slab->base.num_entries; ++i)
+ amdgpu_bo_remove_fences(&slab->entries[i]);
+
+ FREE(slab->entries);
+ amdgpu_winsys_bo_reference(&slab->buffer, NULL);
+ FREE(slab);
+}
+
static unsigned eg_tile_split(unsigned tile_split)
{
switch (tile_split) {
case 0: tile_split = 64; break;
case 1: tile_split = 128; break;
case 2: tile_split = 256; break;
case 3: tile_split = 512; break;
default:
case 4: tile_split = 1024; break;
case 5: tile_split = 2048; break;
@@ -548,20 +665,67 @@ static struct pb_buffer *
amdgpu_bo_create(struct radeon_winsys *rws,
uint64_t size,
unsigned alignment,
enum radeon_bo_domain domain,
enum radeon_bo_flag flags)
{
struct amdgpu_winsys *ws = amdgpu_winsys(rws);
struct amdgpu_winsys_bo *bo;
unsigned usage = 0, pb_cache_bucket;
+ /* Sub-allocate small buffers from slabs. */
+ if (!(flags & RADEON_FLAG_HANDLE) &&
+ size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
+ alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
+ struct pb_slab_entry *entry;
+ unsigned heap = 0;
+
+ if (flags & RADEON_FLAG_GTT_WC)
+ heap |= 1;
+ if (flags & RADEON_FLAG_CPU_ACCESS)
+ heap |= 2;
+ if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_CPU_ACCESS))
+ goto no_slab;
+
+ switch (domain) {
+ case RADEON_DOMAIN_VRAM:
+ heap |= 0 * 4;
+ break;
+ case RADEON_DOMAIN_VRAM_GTT:
+ heap |= 1 * 4;
+ break;
+ case RADEON_DOMAIN_GTT:
+ heap |= 2 * 4;
+ break;
+ default:
+ goto no_slab;
+ }
+
+ entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
+ if (!entry) {
+ /* Clear the cache and try again. */
+ pb_cache_release_all_buffers(&ws->bo_cache);
+
+ entry = pb_slab_alloc(&ws->bo_slabs, size, heap);
+ }
+ if (!entry)
+ return NULL;
+
+ bo = NULL;
+ bo = container_of(entry, bo, u.slab.entry);
+
+ pipe_reference_init(&bo->base.reference, 1);
+
+ return &bo->base;
+ }
+no_slab:
+
/* This flag is irrelevant for the cache. */
flags &= ~RADEON_FLAG_HANDLE;
/* Align size to page size. This is the minimum alignment for normal
* BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
* like constant/uniform buffers, can benefit from better and more reuse.
*/
size = align64(size, ws->info.gart_page_size);
alignment = align(alignment, ws->info.gart_page_size);
@@ -590,20 +754,21 @@ amdgpu_bo_create(struct radeon_winsys *rws,
pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
pb_cache_bucket);
if (bo)
return &bo->base;
/* Create a new one. */
bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
pb_cache_bucket);
if (!bo) {
/* Clear the cache and try again. */
+ pb_slabs_reclaim(&ws->bo_slabs);
pb_cache_release_all_buffers(&ws->bo_cache);
bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
pb_cache_bucket);
if (!bo)
return NULL;
}
bo->u.real.use_reusable_pool = true;
return &bo->base;
}
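
Note on the slab path added above: it hinges on a small heap index. amdgpu_bo_create() packs the buffer flags into bits 0-1 and the domain into bits 2 and up, and amdgpu_bo_slab_alloc() unpacks the same index to pick the domain and flags for the 64 KB backing buffer. Below is a minimal, standalone sketch of that encoding; the example_* names are illustrative only and are not part of Mesa.

#include <assert.h>

enum example_domain { EX_VRAM = 0, EX_VRAM_GTT = 1, EX_GTT = 2 };

/* Mirrors the packing in amdgpu_bo_create(): bit 0 = GTT_WC,
 * bit 1 = CPU_ACCESS, bits 2+ = domain, for 3 * 4 = 12 heaps. */
static unsigned example_encode_heap(enum example_domain domain,
                                    int gtt_wc, int cpu_access)
{
   unsigned heap = 0;
   if (gtt_wc)
      heap |= 1;
   if (cpu_access)
      heap |= 2;
   return heap | (unsigned)domain * 4;
}

/* Mirrors the unpacking in amdgpu_bo_slab_alloc(). */
static enum example_domain example_decode_domain(unsigned heap)
{
   return (enum example_domain)(heap >> 2);
}

int main(void)
{
   /* GTT + write-combined lands in heap 9; decoding recovers the domain. */
   unsigned heap = example_encode_heap(EX_GTT, 1, 0);
   assert(heap == 9);
   assert(example_decode_domain(heap) == EX_GTT);
   return 0;
}
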
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index e5b5cf5..1e25897 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -74,28 +74,46 @@ struct amdgpu_winsys_bo {
* it can only transition from false to true
*/
volatile int is_shared; /* bool (int for atomicity) */
/* Fences for buffer synchronization. */
unsigned num_fences;
unsigned max_fences;
struct pipe_fence_handle **fences;
};
+struct amdgpu_slab {
+ struct pb_slab base;
+ struct amdgpu_winsys_bo *buffer;
+ struct amdgpu_winsys_bo *entries;
+};
+
bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf);
void amdgpu_bo_destroy(struct pb_buffer *_buf);
void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
+bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry);
+struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
+ unsigned entry_size,
+ unsigned group_index);
+void amdgpu_bo_slab_free(void *priv, struct pb_slab *slab);
+
static inline
struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
{
return (struct amdgpu_winsys_bo *)bo;
}
static inline
+struct amdgpu_slab *amdgpu_slab(struct pb_slab *slab)
+{
+ return (struct amdgpu_slab *)slab;
+}
+
+static inline
void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst,
struct amdgpu_winsys_bo *src)
{
pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}
#endif
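
amdgpu_bo_can_reclaim_slab() and the slab hit in amdgpu_bo_create() both recover the amdgpu_winsys_bo from the embedded pb_slab_entry via container_of; the macro used in this code takes a sample pointer, which is why bo is initialized to NULL before the call. Here is a self-contained sketch of the same pattern using a type-based macro instead (illustrative names, not Mesa code).

#include <assert.h>
#include <stddef.h>

#define example_container_of(ptr, type, member) \
   ((type *)(void *)((char *)(ptr) - offsetof(type, member)))

struct example_slab_entry {
   struct example_slab_entry *next;
};

struct example_bo {
   unsigned unique_id;
   struct example_slab_entry entry;   /* embedded, like u.slab.entry */
};

int main(void)
{
   struct example_bo bo = { .unique_id = 7 };
   struct example_slab_entry *e = &bo.entry;

   /* Go from the embedded entry back to the enclosing buffer. */
   struct example_bo *recovered =
      example_container_of(e, struct example_bo, entry);
   assert(recovered == &bo && recovered->unique_id == 7);
   return 0;
}
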
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 3961ee3..c83489d 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -375,20 +375,21 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
}
static void amdgpu_winsys_destroy(struct radeon_winsys *rws)
{
struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws;
if (util_queue_is_initialized(&ws->cs_queue))
util_queue_destroy(&ws->cs_queue);
pipe_mutex_destroy(ws->bo_fence_lock);
+ pb_slabs_deinit(&ws->bo_slabs);
pb_cache_deinit(&ws->bo_cache);
pipe_mutex_destroy(ws->global_bo_list_lock);
do_winsys_deinit(ws);
FREE(rws);
}
static void amdgpu_winsys_query_info(struct radeon_winsys *rws,
struct radeon_info *info)
{
*info = ((struct amdgpu_winsys *)rws)->info;
@@ -540,20 +541,29 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
ws->info.drm_minor = drm_minor;
if (!do_winsys_init(ws, fd))
goto fail_alloc;
/* Create managers. */
pb_cache_init(&ws->bo_cache, 500000, ws->check_vm ? 1.0f : 2.0f, 0,
(ws->info.vram_size + ws->info.gart_size) / 8,
amdgpu_bo_destroy, amdgpu_bo_can_reclaim);
+ if (!pb_slabs_init(&ws->bo_slabs,
+ AMDGPU_SLAB_MIN_SIZE_LOG2, AMDGPU_SLAB_MAX_SIZE_LOG2,
+ 12, /* number of heaps (domain/flags combinations) */
+ ws,
+ amdgpu_bo_can_reclaim_slab,
+ amdgpu_bo_slab_alloc,
+ amdgpu_bo_slab_free))
+ goto fail_cache;
+
/* init reference */
pipe_reference_init(&ws->reference, 1);
/* Set functions. */
ws->base.unref = amdgpu_winsys_unref;
ws->base.destroy = amdgpu_winsys_destroy;
ws->base.query_info = amdgpu_winsys_query_info;
ws->base.cs_request_feature = amdgpu_cs_request_feature;
ws->base.query_value = amdgpu_query_value;
ws->base.read_registers = amdgpu_read_registers;
@@ -583,16 +593,19 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create)
util_hash_table_set(dev_tab, dev, ws);
/* We must unlock the mutex once the winsys is fully initialized, so that
* other threads attempting to create the winsys from the same fd will
* get a fully initialized winsys and not just half-way initialized. */
pipe_mutex_unlock(dev_tab_mutex);
return &ws->base;
+fail_cache:
+ pb_cache_deinit(&ws->bo_cache);
+ do_winsys_deinit(ws);
fail_alloc:
FREE(ws);
fail:
pipe_mutex_unlock(dev_tab_mutex);
return NULL;
}
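
The pb_slabs_init() call above covers entry sizes from 1 << AMDGPU_SLAB_MIN_SIZE_LOG2 (512 B) to 1 << AMDGPU_SLAB_MAX_SIZE_LOG2 (16 KB) across 12 heaps, and amdgpu_bo_create() only takes the slab path when the request fits one of those buckets and is not over-aligned. A hedged sketch of that eligibility test follows; the ex_* names are made up for the example, and the real check uses MAX2 and util_next_power_of_two.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define EX_SLAB_MIN_SIZE_LOG2 9    /* 512 B, as in amdgpu_winsys.h */
#define EX_SLAB_MAX_SIZE_LOG2 14   /* 16 KB */

static uint64_t ex_next_power_of_two(uint64_t x)
{
   uint64_t p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

/* True when a request can be sub-allocated from a slab: it must fit the
 * largest bucket and its alignment must not exceed the entry size the
 * slab allocator would round it up to. */
static bool ex_fits_in_slab(uint64_t size, unsigned alignment)
{
   uint64_t entry_size = ex_next_power_of_two(size);
   if (entry_size < (1u << EX_SLAB_MIN_SIZE_LOG2))
      entry_size = 1u << EX_SLAB_MIN_SIZE_LOG2;

   return size <= (1u << EX_SLAB_MAX_SIZE_LOG2) && alignment <= entry_size;
}

int main(void)
{
   assert(ex_fits_in_slab(4096, 4096));     /* typical small BO: slab path */
   assert(!ex_fits_in_slab(65536, 4096));   /* too large: real BO path */
   assert(!ex_fits_in_slab(1024, 65536));   /* over-aligned: real BO path */
   return 0;
}
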
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 96d4e6d..69c6638 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -26,31 +26,36 @@
*/
/*
* Authors:
* Marek Olšák <maraeo at gmail.com>
*/
#ifndef AMDGPU_WINSYS_H
#define AMDGPU_WINSYS_H
#include "pipebuffer/pb_cache.h"
+#include "pipebuffer/pb_slab.h"
#include "gallium/drivers/radeon/radeon_winsys.h"
#include "addrlib/addrinterface.h"
#include "util/u_queue.h"
#include <amdgpu.h>
struct amdgpu_cs;
+#define AMDGPU_SLAB_MIN_SIZE_LOG2 9
+#define AMDGPU_SLAB_MAX_SIZE_LOG2 14
+
struct amdgpu_winsys {
struct radeon_winsys base;
struct pipe_reference reference;
struct pb_cache bo_cache;
+ struct pb_slabs bo_slabs;
amdgpu_device_handle dev;
pipe_mutex bo_fence_lock;
int num_cs; /* The number of command streams created. */
uint32_t next_bo_unique_id;
uint64_t allocated_vram;
uint64_t allocated_gtt;
uint64_t mapped_vram;
--
2.7.4
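
For a rough idea of the capacity this adds: each slab is backed by a single 64 KB buffer (see amdgpu_bo_slab_alloc above), and num_entries is simply that size divided by the bucket's entry size. A tiny sketch of the arithmetic, not Mesa code:

#include <stdio.h>

int main(void)
{
   const unsigned slab_size = 64 * 1024;   /* backing buffer allocated per slab */

   /* Buckets run from 1 << 9 (512 B) to 1 << 14 (16 KB). */
   for (unsigned log2_size = 9; log2_size <= 14; ++log2_size) {
      unsigned entry_size = 1u << log2_size;
      printf("entry size %5u B -> %3u entries per slab\n",
             entry_size, slab_size / entry_size);
   }
   return 0;
}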