[Mesa-dev] [PATCH 11/14] winsys/radeon: add slab buffer list
Nicolai Hähnle
nhaehnle at gmail.com
Tue Sep 13 09:56:22 UTC 2016
From: Nicolai Hähnle <nicolai.haehnle at amd.com>
Add a separate list of slab buffers to radeon_cs_context. Introducing
radeon_bo::hash, assigned from a per-winsys atomic counter, reduces
collisions between "real" buffers and buffers from slabs in the per-CS
relocation hash list; each slab entry records the reloc index of its
backing real buffer.
---
src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 3 +
src/gallium/winsys/radeon/drm/radeon_drm_bo.h | 1 +
src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 98 ++++++++++++++++++++---
src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 16 +++-
src/gallium/winsys/radeon/drm/radeon_drm_winsys.h | 1 +
5 files changed, 107 insertions(+), 12 deletions(-)
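
For reference, here is a minimal sketch of the lookup scheme this patch
sets up. The names (bo, bo_item, cs_ctx, lookup) are illustrative
stand-ins, not the real winsys structs; the logic mirrors
radeon_lookup_buffer below:

#include <stdint.h>

struct bo {
    uint32_t handle; /* 0 for slab entries */
    uint32_t hash;   /* monotonic, from an atomic winsys counter */
};

struct bo_item { struct bo *bo; };

struct cs_ctx {
    struct bo_item *real_bufs;  unsigned num_real;
    struct bo_item *slab_bufs;  unsigned num_slab;
    int hashlist[4096];         /* hash -> last-seen index, or -1 */
};

static int lookup(struct cs_ctx *csc, struct bo *bo)
{
    unsigned hash = bo->hash & (4096 - 1);
    /* handle == 0 marks a slab entry; both kinds share the hashlist
     * but live in separate arrays. */
    struct bo_item *bufs = bo->handle ? csc->real_bufs : csc->slab_bufs;
    int n = bo->handle ? (int)csc->num_real : (int)csc->num_slab;
    int i = csc->hashlist[hash];

    if (i == -1 || (i < n && bufs[i].bo == bo))
        return i; /* definitely absent, or fast-path hit */

    /* Hash collision (or stale entry): scan linearly and re-seed the
     * hashlist so repeated lookups of the same bo stay cheap. */
    for (i = n - 1; i >= 0; i--) {
        if (bufs[i].bo == bo) {
            csc->hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

The hash itself is just the value of a winsys-wide counter bumped with
__sync_fetch_and_add, so every buffer gets a distinct value regardless
of its GEM handle (slab entries all have handle == 0 and would
otherwise pile up in one hash bucket).
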
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index df6e53c..1725080 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -580,20 +580,21 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
pipe_reference_init(&bo->base.reference, 1);
bo->base.alignment = alignment;
bo->base.usage = usage;
bo->base.size = size;
bo->base.vtbl = &radeon_bo_vtbl;
bo->rws = rws;
bo->handle = args.handle;
bo->va = 0;
bo->initial_domain = initial_domains;
+ bo->hash = __sync_fetch_and_add(&rws->next_bo_hash, 1);
pipe_mutex_init(bo->u.real.map_mutex);
pb_cache_init_entry(&rws->bo_cache, &bo->u.real.cache_entry, &bo->base,
pb_cache_bucket);
if (rws->info.has_virtual_memory) {
struct drm_radeon_gem_va va;
unsigned va_gap_size;
va_gap_size = rws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
bo->va = radeon_bomgr_find_va(rws, size + va_gap_size, alignment);
@@ -857,20 +858,21 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
/* Initialize it. */
pipe_reference_init(&bo->base.reference, 1);
bo->handle = args.handle;
bo->base.alignment = 0;
bo->base.size = size;
bo->base.vtbl = &radeon_bo_vtbl;
bo->rws = ws;
bo->user_ptr = pointer;
bo->va = 0;
bo->initial_domain = RADEON_DOMAIN_GTT;
+ bo->hash = __sync_fetch_and_add(&ws->next_bo_hash, 1);
pipe_mutex_init(bo->u.real.map_mutex);
util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
pipe_mutex_unlock(ws->bo_handles_mutex);
if (ws->info.has_virtual_memory) {
struct drm_radeon_gem_va va;
bo->va = radeon_bomgr_find_va(ws, bo->base.size, 1 << 20);
@@ -990,20 +992,21 @@ static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws,
bo->handle = handle;
/* Initialize it. */
pipe_reference_init(&bo->base.reference, 1);
bo->base.alignment = 0;
bo->base.size = (unsigned) size;
bo->base.vtbl = &radeon_bo_vtbl;
bo->rws = ws;
bo->va = 0;
+ bo->hash = __sync_fetch_and_add(&ws->next_bo_hash, 1);
pipe_mutex_init(bo->u.real.map_mutex);
if (bo->flink_name)
util_hash_table_set(ws->bo_names, (void*)(uintptr_t)bo->flink_name, bo);
util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
done:
pipe_mutex_unlock(ws->bo_handles_mutex);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index b9a4a05..8e35a38 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
@@ -52,20 +52,21 @@ struct radeon_bo {
struct radeon_bo *real;
} slab;
} u;
struct radeon_drm_winsys *rws;
void *user_ptr; /* from buffer_from_ptr */
uint32_t handle; /* 0 for slab entries */
uint32_t flink_name;
uint64_t va;
+ uint32_t hash;
enum radeon_bo_domain initial_domain;
/* how many command streams is this bo referenced in? */
int num_cs_references;
/* how many command streams, which are being emitted in a separate
* thread, is this bo referenced in? */
int num_active_ioctls;
};
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 20f90cf..9fbd378 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -122,34 +122,40 @@ static bool radeon_init_cs_context(struct radeon_cs_context *csc,
}
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
unsigned i;
for (i = 0; i < csc->num_relocs; i++) {
p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
}
+ for (i = 0; i < csc->num_slab_buffers; ++i) {
+ p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
+ radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
+ }
csc->num_relocs = 0;
csc->num_validated_relocs = 0;
+ csc->num_slab_buffers = 0;
csc->chunks[0].length_dw = 0;
csc->chunks[1].length_dw = 0;
for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
csc->reloc_indices_hashlist[i] = -1;
}
}
static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
radeon_cs_context_cleanup(csc);
+ FREE(csc->slab_buffers);
FREE(csc->relocs_bo);
FREE(csc->relocs);
}
static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
enum ring_type ring_type,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
@@ -184,52 +190,62 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
cs->base.current.buf = cs->csc->buf;
cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
cs->ring_type = ring_type;
p_atomic_inc(&ws->num_cs);
return &cs->base;
}
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
- unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+ unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+ struct radeon_bo_item *buffers;
+ unsigned num_buffers;
int i = csc->reloc_indices_hashlist[hash];
+ if (bo->handle) {
+ buffers = csc->relocs_bo;
+ num_buffers = csc->num_relocs;
+ } else {
+ buffers = csc->slab_buffers;
+ num_buffers = csc->num_slab_buffers;
+ }
+
/* not found or found */
- if (i == -1 || csc->relocs_bo[i].bo == bo)
+ if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
return i;
/* Hash collision, look for the BO in the list of relocs linearly. */
- for (i = csc->num_relocs - 1; i >= 0; i--) {
- if (csc->relocs_bo[i].bo == bo) {
+ for (i = num_buffers - 1; i >= 0; i--) {
+ if (buffers[i].bo == bo) {
/* Put this reloc in the hash list.
* This will prevent additional hash collisions if there are
* several consecutive lookup_buffer calls for the same buffer.
*
* Example: Assuming buffers A,B,C collide in the hash list,
* the following sequence of relocs:
* AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
* will collide here: ^ and here: ^,
* meaning that we should get very few collisions in the end. */
csc->reloc_indices_hashlist[hash] = i;
return i;
}
}
return -1;
}
-static unsigned radeon_lookup_or_add_buffer(struct radeon_drm_cs *cs,
- struct radeon_bo *bo)
+static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
+ struct radeon_bo *bo)
{
struct radeon_cs_context *csc = cs->csc;
struct drm_radeon_cs_reloc *reloc;
- unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+ unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
int i = -1;
i = radeon_lookup_buffer(csc, bo);
if (i >= 0) {
/* For async DMA, every add_buffer call must add a buffer to the list
* no matter how many duplicates there are. This is due to the fact
* the DMA CS checker doesn't use NOP packets for offset patching,
* but always uses the i-th buffer from the list to patch the i-th
* offset. If there are N offsets in a DMA CS, there must also be N
@@ -252,56 +268,113 @@ static unsigned radeon_lookup_or_add_buffer(struct radeon_drm_cs *cs,
csc->relocs_bo = realloc(csc->relocs_bo, size);
size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
csc->relocs = realloc(csc->relocs, size);
csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
}
/* Initialize the new relocation. */
csc->relocs_bo[csc->num_relocs].bo = NULL;
- csc->relocs_bo[csc->num_relocs].priority_usage = 0;
+ csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
p_atomic_inc(&bo->num_cs_references);
reloc = &csc->relocs[csc->num_relocs];
reloc->handle = bo->handle;
reloc->read_domains = 0;
reloc->write_domain = 0;
reloc->flags = 0;
csc->reloc_indices_hashlist[hash] = csc->num_relocs;
csc->chunks[1].length_dw += RELOC_DWORDS;
return csc->num_relocs++;
}
+static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
+ struct radeon_bo *bo)
+{
+ struct radeon_cs_context *csc = cs->csc;
+ unsigned hash;
+ struct radeon_bo_item *item;
+ int idx;
+ int real_idx;
+
+ idx = radeon_lookup_buffer(csc, bo);
+ if (idx >= 0)
+ return idx;
+
+ real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
+
+ /* Check if the backing array is large enough. */
+ if (csc->num_slab_buffers >= csc->max_slab_buffers) {
+ unsigned new_max = MAX2(csc->max_slab_buffers + 16,
+ (unsigned)(csc->max_slab_buffers * 1.3));
+ struct radeon_bo_item *new_buffers =
+ REALLOC(csc->slab_buffers,
+ csc->max_slab_buffers * sizeof(*new_buffers),
+ new_max * sizeof(*new_buffers));
+ if (!new_buffers) {
+ fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
+ return -1;
+ }
+
+ csc->max_slab_buffers = new_max;
+ csc->slab_buffers = new_buffers;
+ }
+
+ /* Initialize the new relocation. */
+ idx = csc->num_slab_buffers++;
+ item = &csc->slab_buffers[idx];
+
+ item->bo = NULL;
+ item->u.slab.real_idx = real_idx;
+ radeon_bo_reference(&item->bo, bo);
+ p_atomic_inc(&bo->num_cs_references);
+
+ hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+ csc->reloc_indices_hashlist[hash] = idx;
+
+ return idx;
+}
+
static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
struct pb_buffer *buf,
enum radeon_bo_usage usage,
enum radeon_bo_domain domains,
enum radeon_bo_priority priority)
{
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
struct radeon_bo *bo = (struct radeon_bo*)buf;
enum radeon_bo_domain added_domains;
enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
struct drm_radeon_cs_reloc *reloc;
- unsigned index = radeon_lookup_or_add_buffer(cs, bo);
+ int index;
+
+ if (!bo->handle) {
+ index = radeon_lookup_or_add_slab_buffer(cs, bo);
+ if (index < 0)
+ return 0;
+
+ index = cs->csc->slab_buffers[index].u.slab.real_idx;
+ } else {
+ index = radeon_lookup_or_add_real_buffer(cs, bo);
+ }
reloc = &cs->csc->relocs[index];
added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
reloc->read_domains |= rd;
reloc->write_domain |= wd;
reloc->flags = MAX2(reloc->flags, priority);
- cs->csc->relocs_bo[index].priority_usage |= 1llu << priority;
+ cs->csc->relocs_bo[index].u.real.priority_usage |= 1llu << priority;
if (added_domains & RADEON_DOMAIN_VRAM)
cs->base.used_vram += bo->base.size;
else if (added_domains & RADEON_DOMAIN_GTT)
cs->base.used_gart += bo->base.size;
return index;
}
static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
@@ -359,21 +432,21 @@ static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
struct radeon_bo_list_item *list)
{
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
int i;
if (list) {
for (i = 0; i < cs->csc->num_relocs; i++) {
list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
- list[i].priority_usage = cs->csc->relocs_bo[i].priority_usage;
+ list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
}
}
return cs->csc->num_relocs;
}
void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
unsigned i;
int r;
@@ -577,20 +650,23 @@ static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
struct radeon_bo *bo = (struct radeon_bo*)_buf;
int index;
if (!bo->num_cs_references)
return false;
index = radeon_lookup_buffer(cs->csc, bo);
if (index == -1)
return false;
+ if (!bo->handle)
+ index = cs->csc->slab_buffers[index].u.slab.real_idx;
+
if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
return true;
if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
return true;
return false;
}
/* FENCES */
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index bd55548..f9b26af 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -24,39 +24,50 @@
* of the Software.
*/
#ifndef RADEON_DRM_CS_H
#define RADEON_DRM_CS_H
#include "radeon_drm_bo.h"
struct radeon_bo_item {
struct radeon_bo *bo;
- uint64_t priority_usage;
+ union {
+ struct {
+ uint64_t priority_usage;
+ } real;
+ struct {
+ unsigned real_idx;
+ } slab;
+ } u;
};
struct radeon_cs_context {
uint32_t buf[16 * 1024];
int fd;
struct drm_radeon_cs cs;
struct drm_radeon_cs_chunk chunks[3];
uint64_t chunk_array[3];
uint32_t flags[2];
/* Buffers. */
unsigned max_relocs;
unsigned num_relocs;
unsigned num_validated_relocs;
struct radeon_bo_item *relocs_bo;
struct drm_radeon_cs_reloc *relocs;
+ unsigned num_slab_buffers;
+ unsigned max_slab_buffers;
+ struct radeon_bo_item *slab_buffers;
+
int reloc_indices_hashlist[4096];
};
struct radeon_drm_cs {
struct radeon_winsys_cs base;
enum ring_type ring_type;
/* We flip between these two CS. While one is being consumed
* by the kernel in another thread, the other one is being filled
* by the pipe driver. */
@@ -101,20 +112,23 @@ radeon_bo_is_referenced_by_cs_for_write(struct radeon_drm_cs *cs,
{
int index;
if (!bo->num_cs_references)
return false;
index = radeon_lookup_buffer(cs->csc, bo);
if (index == -1)
return false;
+ if (!bo->handle)
+ index = cs->csc->slab_buffers[index].u.slab.real_idx;
+
return cs->csc->relocs[index].write_domain != 0;
}
static inline bool
radeon_bo_is_referenced_by_any_cs(struct radeon_bo *bo)
{
return bo->num_cs_references != 0;
}
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 27fbe90..5514980 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -68,20 +68,21 @@ struct radeon_drm_winsys {
struct pb_cache bo_cache;
int fd; /* DRM file descriptor */
int num_cs; /* The number of command streams created. */
uint64_t allocated_vram;
uint64_t allocated_gtt;
uint64_t mapped_vram;
uint64_t mapped_gtt;
uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */
uint64_t num_cs_flushes;
+ uint32_t next_bo_hash;
enum radeon_generation gen;
struct radeon_info info;
uint32_t va_start;
uint32_t va_unmap_working;
uint32_t accel_working2;
/* List of buffer GEM names. Protected by bo_handles_mutex. */
struct util_hash_table *bo_names;
/* List of buffer handles. Protected by bo_handles_mutex. */
--
2.7.4