[Mesa-dev] [PATCH 11/14] winsys/radeon: add slab buffer list
Nicolai Hähnle
nhaehnle at gmail.com
Tue Sep 13 09:56:22 UTC 2016
From: Nicolai Hähnle <nicolai.haehnle at amd.com>
Add a separate list of slab buffers to radeon_cs_context. Introducing
radeon_bo::hash, assigned from a per-winsys atomic counter, reduces
collisions between "real" buffers and buffers from slabs in the per-CS
relocation hash list; each slab entry records the reloc index of its
backing real buffer.
---
src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 3 +
src/gallium/winsys/radeon/drm/radeon_drm_bo.h | 1 +
src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 98 ++++++++++++++++++++---
src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 16 +++-
src/gallium/winsys/radeon/drm/radeon_drm_winsys.h | 1 +
5 files changed, 107 insertions(+), 12 deletions(-)
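
For reference, here is a minimal sketch of the lookup scheme this patch
sets up. The names (bo, bo_item, cs_ctx, lookup) are illustrative
stand-ins, not the real winsys structs; the logic mirrors
radeon_lookup_buffer below:

#include <stdint.h>

struct bo {
    uint32_t handle; /* 0 for slab entries */
    uint32_t hash;   /* monotonic, from an atomic winsys counter */
};

struct bo_item { struct bo *bo; };

struct cs_ctx {
    struct bo_item *real_bufs;  unsigned num_real;
    struct bo_item *slab_bufs;  unsigned num_slab;
    int hashlist[4096];         /* hash -> last-seen index, or -1 */
};

static int lookup(struct cs_ctx *csc, struct bo *bo)
{
    unsigned hash = bo->hash & (4096 - 1);
    /* handle == 0 marks a slab entry; both kinds share the hashlist
     * but live in separate arrays. */
    struct bo_item *bufs = bo->handle ? csc->real_bufs : csc->slab_bufs;
    int n = bo->handle ? (int)csc->num_real : (int)csc->num_slab;
    int i = csc->hashlist[hash];

    if (i == -1 || (i < n && bufs[i].bo == bo))
        return i; /* definitely absent, or fast-path hit */

    /* Hash collision (or stale entry): scan linearly and re-seed the
     * hashlist so repeated lookups of the same bo stay cheap. */
    for (i = n - 1; i >= 0; i--) {
        if (bufs[i].bo == bo) {
            csc->hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

The hash itself is just the value of a winsys-wide counter bumped with
__sync_fetch_and_add, so every buffer gets a distinct value regardless
of its GEM handle (slab entries all have handle == 0 and would
otherwise pile up in one hash bucket).
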
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index df6e53c..1725080 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -580,20 +580,21 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
pipe_reference_init(&bo->base.reference, 1);
bo->base.alignment = alignment;
bo->base.usage = usage;
bo->base.size = size;
bo->base.vtbl = &radeon_bo_vtbl;
bo->rws = rws;
bo->handle = args.handle;
bo->va = 0;
bo->initial_domain = initial_domains;
+ bo->hash = __sync_fetch_and_add(&rws->next_bo_hash, 1);
pipe_mutex_init(bo->u.real.map_mutex);
pb_cache_init_entry(&rws->bo_cache, &bo->u.real.cache_entry, &bo->base,
pb_cache_bucket);
if (rws->info.has_virtual_memory) {
struct drm_radeon_gem_va va;
unsigned va_gap_size;
va_gap_size = rws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
bo->va = radeon_bomgr_find_va(rws, size + va_gap_size, alignment);
@@ -857,20 +858,21 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
/* Initialize it. */
pipe_reference_init(&bo->base.reference, 1);
bo->handle = args.handle;
bo->base.alignment = 0;
bo->base.size = size;
bo->base.vtbl = &radeon_bo_vtbl;
bo->rws = ws;
bo->user_ptr = pointer;
bo->va = 0;
bo->initial_domain = RADEON_DOMAIN_GTT;
+ bo->hash = __sync_fetch_and_add(&ws->next_bo_hash, 1);
pipe_mutex_init(bo->u.real.map_mutex);
util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
pipe_mutex_unlock(ws->bo_handles_mutex);
if (ws->info.has_virtual_memory) {
struct drm_radeon_gem_va va;
bo->va = radeon_bomgr_find_va(ws, bo->base.size, 1 << 20);
@@ -990,20 +992,21 @@ static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws,
bo->handle = handle;
/* Initialize it. */
pipe_reference_init(&bo->base.reference, 1);
bo->base.alignment = 0;
bo->base.size = (unsigned) size;
bo->base.vtbl = &radeon_bo_vtbl;
bo->rws = ws;
bo->va = 0;
+ bo->hash = __sync_fetch_and_add(&ws->next_bo_hash, 1);
pipe_mutex_init(bo->u.real.map_mutex);
if (bo->flink_name)
util_hash_table_set(ws->bo_names, (void*)(uintptr_t)bo->flink_name, bo);
util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
done:
pipe_mutex_unlock(ws->bo_handles_mutex);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
index b9a4a05..8e35a38 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h
@@ -52,20 +52,21 @@ struct radeon_bo {
struct radeon_bo *real;
} slab;
} u;
struct radeon_drm_winsys *rws;
void *user_ptr; /* from buffer_from_ptr */
uint32_t handle; /* 0 for slab entries */
uint32_t flink_name;
uint64_t va;
+ uint32_t hash;
enum radeon_bo_domain initial_domain;
/* how many command streams is this bo referenced in? */
int num_cs_references;
/* how many command streams, which are being emitted in a separate
* thread, is this bo referenced in? */
int num_active_ioctls;
};
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 20f90cf..9fbd378 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -122,34 +122,40 @@ static bool radeon_init_cs_context(struct radeon_cs_context *csc,
}
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
unsigned i;
for (i = 0; i < csc->num_relocs; i++) {
p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
}
+ for (i = 0; i < csc->num_slab_buffers; ++i) {
+ p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
+ radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
+ }
csc->num_relocs = 0;
csc->num_validated_relocs = 0;
+ csc->num_slab_buffers = 0;
csc->chunks[0].length_dw = 0;
csc->chunks[1].length_dw = 0;
for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
csc->reloc_indices_hashlist[i] = -1;
}
}
static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
radeon_cs_context_cleanup(csc);
+ FREE(csc->slab_buffers);
FREE(csc->relocs_bo);
FREE(csc->relocs);
}
static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
enum ring_type ring_type,
void (*flush)(void *ctx, unsigned flags,
struct pipe_fence_handle **fence),
@@ -184,52 +190,62 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
cs->base.current.buf = cs->csc->buf;
cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
cs->ring_type = ring_type;
p_atomic_inc(&ws->num_cs);
return &cs->base;
}
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
- unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+ unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+ struct radeon_bo_item *buffers;
+ unsigned num_buffers;
int i = csc->reloc_indices_hashlist[hash];
+ if (bo->handle) {
+ buffers = csc->relocs_bo;
+ num_buffers = csc->num_relocs;
+ } else {
+ buffers = csc->slab_buffers;
+ num_buffers = csc->num_slab_buffers;
+ }
+
/* not found or found */
- if (i == -1 || csc->relocs_bo[i].bo == bo)
+ if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
return i;
/* Hash collision, look for the BO in the list of relocs linearly. */
- for (i = csc->num_relocs - 1; i >= 0; i--) {
- if (csc->relocs_bo[i].bo == bo) {
+ for (i = num_buffers - 1; i >= 0; i--) {
+ if (buffers[i].bo == bo) {
/* Put this reloc in the hash list.
* This will prevent additional hash collisions if there are
* several consecutive lookup_buffer calls for the same buffer.
*
* Example: Assuming buffers A,B,C collide in the hash list,
* the following sequence of relocs:
* AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
* will collide here: ^ and here: ^,
* meaning that we should get very few collisions in the end. */
csc->reloc_indices_hashlist[hash] = i;
return i;
}
}
return -1;
}
-static unsigned radeon_lookup_or_add_buffer(struct radeon_drm_cs *cs,
- struct radeon_bo *bo)
+static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
+ struct radeon_bo *bo)
{
struct radeon_cs_context *csc = cs->csc;
struct drm_radeon_cs_reloc *reloc;
- unsigned hash = bo->handle & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+ unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
int i = -1;
i = radeon_lookup_buffer(csc, bo);
if (i >= 0) {
/* For async DMA, every add_buffer call must add a buffer to the list
* no matter how many duplicates there are. This is due to the fact
* the DMA CS checker doesn't use NOP packets for offset patching,
* but always uses the i-th buffer from the list to patch the i-th
* offset. If there are N offsets in a DMA CS, there must also be N
@@ -252,56 +268,113 @@ static unsigned radeon_lookup_or_add_buffer(struct radeon_drm_cs *cs,
csc->relocs_bo = realloc(csc->relocs_bo, size);
size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
csc->relocs = realloc(csc->relocs, size);
csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
}
/* Initialize the new relocation. */
csc->relocs_bo[csc->num_relocs].bo = NULL;
- csc->relocs_bo[csc->num_relocs].priority_usage = 0;
+ csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
p_atomic_inc(&bo->num_cs_references);
reloc = &csc->relocs[csc->num_relocs];
reloc->handle = bo->handle;
reloc->read_domains = 0;
reloc->write_domain = 0;
reloc->flags = 0;
csc->reloc_indices_hashlist[hash] = csc->num_relocs;
csc->chunks[1].length_dw += RELOC_DWORDS;
return csc->num_relocs++;
}
+static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
+ struct radeon_bo *bo)
+{
+ struct radeon_cs_context *csc = cs->csc;
+ unsigned hash;
+ struct radeon_bo_item *item;
+ int idx;
+ int real_idx;
+
+ idx = radeon_lookup_buffer(csc, bo);
+ if (idx >= 0)
+ return idx;
+
+ real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
+
+ /* Check if the backing array is large enough. */
+ if (csc->num_slab_buffers >= csc->max_slab_buffers) {
+ unsigned new_max = MAX2(csc->max_slab_buffers + 16,
+ (unsigned)(csc->max_slab_buffers * 1.3));
+ struct radeon_bo_item *new_buffers =
+ REALLOC(csc->slab_buffers,
+ csc->max_slab_buffers * sizeof(*new_buffers),
+ new_max * sizeof(*new_buffers));
+ if (!new_buffers) {
+ fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
+ return -1;
+ }
+
+ csc->max_slab_buffers = new_max;
+ csc->slab_buffers = new_buffers;
+ }
+
+ /* Initialize the new relocation. */
+ idx = csc->num_slab_buffers++;
+ item = &csc->slab_buffers[idx];
+
+ item->bo = NULL;
+ item->u.slab.real_idx = real_idx;
+ radeon_bo_reference(&item->bo, bo);
+ p_atomic_inc(&bo->num_cs_references);
+
+ hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
+ csc->reloc_indices_hashlist[hash] = idx;
+
+ return idx;
+}
+
static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
struct pb_buffer *buf,
enum radeon_bo_usage usage,
enum radeon_bo_domain domains,
enum radeon_bo_priority priority)
{
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
struct radeon_bo *bo = (struct radeon_bo*)buf;
enum radeon_bo_domain added_domains;
enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
struct drm_radeon_cs_reloc *reloc;
- unsigned index = radeon_lookup_or_add_buffer(cs, bo);
+ int index;
+
+ if (!bo->handle) {
+ index = radeon_lookup_or_add_slab_buffer(cs, bo);
+ if (index < 0)
+ return 0;
+
+ index = cs->csc->slab_buffers[index].u.slab.real_idx;
+ } else {
+ index = radeon_lookup_or_add_real_buffer(cs, bo);
+ }
reloc = &cs->csc->relocs[index];
added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
reloc->read_domains |= rd;
reloc->write_domain |= wd;
reloc->flags = MAX2(reloc->flags, priority);
- cs->csc->relocs_bo[index].priority_usage |= 1llu << priority;
+ cs->csc->relocs_bo[index].u.real.priority_usage |= 1llu << priority;
if (added_domains & RADEON_DOMAIN_VRAM)
cs->base.used_vram += bo->base.size;
else if (added_domains & RADEON_DOMAIN_GTT)
cs->base.used_gart += bo->base.size;
return index;
}
static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
@@ -359,21 +432,21 @@ static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
struct radeon_bo_list_item *list)
{
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
int i;
if (list) {
for (i = 0; i < cs->csc->num_relocs; i++) {
list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
- list[i].priority_usage = cs->csc->relocs_bo[i].priority_usage;
+ list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
}
}
return cs->csc->num_relocs;
}
void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
unsigned i;
int r;
@@ -577,20 +650,23 @@ static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
struct radeon_bo *bo = (struct radeon_bo*)_buf;
int index;
if (!bo->num_cs_references)
return false;
index = radeon_lookup_buffer(cs->csc, bo);
if (index == -1)
return false;
+ if (!bo->handle)
+ index = cs->csc->slab_buffers[index].u.slab.real_idx;
+
if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
return true;
if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
return true;
return false;
}
/* FENCES */
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index bd55548..f9b26af 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -24,39 +24,50 @@
* of the Software.
*/
#ifndef RADEON_DRM_CS_H
#define RADEON_DRM_CS_H
#include "radeon_drm_bo.h"
struct radeon_bo_item {
struct radeon_bo *bo;
- uint64_t priority_usage;
+ union {
+ struct {
+ uint64_t priority_usage;
+ } real;
+ struct {
+ unsigned real_idx;
+ } slab;
+ } u;
};
struct radeon_cs_context {
uint32_t buf[16 * 1024];
int fd;
struct drm_radeon_cs cs;
struct drm_radeon_cs_chunk chunks[3];
uint64_t chunk_array[3];
uint32_t flags[2];
/* Buffers. */
unsigned max_relocs;
unsigned num_relocs;
unsigned num_validated_relocs;
struct radeon_bo_item *relocs_bo;
struct drm_radeon_cs_reloc *relocs;
+ unsigned num_slab_buffers;
+ unsigned max_slab_buffers;
+ struct radeon_bo_item *slab_buffers;
+
int reloc_indices_hashlist[4096];
};
struct radeon_drm_cs {
struct radeon_winsys_cs base;
enum ring_type ring_type;
/* We flip between these two CS. While one is being consumed
* by the kernel in another thread, the other one is being filled
* by the pipe driver. */
@@ -101,20 +112,23 @@ radeon_bo_is_referenced_by_cs_for_write(struct radeon_drm_cs *cs,
{
int index;
if (!bo->num_cs_references)
return false;
index = radeon_lookup_buffer(cs->csc, bo);
if (index == -1)
return false;
+ if (!bo->handle)
+ index = cs->csc->slab_buffers[index].u.slab.real_idx;
+
return cs->csc->relocs[index].write_domain != 0;
}
static inline bool
radeon_bo_is_referenced_by_any_cs(struct radeon_bo *bo)
{
return bo->num_cs_references != 0;
}
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 27fbe90..5514980 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -68,20 +68,21 @@ struct radeon_drm_winsys {
struct pb_cache bo_cache;
int fd; /* DRM file descriptor */
int num_cs; /* The number of command streams created. */
uint64_t allocated_vram;
uint64_t allocated_gtt;
uint64_t mapped_vram;
uint64_t mapped_gtt;
uint64_t buffer_wait_time; /* time spent in buffer_wait in ns */
uint64_t num_cs_flushes;
+ uint32_t next_bo_hash;
enum radeon_generation gen;
struct radeon_info info;
uint32_t va_start;
uint32_t va_unmap_working;
uint32_t accel_working2;
/* List of buffer GEM names. Protected by bo_handles_mutex. */
struct util_hash_table *bo_names;
/* List of buffer handles. Protected by bo_handles_mutex. */
--
2.7.4