[Mesa-dev] [PATCH 2/5] gallium/pb_cache: divide the cache into buckets to reduce cache misses

Marek Olšák maraeo at gmail.com
Mon Jul 18 12:35:51 UTC 2016


From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c |  4 +-
 src/gallium/auxiliary/pipebuffer/pb_cache.c        | 49 ++++++++++++++--------
 src/gallium/auxiliary/pipebuffer/pb_cache.h        | 12 ++++--
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c          |  4 +-
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c      |  4 +-
 5 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 4dbf3ff..250f739 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -210,7 +210,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    /* get a buffer from the cache */
    buf = (struct pb_cache_buffer *)
          pb_cache_reclaim_buffer(&mgr->cache, size, desc->alignment,
-                                 desc->usage);
+                                 desc->usage, 0);
    if (buf)
       return &buf->base;
 
@@ -243,7 +243,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    
    buf->base.vtbl = &pb_cache_buffer_vtbl;
    buf->mgr = mgr;
-   pb_cache_init_entry(&mgr->cache, &buf->cache_entry, &buf->base);
+   pb_cache_init_entry(&mgr->cache, &buf->cache_entry, &buf->base, 0);
    
    return &buf->base;
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_cache.c b/src/gallium/auxiliary/pipebuffer/pb_cache.c
index 6a43cbc..664867b 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_cache.c
@@ -53,7 +53,7 @@ destroy_buffer_locked(struct pb_cache_entry *entry)
  * Free as many cache buffers from the list head as possible.
  */
 static void
-release_expired_buffers_locked(struct pb_cache *mgr)
+release_expired_buffers_locked(struct list_head *cache)
 {
    struct list_head *curr, *next;
    struct pb_cache_entry *entry;
@@ -61,9 +61,9 @@ release_expired_buffers_locked(struct pb_cache *mgr)
 
    now = os_time_get();
 
-   curr = mgr->cache.next;
+   curr = cache->next;
    next = curr->next;
-   while (curr != &mgr->cache) {
+   while (curr != cache) {
       entry = LIST_ENTRY(struct pb_cache_entry, curr, head);
 
       if (!os_time_timeout(entry->start, entry->end, now))
@@ -84,11 +84,14 @@ void
 pb_cache_add_buffer(struct pb_cache_entry *entry)
 {
    struct pb_cache *mgr = entry->mgr;
+   struct list_head *cache = &mgr->buckets[entry->bucket_index];
+   unsigned i;
 
    pipe_mutex_lock(mgr->mutex);
    assert(!pipe_is_referenced(&entry->buffer->reference));
 
-   release_expired_buffers_locked(mgr);
+   for (i = 0; i < ARRAY_SIZE(mgr->buckets); i++)
+      release_expired_buffers_locked(&mgr->buckets[i]);
 
    /* Directly release any buffer that exceeds the limit. */
    if (mgr->cache_size + entry->buffer->size > mgr->max_cache_size) {
@@ -99,7 +102,7 @@ pb_cache_add_buffer(struct pb_cache_entry *entry)
 
    entry->start = os_time_get();
    entry->end = entry->start + mgr->usecs;
-   LIST_ADDTAIL(&entry->head, &mgr->cache);
+   LIST_ADDTAIL(&entry->head, cache);
    ++mgr->num_buffers;
    mgr->cache_size += entry->buffer->size;
    pipe_mutex_unlock(mgr->mutex);
@@ -140,23 +143,25 @@ pb_cache_is_buffer_compat(struct pb_cache_entry *entry,
  */
 struct pb_buffer *
 pb_cache_reclaim_buffer(struct pb_cache *mgr, pb_size size,
-                        unsigned alignment, unsigned usage)
+                        unsigned alignment, unsigned usage,
+                        unsigned bucket_index)
 {
    struct pb_cache_entry *entry;
    struct pb_cache_entry *cur_entry;
    struct list_head *cur, *next;
    int64_t now;
    int ret = 0;
+   struct list_head *cache = &mgr->buckets[bucket_index];
 
    pipe_mutex_lock(mgr->mutex);
 
    entry = NULL;
-   cur = mgr->cache.next;
+   cur = cache->next;
    next = cur->next;
 
    /* search in the expired buffers, freeing them in the process */
    now = os_time_get();
-   while (cur != &mgr->cache) {
+   while (cur != cache) {
       cur_entry = LIST_ENTRY(struct pb_cache_entry, cur, head);
 
       if (!entry && (ret = pb_cache_is_buffer_compat(cur_entry, size,
@@ -178,7 +183,7 @@ pb_cache_reclaim_buffer(struct pb_cache *mgr, pb_size size,
 
    /* keep searching in the hot buffers */
    if (!entry && ret != -1) {
-      while (cur != &mgr->cache) {
+      while (cur != cache) {
          cur_entry = LIST_ENTRY(struct pb_cache_entry, cur, head);
          ret = pb_cache_is_buffer_compat(cur_entry, size, alignment, usage);
 
@@ -219,26 +224,32 @@ pb_cache_release_all_buffers(struct pb_cache *mgr)
 {
    struct list_head *curr, *next;
    struct pb_cache_entry *buf;
+   unsigned i;
 
    pipe_mutex_lock(mgr->mutex);
-   curr = mgr->cache.next;
-   next = curr->next;
-   while (curr != &mgr->cache) {
-      buf = LIST_ENTRY(struct pb_cache_entry, curr, head);
-      destroy_buffer_locked(buf);
-      curr = next;
+   for (i = 0; i < ARRAY_SIZE(mgr->buckets); i++) {
+      struct list_head *cache = &mgr->buckets[i];
+
+      curr = cache->next;
       next = curr->next;
+      while (curr != cache) {
+         buf = LIST_ENTRY(struct pb_cache_entry, curr, head);
+         destroy_buffer_locked(buf);
+         curr = next;
+         next = curr->next;
+      }
    }
    pipe_mutex_unlock(mgr->mutex);
 }
 
 void
 pb_cache_init_entry(struct pb_cache *mgr, struct pb_cache_entry *entry,
-                    struct pb_buffer *buf)
+                    struct pb_buffer *buf, unsigned bucket_index)
 {
    memset(entry, 0, sizeof(*entry));
    entry->buffer = buf;
    entry->mgr = mgr;
+   entry->bucket_index = bucket_index;
 }
 
 /**
@@ -262,7 +273,11 @@ pb_cache_init(struct pb_cache *mgr, uint usecs, float size_factor,
               void (*destroy_buffer)(struct pb_buffer *buf),
               bool (*can_reclaim)(struct pb_buffer *buf))
 {
-   LIST_INITHEAD(&mgr->cache);
+   unsigned i;
+
+   for (i = 0; i < ARRAY_SIZE(mgr->buckets); i++)
+      LIST_INITHEAD(&mgr->buckets[i]);
+
    pipe_mutex_init(mgr->mutex);
    mgr->cache_size = 0;
    mgr->max_cache_size = maximum_cache_size;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_cache.h b/src/gallium/auxiliary/pipebuffer/pb_cache.h
index f0fa012..aa83cc8 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_cache.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_cache.h
@@ -42,11 +42,16 @@ struct pb_cache_entry
    struct pb_buffer *buffer; /**< Pointer to the structure this is part of. */
    struct pb_cache *mgr;
    int64_t start, end; /**< Caching time interval */
+   unsigned bucket_index;
 };
 
 struct pb_cache
 {
-   struct list_head cache;
+   /* The cache is divided into buckets for minimizing cache misses.
+    * The driver controls which buffer goes into which bucket.
+    */
+   struct list_head buckets[8];
+
    pipe_mutex mutex;
    uint64_t cache_size;
    uint64_t max_cache_size;
@@ -61,10 +66,11 @@ struct pb_cache
 
 void pb_cache_add_buffer(struct pb_cache_entry *entry);
 struct pb_buffer *pb_cache_reclaim_buffer(struct pb_cache *mgr, pb_size size,
-                                       unsigned alignment, unsigned usage);
+                                          unsigned alignment, unsigned usage,
+                                          unsigned bucket_index);
 void pb_cache_release_all_buffers(struct pb_cache *mgr);
 void pb_cache_init_entry(struct pb_cache *mgr, struct pb_cache_entry *entry,
-                         struct pb_buffer *buf);
+                         struct pb_buffer *buf, unsigned bucket_index);
 void pb_cache_init(struct pb_cache *mgr, uint usecs, float size_factor,
                    unsigned bypass_usage, uint64_t maximum_cache_size,
                    void (*destroy_buffer)(struct pb_buffer *buf),
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index b4dc6c7..3545b78 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -303,7 +303,7 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
       return NULL;
    }
 
-   pb_cache_init_entry(&ws->bo_cache, &bo->cache_entry, &bo->base);
+   pb_cache_init_entry(&ws->bo_cache, &bo->cache_entry, &bo->base, 0);
    request.alloc_size = size;
    request.phys_alignment = alignment;
 
@@ -508,7 +508,7 @@ amdgpu_bo_create(struct radeon_winsys *rws,
 
    /* Get a buffer from the cache. */
    bo = (struct amdgpu_winsys_bo*)
-        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage);
+        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage, 0);
    if (bo)
       return &bo->base;
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 52b1ccd..82dec8e 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -551,7 +551,7 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
     bo->va = 0;
     bo->initial_domain = initial_domains;
     pipe_mutex_init(bo->map_mutex);
-    pb_cache_init_entry(&rws->bo_cache, &bo->cache_entry, &bo->base);
+    pb_cache_init_entry(&rws->bo_cache, &bo->cache_entry, &bo->base, 0);
 
     if (rws->info.has_virtual_memory) {
         struct drm_radeon_gem_va va;
@@ -746,7 +746,7 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
     assert(flags < sizeof(usage) * 8 - 3);
     usage |= 1 << (flags + 3);
 
-    bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage));
+    bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage, 0));
     if (bo)
         return &bo->base;
 
-- 
2.7.4



More information about the mesa-dev mailing list