Mesa (master): panfrost: Implement index buffer cache

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Feb 27 11:53:47 UTC 2020


Module: Mesa
Branch: master
Commit: d385c5840f9f5683e0ca2dcb254b494562838a90
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d385c5840f9f5683e0ca2dcb254b494562838a90

Author: Alyssa Rosenzweig <alyssa.rosenzweig at collabora.com>
Date:   Wed Feb 19 10:32:20 2020 -0500

panfrost: Implement index buffer cache

For index buffer resources (not user index buffers), we're able to cache
results. In practice, the cache works pretty dang well. It's still
important that the min/max computation is efficient (on a cache miss it
runs at draw-time and we don't want jank), but the cache can eliminate a
lot of computations entirely.

We use a custom data structure for caching. Search is O(N) in the cache
size, but the size is capped, so it's effectively O(1). Insertion is O(1)
with automatic eviction of the oldest entry, on the assumption that the
oldest results are the least likely to still be useful. We might also
experiment with other heuristics based on actual usage later.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig at collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon at collabora.com>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3880>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3880>

---
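
For reference, a minimal standalone sketch of the scheme the patch
implements: keys pack (count, start) into 64 bits, values pack (max, min),
lookup is a linear scan, insertion ring-evicts once the cache is full, and
writes invalidate overlapping entries. The struct and the packing, eviction,
and intersection logic mirror the diff below; the helper names, the
MAX2/MIN2 stand-ins, and the test harness are illustrative only, not part
of the patch.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define PANFROST_MINMAX_SIZE 64

/* Standalone stand-ins for Mesa's MAX2/MIN2 macros */
#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

struct panfrost_minmax_cache {
        uint64_t keys[PANFROST_MINMAX_SIZE];
        uint64_t values[PANFROST_MINMAX_SIZE];
        unsigned size;
        unsigned index;
};

/* Linear search for (start, count); O(n) but n <= PANFROST_MINMAX_SIZE */
static bool
minmax_cache_get(struct panfrost_minmax_cache *cache, uint32_t start,
                 uint32_t count, uint32_t *min_index, uint32_t *max_index)
{
        uint64_t key = (((uint64_t) count) << 32) | start;

        for (unsigned i = 0; i < cache->size; ++i) {
                if (cache->keys[i] == key) {
                        *min_index = cache->values[i] & 0xffffffff;
                        *max_index = cache->values[i] >> 32;
                        return true;
                }
        }

        return false;
}

/* O(1) insert; once full, evict the oldest entry in ring order */
static void
minmax_cache_add(struct panfrost_minmax_cache *cache, uint32_t start,
                 uint32_t count, uint32_t min_index, uint32_t max_index)
{
        unsigned index;

        if (cache->size == PANFROST_MINMAX_SIZE) {
                index = cache->index++;
                cache->index %= PANFROST_MINMAX_SIZE;
        } else {
                index = cache->size++;
        }

        cache->keys[index] = (((uint64_t) count) << 32) | start;
        cache->values[index] = min_index | (((uint64_t) max_index) << 32);
}

/* Compact away entries whose slice overlaps a written range [x, x + width) */
static void
minmax_cache_invalidate(struct panfrost_minmax_cache *cache, uint32_t x,
                        uint32_t width)
{
        unsigned valid_count = 0;

        for (unsigned i = 0; i < cache->size; ++i) {
                uint32_t start = cache->keys[i] & 0xffffffff;
                uint32_t count = cache->keys[i] >> 32;

                /* 1D range intersection */
                bool overlaps = MAX2(x, start) < MIN2(x + width, start + count);

                if (!overlaps) {
                        cache->keys[valid_count] = cache->keys[i];
                        cache->values[valid_count] = cache->values[i];
                        valid_count++;
                }
        }

        cache->size = valid_count;
        cache->index = 0;
}

int
main(void)
{
        struct panfrost_minmax_cache cache = { 0 };
        uint32_t lo = 0, hi = 0;

        minmax_cache_add(&cache, 0, 512, 3, 407);
        printf("hit: %d, min %u, max %u\n",
               minmax_cache_get(&cache, 0, 512, &lo, &hi), lo, hi);

        minmax_cache_invalidate(&cache, 100, 16);  /* overlaps [0, 512) */
        printf("hit after write: %d\n",
               minmax_cache_get(&cache, 0, 512, &lo, &hi));

        return 0;
}

Storing keys and values in two flat arrays keeps the scanned keys
contiguous in memory, which is the cache-line benefit the header comment
in pan_resource.h mentions.
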

 src/gallium/drivers/panfrost/pan_context.c  | 37 ++++++++++++++++++++
 src/gallium/drivers/panfrost/pan_resource.c | 52 ++++++++++++++++++++++++++++-
 src/gallium/drivers/panfrost/pan_resource.h | 26 +++++++++++++++
 3 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 0381e410437..598009ba1c0 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -1274,6 +1274,8 @@ panfrost_get_index_buffer_bounded(struct panfrost_context *ctx, const struct pip
                 needs_indices = false;
         }
 
+        uint64_t ht_key = 0;
+
         if (!info->has_user_indices) {
                 /* Only resources can be directly mapped */
                 panfrost_batch_add_bo(batch, rsrc->bo,
@@ -1281,6 +1283,24 @@ panfrost_get_index_buffer_bounded(struct panfrost_context *ctx, const struct pip
                                       PAN_BO_ACCESS_READ |
                                       PAN_BO_ACCESS_VERTEX_TILER);
                 out = rsrc->bo->gpu + offset;
+
+                /* Check the cache */
+                if (rsrc->index_cache) {
+                        ht_key = (((uint64_t) info->count) << 32) | info->start;
+
+                        struct panfrost_minmax_cache *cache = rsrc->index_cache;
+
+                        for (unsigned i = 0; i < cache->size; ++i) {
+                                if (cache->keys[i] == ht_key) {
+                                        uint64_t hit = cache->values[i];
+
+                                        *min_index = hit & 0xffffffff;
+                                        *max_index = hit >> 32;
+                                        needs_indices = false;
+                                        break;
+                                }
+                        }
+                }
         } else {
                 /* Otherwise, we need to upload to transient memory */
                 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
@@ -1290,8 +1310,25 @@ panfrost_get_index_buffer_bounded(struct panfrost_context *ctx, const struct pip
         if (needs_indices) {
                 /* Fallback */
                 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
+
+                if (!info->has_user_indices && rsrc->index_cache) {
+                        struct panfrost_minmax_cache *cache = rsrc->index_cache;
+                        uint64_t value = (*min_index) | (((uint64_t) *max_index) << 32);
+                        unsigned index = 0;
+
+                        if (cache->size == PANFROST_MINMAX_SIZE) {
+                                index = cache->index++;
+                                cache->index = cache->index % PANFROST_MINMAX_SIZE;
+                        } else {
+                                index = cache->size++;
+                        }
+
+                        cache->keys[index] = ht_key;
+                        cache->values[index] = value;
+                }
         }
 
+
         return out;
 }
 
diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c
index d4c134f6246..b9d3cf31e3a 100644
--- a/src/gallium/drivers/panfrost/pan_resource.c
+++ b/src/gallium/drivers/panfrost/pan_resource.c
@@ -513,6 +513,9 @@ panfrost_resource_create(struct pipe_screen *screen,
         panfrost_resource_create_bo(pscreen, so);
         panfrost_resource_reset_damage(so);
 
+        if (template->bind & PIPE_BIND_INDEX_BUFFER)
+                so->index_cache = rzalloc(so, struct panfrost_minmax_cache);
+
         return (struct pipe_resource *)so;
 }
 
@@ -533,6 +536,40 @@ panfrost_resource_destroy(struct pipe_screen *screen,
         ralloc_free(rsrc);
 }
 
+/* If we've been caching min/max indices and we update the index
+ * buffer, that may invalidate the min/max. Check what's been cached vs
+ * what we've written, and throw out invalid entries. */
+
+static void
+panfrost_invalidate_index_cache(struct panfrost_resource *rsrc, struct pipe_transfer *transfer)
+{
+        struct panfrost_minmax_cache *cache = rsrc->index_cache;
+
+        /* Ensure there is a cache to invalidate and a write */
+        if (!rsrc->index_cache) return;
+        if (!(transfer->usage & PIPE_TRANSFER_WRITE)) return;
+
+        unsigned valid_count = 0;
+
+        for (unsigned i = 0; i < cache->size; ++i) {
+                uint64_t key = cache->keys[i];
+
+                uint32_t start = key & 0xffffffff;
+                uint32_t count = key >> 32;
+
+                /* 1D range intersection: does the write overlap the cached slice? */
+                bool invalid = MAX2(transfer->box.x, start) < MIN2(transfer->box.x + transfer->box.width, start + count);
+                if (!invalid) {
+                        cache->keys[valid_count] = key;
+                        cache->values[valid_count] = cache->values[i];
+                        valid_count++;
+                }
+        }
+
+        cache->size = valid_count;
+        cache->index = 0;
+}
+
 static void *
 panfrost_transfer_map(struct pipe_context *pctx,
                       struct pipe_resource *resource,
@@ -635,6 +672,15 @@ panfrost_transfer_map(struct pipe_context *pctx,
 
                 return transfer->map;
         } else {
+                /* Direct, persistent writes can happen at any time, with no
+                 * map/unmap to hook for invalidation. I don't know if this is
+                 * actually possible, but we should still get it right */
+
+                unsigned dpw = PIPE_TRANSFER_MAP_DIRECTLY | PIPE_TRANSFER_WRITE | PIPE_TRANSFER_PERSISTENT;
+
+                if ((usage & dpw) == dpw && rsrc->index_cache)
+                        return NULL;
+
                 transfer->base.stride = rsrc->slices[level].stride;
                 transfer->base.layer_stride = panfrost_get_layer_stride(
                                 rsrc->slices, rsrc->base.target == PIPE_TEXTURE_3D,
@@ -643,8 +689,10 @@ panfrost_transfer_map(struct pipe_context *pctx,
                 /* By mapping direct-write, we're implicitly already
                  * initialized (maybe), so be conservative */
 
-                if ((usage & PIPE_TRANSFER_WRITE) && (usage & PIPE_TRANSFER_MAP_DIRECTLY))
+                if ((usage & PIPE_TRANSFER_WRITE) && (usage & PIPE_TRANSFER_MAP_DIRECTLY)) {
                         rsrc->slices[level].initialized = true;
+                        panfrost_invalidate_index_cache(rsrc, &transfer->base);
+                }
 
                 return bo->cpu
                        + rsrc->slices[level].offset
@@ -693,6 +741,8 @@ panfrost_transfer_unmap(struct pipe_context *pctx,
                        transfer->box.x,
                        transfer->box.x + transfer->box.width);
 
+        panfrost_invalidate_index_cache(prsrc, transfer);
+
         /* Dereference the resource */
         pipe_resource_reference(&transfer->resource, NULL);
 
diff --git a/src/gallium/drivers/panfrost/pan_resource.h b/src/gallium/drivers/panfrost/pan_resource.h
index 7173526023f..2728c7f0aeb 100644
--- a/src/gallium/drivers/panfrost/pan_resource.h
+++ b/src/gallium/drivers/panfrost/pan_resource.h
@@ -33,6 +33,29 @@
 #include "drm-uapi/drm.h"
 #include "util/u_range.h"
 
+/* Index buffer min/max cache. We need to calculate the min/max for arbitrary
+ * slices (start, start + count) of the index buffer at draw time. As this can
+ * be quite expensive, we cache. Conceptually, we just use a hash table mapping
+ * the key (start, count) to the value (min, max). In practice, Mesa's hash
+ * table implementation has higher overhead than we would like and makes
+ * handling memory usage a little complicated. So we use this data structure
+ * instead. Searching is O(n) in the size, but the size is capped at the
+ * PANFROST_MINMAX_SIZE constant (so this is a tradeoff between cache hit/miss
+ * ratio and cache search speed). Note that keys are adjacent, so we get cache
+ * line alignment benefits. Insertion is O(1) and in-order until the cache
+ * fills up; after that, it evicts the oldest cached value in a ring buffer
+ * facilitated by the index member.
+ */
+
+#define PANFROST_MINMAX_SIZE 64
+
+struct panfrost_minmax_cache {
+        uint64_t keys[PANFROST_MINMAX_SIZE];
+        uint64_t values[PANFROST_MINMAX_SIZE];
+        unsigned size;
+        unsigned index;
+};
+
 struct panfrost_resource {
         struct pipe_resource base;
         struct {
@@ -60,6 +83,9 @@ struct panfrost_resource {
         bool checksummed;
 
         enum pipe_format internal_format;
+
+        /* Cached min/max values for index buffers */
+        struct panfrost_minmax_cache *index_cache;
 };
 
 static inline struct panfrost_resource *


