[Intel-gfx] [PATCH] dma-buf: Replace reservation shared fence array with a compressed radix tree

Chris Wilson chris at chris-wilson.co.uk
Mon Nov 14 08:31:58 UTC 2016


The primary operations on the shared fence arrays are insertion and
retrieval.  Retrieval is reasonably fast, as we just copy the array, but
insertion into the shared fence array is slow as we must iterate over all
current fences to discard an older fence.  (Note also that since the
array is not autopruning, we cannot hook in the fence signal callback due
to locking constraints, thus it can grow very, very large.)  This, coupled
with the atomics to acquire and release the shared fence, causes a severe
performance degradation and scaling bottleneck, as felt by i915.ko in
commit d07f0e59b2c7 ("drm/i915: Move GEM activity tracking into a common
struct reservation_object").  To speed up insertion, we want a direct
replacement of the per-context slot. Enter the radix tree.
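
For reference, the per-context replacement in the current code is a linear
scan (a trimmed sketch of the existing add_shared_inplace path, which is
removed in full in the diff below; seqcount handling omitted):

	for (i = 0; i < fobj->shared_count; ++i) {
		struct dma_fence *old =
			rcu_dereference_protected(fobj->shared[i],
						  reservation_object_held(obj));

		/* Every insertion walks the whole array looking for an
		 * older fence from the same context to replace.
		 */
		if (old->context == fence->context) {
			RCU_INIT_POINTER(fobj->shared[i], fence);
			dma_fence_put(old);
			return;
		}
	}
	/* No match: append, so the array grows as contexts accumulate. */
	RCU_INIT_POINTER(fobj->shared[fobj->shared_count++], fence);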

The kernel already has a couple of options for integer radix trees, but
neither is suitable for us, for a couple of reasons.  First, we need a
per-tree preallocation in order to ensure that adding a fence does not
fail.  Both radixtree and idr only offer preallocation under a preempt
disabled critical section, unsuitable for our existing users.  idr has
an internal __idr_pre_get that could be exported - but idr does not yet
support inserting entries at an arbitrary location.  Secondly, neither
tree supports full u64 integers as used by the fence context identifiers
(though radixtree would support that on 64bit platforms).  And finally,
if we ignore the API shortcomings, radixtree still appears too high in
the profiles!

So what are our requirements for the shared fence array?

	- guaranteed insertion
	- fast insertion
	- RCU-safe, fast traversal
	- 64bit identifiers

To guarantee insertion, we need to preallocate enough storage to satisfy
a later insertion.  The reservation object API requires every
add_shared_fence to be preceded by a call to reserve_shared.  For an
uncompressed radix tree we would need to preallocate enough layers to cover
the full 64bit height, but with a compressed radix tree we only require a
maximum of 2 spare layers.
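
The calling convention for drivers is unchanged: reserve first, then add,
both under the reservation ww_mutex.  A sketch, mirroring the usage in the
selftest added by this patch (error handling trimmed):

	int ret;

	ww_mutex_lock(&resv->lock, NULL);
	ret = reservation_object_reserve_shared(resv); /* may allocate 2 spare layers */
	if (ret == 0)
		/* guaranteed to succeed after a successful reserve */
		reservation_object_add_shared_fence(resv, fence);
	ww_mutex_unlock(&resv->lock);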

The compressed tree also offers a useful property with respect to the
pattern of fences used.  The fence contexts are allocated from a simple,
wrapping atomic64 counter, i.e. they are sparse and ever incrementing.
However, timelines tend to cluster (i.e. they will allocate a cluster of
fence contexts for themselves and primarily use these).  So we may see a
mixture of low fence contexts and a group of very high contexts (e.g.
a group at id<16 and a group at id>65536).  The compressed tree avoids
having to allocate the intermediate layers (reducing the pointer dancing
on lookup and traversal) - still not as dense as the current compact
array, but under typical conditions just as good.  (However, the current
array requires contiguous allocations - a scarce resource for the ever
expanding array!)  The density could be improved by switching from the
simple, dangerously wrapping atomic64 to an ida for fence context
allocation.
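
A rough worked example of the layer arithmetic (a standalone user-space
sketch that mirrors the constants and the join-height computation used by
shared_fence_replace(); not kernel code): contexts 0..15 fill one leaf
with prefix 0, context 65536 starts a second leaf, and a single join layer
at height 16 connects the two groups instead of a full chain of
intermediate layers.

	#include <stdio.h>
	#include <stdint.h>

	#define NSHARED 16
	#define SHIFT 4			/* ilog2(NSHARED) */
	#define MASK (NSHARED - 1)

	/* Simplified stand-ins for the kernel's fls64() and ALIGN(). */
	static unsigned int fls64_(uint64_t x)
	{ unsigned int n = 0; while (x) { n++; x >>= 1; } return n; }
	static unsigned int align(unsigned int x, unsigned int a)
	{ return (x + a - 1) & ~(a - 1); }

	int main(void)
	{
		uint64_t low_prefix = 0;	/* leaf prefix of ids 0..15 (id >> SHIFT) */
		uint64_t id = 65536;		/* first context of the high group */
		unsigned int join = align(fls64_((id >> SHIFT) ^ low_prefix), SHIFT);

		printf("join height       = %u\n", join);	/* 16 */
		printf("slot of low leaf  = %u\n",
		       (unsigned int)((low_prefix >> (join - SHIFT)) & MASK));	/* 0 */
		printf("slot of high leaf = %u\n",
		       (unsigned int)((id >> join) & MASK));			/* 1 */
		return 0;
	}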

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Sumit Semwal <sumit.semwal at linaro.org>
Cc: Alex Deucher <alexander.deucher at amd.com>
Cc: "Christian König" <christian.koenig at amd.com>
Cc: David Airlie <airlied at linux.ie>
Cc: Lucas Stach <l.stach at pengutronix.de>
Cc: Russell King <linux+etnaviv at armlinux.org.uk>
Cc: Christian Gmeiner <christian.gmeiner at gmail.com>
Cc: Daniel Vetter <daniel.vetter at intel.com>
Cc: Jani Nikula <jani.nikula at linux.intel.com>
Cc: Rob Clark <robdclark at gmail.com>
Cc: Ben Skeggs <bskeggs at redhat.com>
Cc: Chunming Zhou <David1.Zhou at amd.com>
Cc: Ken Wang <Qingqing.Wang at amd.com>
Cc: Monk Liu <monk.liu at amd.com>
Cc: Sinclair Yeh <syeh at vmware.com>
Cc: Thomas Hellstrom <thellstrom at vmware.com>
Cc: "Michel Dänzer" <michel.daenzer at amd.com>
Cc: Flora Cui <Flora.Cui at amd.com>
Cc: dri-devel at lists.freedesktop.org
---
 drivers/dma-buf/Kconfig                        |  13 +
 drivers/dma-buf/Makefile                       |   1 +
 drivers/dma-buf/dma-buf.c                      |  35 +-
 drivers/dma-buf/reservation.c                  | 650 +++++++++++++------------
 drivers/dma-buf/test-reservation.c             | 308 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c       |  14 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem.c          |  15 +-
 drivers/gpu/drm/etnaviv/etnaviv_gpu.c          |  14 +-
 drivers/gpu/drm/i915/i915_gem.c                |  17 +-
 drivers/gpu/drm/msm/msm_gem.c                  |  29 +-
 drivers/gpu/drm/nouveau/nouveau_fence.c        |  15 +-
 drivers/gpu/drm/nouveau/nouveau_gem.c          |  21 +-
 drivers/gpu/drm/qxl/qxl_debugfs.c              |   9 +-
 drivers/gpu/drm/radeon/radeon_sync.c           |  12 +-
 drivers/gpu/drm/ttm/ttm_bo.c                   |  10 +-
 include/linux/reservation.h                    | 154 ++++--
 tools/testing/selftests/dma-buf/reservation.sh |  11 +
 17 files changed, 856 insertions(+), 472 deletions(-)
 create mode 100644 drivers/dma-buf/test-reservation.c
 create mode 100644 tools/testing/selftests/dma-buf/reservation.sh

diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig
index ed3b785bae37..c74481c715b8 100644
--- a/drivers/dma-buf/Kconfig
+++ b/drivers/dma-buf/Kconfig
@@ -30,4 +30,17 @@ config SW_SYNC
 	  WARNING: improper use of this can result in deadlocking kernel
 	  drivers from userspace. Intended for test and debug only.
 
+config RESERVATION_SELFTEST
+	tristate "kselftests for reservation_objects"
+	depends on DEBUG_KERNEL
+	default n
+	---help---
+	  This option provides a kernel module that can be used to test
+	  the struct reservation_object API. This option is not useful
+	  for distributions or general kernels, but only for kernel
+	  developers working on dma-buf, dma-fences and, of course,
+	  reservation_objects.
+
+	  Say N if you are unsure
+
 endmenu
diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile
index c33bf8863147..48333a59006b 100644
--- a/drivers/dma-buf/Makefile
+++ b/drivers/dma-buf/Makefile
@@ -1,3 +1,4 @@
 obj-y := dma-buf.o dma-fence.o dma-fence-array.o reservation.o seqno-fence.o
 obj-$(CONFIG_SYNC_FILE)		+= sync_file.o
 obj-$(CONFIG_SW_SYNC)		+= sw_sync.o sync_debug.o
+obj-$(CONFIG_RESERVATION_SELFTEST) += test-reservation.o
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index e72e64484131..bc87b48a0d3d 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -139,10 +139,9 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll)
 {
 	struct dma_buf *dmabuf;
 	struct reservation_object *resv;
-	struct reservation_object_list *fobj;
-	struct dma_fence *fence_excl;
+	struct dma_fence *excl;
 	unsigned long events;
-	unsigned shared_count, seq;
+	unsigned int seq;
 
 	dmabuf = file->private_data;
 	if (!dmabuf || !dmabuf->resv)
@@ -160,22 +159,18 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll)
 	seq = read_seqcount_begin(&resv->seq);
 	rcu_read_lock();
 
-	fobj = rcu_dereference(resv->fence);
-	if (fobj)
-		shared_count = fobj->shared_count;
-	else
-		shared_count = 0;
-	fence_excl = rcu_dereference(resv->fence_excl);
+	excl = rcu_dereference(resv->excl);
 	if (read_seqcount_retry(&resv->seq, seq)) {
 		rcu_read_unlock();
 		goto retry;
 	}
 
-	if (fence_excl && (!(events & POLLOUT) || shared_count == 0)) {
+	if (excl &&
+	    (!(events & POLLOUT) || !reservation_object_has_shared(resv))) {
 		struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_excl;
 		unsigned long pevents = POLLIN;
 
-		if (shared_count == 0)
+		if (!reservation_object_has_shared(resv))
 			pevents |= POLLOUT;
 
 		spin_lock_irq(&dmabuf->poll.lock);
@@ -187,28 +182,28 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll)
 		spin_unlock_irq(&dmabuf->poll.lock);
 
 		if (events & pevents) {
-			if (!dma_fence_get_rcu(fence_excl)) {
+			if (!dma_fence_get_rcu(excl)) {
 				/* force a recheck */
 				events &= ~pevents;
 				dma_buf_poll_cb(NULL, &dcb->cb);
-			} else if (!dma_fence_add_callback(fence_excl, &dcb->cb,
+			} else if (!dma_fence_add_callback(excl, &dcb->cb,
 							   dma_buf_poll_cb)) {
 				events &= ~pevents;
-				dma_fence_put(fence_excl);
+				dma_fence_put(excl);
 			} else {
 				/*
 				 * No callback queued, wake up any additional
 				 * waiters.
 				 */
-				dma_fence_put(fence_excl);
+				dma_fence_put(excl);
 				dma_buf_poll_cb(NULL, &dcb->cb);
 			}
 		}
 	}
 
-	if ((events & POLLOUT) && shared_count > 0) {
+	if ((events & POLLOUT) && reservation_object_has_shared(resv)) {
 		struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_shared;
-		int i;
+		struct reservation_shared_iter iter;
 
 		/* Only queue a new callback if no event has fired yet */
 		spin_lock_irq(&dmabuf->poll.lock);
@@ -221,8 +216,8 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll)
 		if (!(events & POLLOUT))
 			goto out;
 
-		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *fence = rcu_dereference(fobj->shared[i]);
+		reservation_object_for_each_shared(resv, iter) {
+			struct dma_fence *fence = iter.fence;
 
 			if (!dma_fence_get_rcu(fence)) {
 				/*
@@ -245,7 +240,7 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll)
 		}
 
 		/* No callback queued, wake up any additional waiters. */
-		if (i == shared_count)
+		if (events & POLLOUT)
 			dma_buf_poll_cb(NULL, &dcb->cb);
 	}
 
diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
index 393817e849ed..65bf881b5974 100644
--- a/drivers/dma-buf/reservation.c
+++ b/drivers/dma-buf/reservation.c
@@ -46,6 +46,14 @@
  * write-side updates.
  */
 
+#if 0
+#define assert(x) BUG_ON(!(x))
+#define dbg(x) pr_err x
+#else
+#define assert(x) do { } while (0)
+#define dbg(x) do { } while (0)
+#endif
+
 DEFINE_WW_CLASS(reservation_ww_class);
 EXPORT_SYMBOL(reservation_ww_class);
 
@@ -55,149 +63,262 @@ EXPORT_SYMBOL(reservation_seqcount_class);
 const char reservation_seqcount_string[] = "reservation_seqcount";
 EXPORT_SYMBOL(reservation_seqcount_string);
 
-/**
- * reservation_object_reserve_shared - Reserve space to add a shared
- * fence to a reservation_object.
- * @obj: reservation object
- *
- * Should be called before reservation_object_add_shared_fence().  Must
- * be called with obj->lock held.
- *
- * RETURNS
- * Zero for success, or -errno
- */
-int reservation_object_reserve_shared(struct reservation_object *obj)
+#define SHIFT ilog2(NSHARED)
+#define MASK (NSHARED - 1)
+
+void __reservation_shared_iter_next(struct reservation_shared_iter *iter)
 {
-	struct reservation_object_list *fobj, *old;
-	u32 max;
-
-	old = reservation_object_get_list(obj);
-
-	if (old && old->shared_max) {
-		if (old->shared_count < old->shared_max) {
-			/* perform an in-place update */
-			kfree(obj->staged);
-			obj->staged = NULL;
-			return 0;
-		} else
-			max = old->shared_max * 2;
-	} else
-		max = 4;
-
-	/*
-	 * resize obj->staged or allocate if it doesn't exist,
-	 * noop if already correct size
-	 */
-	fobj = krealloc(obj->staged, offsetof(typeof(*fobj), shared[max]),
-			GFP_KERNEL);
-	if (!fobj)
-		return -ENOMEM;
+	struct reservation_shared_layer *p = iter->p;
+	int pos, h;
 
-	obj->staged = fobj;
-	fobj->shared_max = max;
-	return 0;
-}
-EXPORT_SYMBOL(reservation_object_reserve_shared);
+	do {
+		p = p->parent;
+		if (!p) {
+			iter->fence = NULL;
+			return;
+		}
 
-static void
-reservation_object_add_shared_inplace(struct reservation_object *obj,
-				      struct reservation_object_list *fobj,
-				      struct dma_fence *fence)
-{
-	u32 i;
+		h = p->height / SHIFT;
+		pos = fns(p->bitmap, iter->stack[h] + 1);
+	} while (!pos);
 
-	dma_fence_get(fence);
+	iter->stack[h] = --pos;
 
-	preempt_disable();
-	write_seqcount_begin(&obj->seq);
+	__reservation_shared_iter_fill(iter, p->slot[pos]);
+}
+EXPORT_SYMBOL(__reservation_shared_iter_next);
 
-	for (i = 0; i < fobj->shared_count; ++i) {
-		struct dma_fence *old_fence;
+#define ptr_mask(ptr) (__alignof__(*(ptr)) - 1)
 
-		old_fence = rcu_dereference_protected(fobj->shared[i],
-						reservation_object_held(obj));
+#define ptr_mask_bits(ptr) ({						\
+	unsigned long __v = (unsigned long)(ptr);			\
+	(typeof(ptr))(__v & ~ptr_mask(ptr));				\
+})
 
-		if (old_fence->context == fence->context) {
-			/* memory barrier is added by write_seqcount_begin */
-			RCU_INIT_POINTER(fobj->shared[i], fence);
-			write_seqcount_end(&obj->seq);
-			preempt_enable();
+#define ptr_get_bits(ptr) ({						\
+	unsigned long __v = (unsigned long)(ptr);			\
+	(__v & ptr_mask(ptr));						\
+})
 
-			dma_fence_put(old_fence);
-			return;
-		}
+#define ptr_set_bits(ptr, x) ({						\
+	unsigned long __v = (unsigned long)(ptr);			\
+	(typeof(ptr))(__v | (x));					\
+})
+
+static void shared_free_layers(struct reservation_shared_layer *p)
+{
+	unsigned int i;
+
+	if (p->height) {
+		for (; (i = ffs(p->bitmap)); p->bitmap &= ~0 << i)
+			shared_free_layers(p->slot[i - 1]);
+	} else {
+		for (; (i = ffs(p->bitmap)); p->bitmap &= ~0 << i)
+			dma_fence_put(p->slot[i - 1]);
 	}
 
-	/*
-	 * memory barrier is added by write_seqcount_begin,
-	 * fobj->shared_count is protected by this lock too
-	 */
-	RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
-	fobj->shared_count++;
+	/* Defer the free until after any concurrent readers finish. */
+	p->parent = NULL;
+	kfree_rcu(p, rcu);
+}
 
-	write_seqcount_end(&obj->seq);
-	preempt_enable();
+static void shared_free(struct reservation_shared *shared)
+{
+	if (!shared->top)
+		return;
+
+	shared_free_layers(shared->top);
 }
 
-static void
-reservation_object_add_shared_replace(struct reservation_object *obj,
-				      struct reservation_object_list *old,
-				      struct reservation_object_list *fobj,
-				      struct dma_fence *fence)
+void reservation_shared_destroy(struct reservation_shared *shared)
 {
-	unsigned i;
-	struct dma_fence *old_fence = NULL;
+	struct reservation_shared_layer *p;
 
-	dma_fence_get(fence);
+	shared_free(shared);
 
-	if (!old) {
-		RCU_INIT_POINTER(fobj->shared[0], fence);
-		fobj->shared_count = 1;
-		goto done;
+	while (shared->freed) {
+		p = ptr_mask_bits(shared->freed);
+		shared->freed = p->parent;
+		kfree(p);
 	}
+}
+EXPORT_SYMBOL(reservation_shared_destroy);
 
-	/*
-	 * no need to bump fence refcounts, rcu_read access
-	 * requires the use of kref_get_unless_zero, and the
-	 * references from the old struct are carried over to
-	 * the new.
-	 */
-	fobj->shared_count = old->shared_count;
+__malloc static struct reservation_shared_layer *
+shared_alloc_layer(struct reservation_shared *shared)
+{
+	struct reservation_shared_layer *p;
+
+	p = ptr_mask_bits(shared->freed);
+	shared->freed = p->parent;
+
+	return p;
+}
 
-	for (i = 0; i < old->shared_count; ++i) {
-		struct dma_fence *check;
+static int layer_idx(const struct reservation_shared_layer *p, u64 id)
+{
+	return (id >> p->height) & MASK;
+}
 
-		check = rcu_dereference_protected(old->shared[i],
-						reservation_object_held(obj));
+static void *
+shared_fence_replace(struct reservation_shared *shared, u64 id, void *item)
+{
+	struct reservation_shared_layer *p, *cur;
+	unsigned int idx;
 
-		if (!old_fence && check->context == fence->context) {
-			old_fence = check;
-			RCU_INIT_POINTER(fobj->shared[i], fence);
-		} else
-			RCU_INIT_POINTER(fobj->shared[i], check);
+	dbg(("%s(id=%llx)\n", __func__, id));
+
+	/* First see if this fence is in the same layer as the previous fence */
+	p = shared->hint;
+	if (p && (id >> SHIFT) == p->prefix) {
+		assert(p->height == 0);
+		goto found_layer;
 	}
-	if (!old_fence) {
-		RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
-		fobj->shared_count++;
+
+	p = shared->top;
+	if (!p) {
+		cur = shared_alloc_layer(shared);
+		cur->parent = NULL;
+		shared->top = cur;
+		goto new_layer;
 	}
 
-done:
-	preempt_disable();
-	write_seqcount_begin(&obj->seq);
-	/*
-	 * RCU_INIT_POINTER can be used here,
-	 * seqcount provides the necessary barriers
+	/* No shortcut, we have to descend the tree to find the right layer
+	 * containing this fence.
+	 *
+	 * Each layer in the tree holds 16 (NSHARED) pointers, either fences
+	 * or lower layers. Leaf nodes (height = 0) contain the fences, all
+	 * other nodes (height > 0) are internal layers that point to a lower
+	 * node. Each internal layer has at least 2 descendants.
+	 *
+	 * Starting at the top, we check whether the current prefix matches. If
+	 * it doesn't, we have gone past our layer and need to insert a join
+	 * into the tree, and a new leaf node as a descendant as well as the
+	 * original layer.
+	 *
+	 * The matching prefix means we are still following the right branch
+	 * of the tree. If it has height 0, we have found our leaf and just
+	 * need to replace the fence slot with ourselves. If the height is
+	 * not zero, our slot contains the next layer in the tree (unless
+	 * it is empty, in which case we can add ourselves as a new leaf).
+	 * As we descend the tree, the prefix grows (and the height decreases).
 	 */
-	RCU_INIT_POINTER(obj->fence, fobj);
-	write_seqcount_end(&obj->seq);
-	preempt_enable();
+	do {
+		dbg(("id=%llx, p->(prefix=%llx, height=%d), delta=%llx, idx=%x\n",
+		     id, p->prefix, p->height,
+		     id >> p->height >> SHIFT,
+		     layer_idx(p, id)));
+
+		if ((id >> p->height >> SHIFT) != p->prefix) {
+			/* insert a join above the current layer */
+			cur = shared_alloc_layer(shared);
+			cur->height = ALIGN(fls64((id >> p->height >> SHIFT) ^ p->prefix),
+					    SHIFT) + p->height;
+			cur->prefix = id >> cur->height >> SHIFT;
+
+			dbg(("id=%llx, join prefix=%llu, height=%d\n",
+			     id, cur->prefix, cur->height));
+
+			assert((id >> cur->height >> SHIFT) == cur->prefix);
+			assert(cur->height > p->height);
+
+			if (p->parent) {
+				assert(p->parent->slot[layer_idx(p->parent, id)] == p);
+				assert(cur->height < p->parent->height);
+				p->parent->slot[layer_idx(p->parent, id)] = cur;
+			} else {
+				shared->top = cur;
+			}
+			cur->parent = p->parent;
+
+			idx = p->prefix >> (cur->height - p->height - SHIFT) & MASK;
+			cur->slot[idx] = p;
+			cur->bitmap |= BIT(idx);
+			p->parent = cur;
+		} else if (!p->height) {
+			/* matching base layer */
+			shared->hint = p;
+			goto found_layer;
+		} else {
+			/* descend into the next layer */
+			idx = layer_idx(p, id);
+			cur = p->slot[idx];
+			if (unlikely(!cur)) {
+				cur = shared_alloc_layer(shared);
+				p->slot[idx] = cur;
+				p->bitmap |= BIT(idx);
+				cur->parent = p;
+				goto new_layer;
+			}
+		}
+
+		p = cur;
+	} while (1);
+
+found_layer:
+	assert(p->height == 0);
+	idx = id & MASK;
+	cur = p->slot[idx];
+	p->slot[idx] = item;
+	p->bitmap |= BIT(idx);
+	dbg(("id=%llx, found existing layer prefix=%llx, idx=%x [bitmap=%x]\n",
+	     id, p->prefix, idx, p->bitmap));
+	return (void *)cur;
+
+new_layer:
+	assert(cur->height == 0);
+	assert(cur->bitmap == 0);
+	cur->prefix = id >> SHIFT;
+	cur->slot[id & MASK] = item;
+	cur->bitmap = BIT(id & MASK);
+	shared->hint = cur;
+	dbg(("id=%llx, new layer prefix=%llx, idx=%x [bitmap=%x]\n",
+	     id, cur->prefix, (int)(id & MASK), cur->bitmap));
+	return NULL;
+}
 
-	if (old)
-		kfree_rcu(old, rcu);
+/**
+ * reservation_object_reserve_shared - Reserve space to add a shared
+ * fence to a reservation_object.
+ * @obj: reservation object
+ *
+ * Should be called before reservation_object_add_shared_fence().  Must
+ * be called with obj->lock held.
+ *
+ * RETURNS
+ * Zero for success, or -errno
+ */
+int reservation_object_reserve_shared(struct reservation_object *obj)
+{
+	struct reservation_shared *shared = &obj->shared;
+	int count;
+
+	reservation_object_assert_held(obj);
+	might_sleep();
+
+	/* To guarantee being able to replace a fence in the radixtree,
+	 * we need at most 2 layers: one to create a join in the tree,
+	 * and one to contain the fence. Typically we expect to reuse
+	 * a layer and so avoid any insertions.
+	 *
+	 * We use the low bits of the freed list to track its length
+	 * since we only need a couple of bits.
+	 */
+	count = ptr_get_bits(shared->freed);
+	while (count++ < 2) {
+		struct reservation_shared_layer *p;
 
-	if (old_fence)
-		dma_fence_put(old_fence);
+		p = kzalloc(sizeof(*p), GFP_KERNEL);
+		if (unlikely(!p))
+			return -ENOMEM;
+
+		p->parent = shared->freed;
+		shared->freed = ptr_set_bits(p, count);
+	}
+
+	return 0;
 }
+EXPORT_SYMBOL(reservation_object_reserve_shared);
 
 /**
  * reservation_object_add_shared_fence - Add a fence to a shared slot
@@ -210,16 +331,21 @@ reservation_object_add_shared_replace(struct reservation_object *obj,
 void reservation_object_add_shared_fence(struct reservation_object *obj,
 					 struct dma_fence *fence)
 {
-	struct reservation_object_list *old, *fobj = obj->staged;
+	struct dma_fence *old_fence;
+
+	reservation_object_assert_held(obj);
+
+	dma_fence_get(fence);
+
+	preempt_disable();
+	write_seqcount_begin(&obj->seq);
+
+	old_fence = shared_fence_replace(&obj->shared, fence->context, fence);
 
-	old = reservation_object_get_list(obj);
-	obj->staged = NULL;
+	write_seqcount_end(&obj->seq);
+	preempt_enable();
 
-	if (!fobj) {
-		BUG_ON(old->shared_count >= old->shared_max);
-		reservation_object_add_shared_inplace(obj, old, fence);
-	} else
-		reservation_object_add_shared_replace(obj, old, fobj, fence);
+	dma_fence_put(old_fence);
 }
 EXPORT_SYMBOL(reservation_object_add_shared_fence);
 
@@ -233,33 +359,27 @@ EXPORT_SYMBOL(reservation_object_add_shared_fence);
 void reservation_object_add_excl_fence(struct reservation_object *obj,
 				       struct dma_fence *fence)
 {
-	struct dma_fence *old_fence = reservation_object_get_excl(obj);
-	struct reservation_object_list *old;
-	u32 i = 0;
+	struct dma_fence *old_fence = obj->excl;
+	struct reservation_shared old_shared = obj->shared;
 
-	old = reservation_object_get_list(obj);
-	if (old)
-		i = old->shared_count;
+	reservation_object_assert_held(obj);
 
-	if (fence)
-		dma_fence_get(fence);
+	dma_fence_get(fence);
 
 	preempt_disable();
 	write_seqcount_begin(&obj->seq);
+
 	/* write_seqcount_begin provides the necessary memory barrier */
-	RCU_INIT_POINTER(obj->fence_excl, fence);
-	if (old)
-		old->shared_count = 0;
+	RCU_INIT_POINTER(obj->excl, fence);
+	reservation_shared_init(&obj->shared);
+
 	write_seqcount_end(&obj->seq);
 	preempt_enable();
 
-	/* inplace update, no shared fences */
-	while (i--)
-		dma_fence_put(rcu_dereference_protected(old->shared[i],
-						reservation_object_held(obj)));
+	shared_free(&old_shared);
+	obj->shared.freed = old_shared.freed;
 
-	if (old_fence)
-		dma_fence_put(old_fence);
+	dma_fence_put(old_fence);
 }
 EXPORT_SYMBOL(reservation_object_add_excl_fence);
 
@@ -280,75 +400,77 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
 				      unsigned *pshared_count,
 				      struct dma_fence ***pshared)
 {
-	struct dma_fence **shared = NULL;
-	struct dma_fence *fence_excl;
-	unsigned int shared_count;
-	int ret = 1;
-
-	do {
-		struct reservation_object_list *fobj;
-		unsigned seq;
-		unsigned int i;
+	struct dma_fence **shared = NULL, *excl;
+	unsigned int sz = 0, count;
 
-		shared_count = i = 0;
+	rcu_read_lock();
+	for (;;) {
+		struct reservation_shared_iter iter;
+		unsigned int seq;
 
-		rcu_read_lock();
+restart:
 		seq = read_seqcount_begin(&obj->seq);
 
-		fence_excl = rcu_dereference(obj->fence_excl);
-		if (fence_excl && !dma_fence_get_rcu(fence_excl))
-			goto unlock;
-
-		fobj = rcu_dereference(obj->fence);
-		if (fobj) {
-			struct dma_fence **nshared;
-			size_t sz = sizeof(*shared) * fobj->shared_max;
-
-			nshared = krealloc(shared, sz,
-					   GFP_NOWAIT | __GFP_NOWARN);
-			if (!nshared) {
-				rcu_read_unlock();
-				nshared = krealloc(shared, sz, GFP_KERNEL);
-				if (nshared) {
-					shared = nshared;
-					continue;
-				}
+		excl = rcu_dereference(obj->excl);
+		if (excl && !dma_fence_get_rcu(excl))
+			continue;
+
+		count = 0;
+		reservation_object_for_each_shared(obj, iter) {
+			if (dma_fence_is_signaled(iter.fence))
+				continue;
 
-				ret = -ENOMEM;
+			if (!dma_fence_get_rcu(iter.fence))
 				break;
-			}
-			shared = nshared;
-			shared_count = fobj->shared_count;
 
-			for (i = 0; i < shared_count; ++i) {
-				shared[i] = rcu_dereference(fobj->shared[i]);
-				if (!dma_fence_get_rcu(shared[i]))
-					break;
+			if (count == sz) {
+				struct dma_fence **nshared;
+
+				sz = sz ? 2 * sz : 4;
+				nshared = krealloc(shared,
+						   sz * sizeof(*shared),
+						   GFP_NOWAIT | __GFP_NOWARN);
+				if (!nshared) {
+					rcu_read_unlock();
+
+					dma_fence_put(excl);
+					dma_fence_put(iter.fence);
+					while (count--)
+						dma_fence_put(shared[count]);
+					kfree(shared);
+
+					shared = kmalloc(sz * sizeof(*shared), GFP_TEMPORARY);
+					if (!shared)
+						return -ENOMEM;
+
+					rcu_read_lock();
+					goto restart;
+				}
+
+				shared = nshared;
 			}
-		}
 
-		if (i != shared_count || read_seqcount_retry(&obj->seq, seq)) {
-			while (i--)
-				dma_fence_put(shared[i]);
-			dma_fence_put(fence_excl);
-			goto unlock;
+			shared[count++] = iter.fence;
 		}
 
-		ret = 0;
-unlock:
-		rcu_read_unlock();
-	} while (ret);
+		if (!read_seqcount_retry(&obj->seq, seq))
+			break;
 
-	if (!shared_count) {
+		while (count--)
+			dma_fence_put(shared[count]);
+		dma_fence_put(excl);
+	}
+	rcu_read_unlock();
+
+	if (!count) {
 		kfree(shared);
 		shared = NULL;
 	}
 
-	*pshared_count = shared_count;
+	*pshared_count = count;
 	*pshared = shared;
-	*pfence_excl = fence_excl;
-
-	return ret;
+	*pfence_excl = excl;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(reservation_object_get_fences_rcu);
 
@@ -366,99 +488,46 @@ EXPORT_SYMBOL_GPL(reservation_object_get_fences_rcu);
  */
 long reservation_object_wait_timeout_rcu(struct reservation_object *obj,
 					 bool wait_all, bool intr,
-					 unsigned long timeout)
+					 long timeout)
 {
-	struct dma_fence *fence;
-	unsigned seq, shared_count, i = 0;
-	long ret = timeout ? timeout : 1;
+	struct reservation_shared_iter iter;
+	unsigned int seq;
 
+	rcu_read_lock();
 retry:
-	fence = NULL;
-	shared_count = 0;
 	seq = read_seqcount_begin(&obj->seq);
-	rcu_read_lock();
 
 	if (wait_all) {
-		struct reservation_object_list *fobj =
-						rcu_dereference(obj->fence);
-
-		if (fobj)
-			shared_count = fobj->shared_count;
-
-		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *lfence = rcu_dereference(fobj->shared[i]);
-
-			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
-				     &lfence->flags))
-				continue;
-
-			if (!dma_fence_get_rcu(lfence))
-				goto unlock_retry;
-
-			if (dma_fence_is_signaled(lfence)) {
-				dma_fence_put(lfence);
-				continue;
-			}
-
-			fence = lfence;
-			break;
-		}
+		reservation_object_for_each_shared(obj, iter)
+			if (!dma_fence_is_signaled(iter.fence))
+				goto wait;
 	}
 
-	if (!shared_count) {
-		struct dma_fence *fence_excl = rcu_dereference(obj->fence_excl);
+	iter.fence = rcu_dereference(obj->excl);
+	if (iter.fence && !dma_fence_is_signaled(iter.fence))
+		goto wait;
 
-		if (fence_excl &&
-		    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
-			      &fence_excl->flags)) {
-			if (!dma_fence_get_rcu(fence_excl))
-				goto unlock_retry;
+	if (read_seqcount_retry(&obj->seq, seq))
+		goto retry;
 
-			if (dma_fence_is_signaled(fence_excl))
-				dma_fence_put(fence_excl);
-			else
-				fence = fence_excl;
-		}
-	}
+	rcu_read_unlock();
+	return timeout;
 
+wait:
+	if (!dma_fence_get_rcu(iter.fence))
+		goto retry;
 	rcu_read_unlock();
-	if (fence) {
-		if (read_seqcount_retry(&obj->seq, seq)) {
-			dma_fence_put(fence);
-			goto retry;
-		}
 
-		ret = dma_fence_wait_timeout(fence, intr, ret);
-		dma_fence_put(fence);
-		if (ret > 0 && wait_all && (i + 1 < shared_count))
-			goto retry;
-	}
-	return ret;
+	timeout = dma_fence_wait_timeout(iter.fence, intr, timeout);
+	dma_fence_put(iter.fence);
+	if (timeout < 0)
+		return timeout;
 
-unlock_retry:
-	rcu_read_unlock();
+	rcu_read_lock();
 	goto retry;
 }
 EXPORT_SYMBOL_GPL(reservation_object_wait_timeout_rcu);
 
-
-static inline int
-reservation_object_test_signaled_single(struct dma_fence *passed_fence)
-{
-	struct dma_fence *fence, *lfence = passed_fence;
-	int ret = 1;
-
-	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &lfence->flags)) {
-		fence = dma_fence_get_rcu(lfence);
-		if (!fence)
-			return -1;
-
-		ret = !!dma_fence_is_signaled(fence);
-		dma_fence_put(fence);
-	}
-	return ret;
-}
-
 /**
  * reservation_object_test_signaled_rcu - Test if a reservation object's
  * fences have been signaled.
@@ -472,52 +541,29 @@ reservation_object_test_signaled_single(struct dma_fence *passed_fence)
 bool reservation_object_test_signaled_rcu(struct reservation_object *obj,
 					  bool test_all)
 {
-	unsigned seq, shared_count;
-	int ret;
+	struct reservation_shared_iter iter;
+	bool ret = false;
+	unsigned int seq;
 
 	rcu_read_lock();
 retry:
-	ret = true;
-	shared_count = 0;
 	seq = read_seqcount_begin(&obj->seq);
 
 	if (test_all) {
-		unsigned i;
-
-		struct reservation_object_list *fobj =
-						rcu_dereference(obj->fence);
-
-		if (fobj)
-			shared_count = fobj->shared_count;
-
-		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *fence = rcu_dereference(fobj->shared[i]);
-
-			ret = reservation_object_test_signaled_single(fence);
-			if (ret < 0)
-				goto retry;
-			else if (!ret)
-				break;
-		}
-
-		if (read_seqcount_retry(&obj->seq, seq))
-			goto retry;
+		reservation_object_for_each_shared(obj, iter)
+			if (!dma_fence_is_signaled(iter.fence))
+				goto busy;
 	}
 
-	if (!shared_count) {
-		struct dma_fence *fence_excl = rcu_dereference(obj->fence_excl);
+	iter.fence = rcu_dereference(obj->excl);
+	if (iter.fence && !dma_fence_is_signaled(iter.fence))
+		goto busy;
 
-		if (fence_excl) {
-			ret = reservation_object_test_signaled_single(
-								fence_excl);
-			if (ret < 0)
-				goto retry;
-
-			if (read_seqcount_retry(&obj->seq, seq))
-				goto retry;
-		}
-	}
+	if (read_seqcount_retry(&obj->seq, seq))
+		goto retry;
 
+	ret = true;
+busy:
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/drivers/dma-buf/test-reservation.c b/drivers/dma-buf/test-reservation.c
new file mode 100644
index 000000000000..bd07186c9457
--- /dev/null
+++ b/drivers/dma-buf/test-reservation.c
@@ -0,0 +1,308 @@
+/*
+ * Test cases for struct reservation_object
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/dma-fence.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/reservation.h>
+
+#define SHIFT (ilog2(NSHARED))
+#define NFENCES 4096
+
+static const char *fake_get_driver_name(struct dma_fence *fence)
+{
+	return "test-reservation";
+}
+
+static const char *fake_get_timeline_name(struct dma_fence *fence)
+{
+	return "test";
+}
+
+static bool fake_enable_signaling(struct dma_fence *fence)
+{
+	return true;
+}
+
+static void fake_release(struct dma_fence *fence)
+{
+	WARN(1, "invalid fence unref\n");
+}
+
+const struct dma_fence_ops fake_fence_ops = {
+	.get_driver_name = fake_get_driver_name,
+	.get_timeline_name = fake_get_timeline_name,
+	.enable_signaling = fake_enable_signaling,
+	.wait = dma_fence_default_wait,
+	.release = fake_release,
+};
+
+static int validate_layer(struct reservation_shared_layer *parent,
+			  struct reservation_shared_layer *p,
+			  int idx)
+{
+	int ret;
+	int n;
+
+	if (parent && p->height >= parent->height) {
+		pr_err("child layer (prefix=%llx) has greater height [%d] than parent [%d] (prefix=%llx)\n",
+		       p->prefix, p->height, parent->height, parent->prefix);
+		return -EINVAL;
+	}
+
+	if (parent && (p->prefix >> parent->height) != (parent->prefix >> p->height)) {
+		pr_err("child layer (prefix=%llx, height=%d) does not fit in parent (prefix %llx, height %d)\n",
+		       p->prefix, p->height,
+		       parent->prefix, parent->height);
+		return -EINVAL;
+	}
+
+	if (parent &&
+	    ((p->prefix >> (parent->height - p->height - SHIFT)) & (NSHARED - 1)) != idx) {
+		pr_err("child layer (prefix=%llx, height=%d) in position %d does not match expected %d of parent (prefix=%llx, height=%d)\n",
+		       p->prefix, p->height,
+		       idx,
+		       (int)((p->prefix >> (parent->height - p->height - SHIFT)) & (NSHARED - 1)),
+		       parent->prefix, parent->height);
+		return -EINVAL;
+	}
+
+	for (n = 0; n < NSHARED; n++) {
+		bool has_bit = p->bitmap & BIT(n);
+		bool has_child = p->slot[n];
+
+		if (has_bit ^ has_child) {
+			pr_err("layer (prefix=%llx, height=%d) inconsistent bitmap position %d, has_child=%d, has_bit=%dn",
+			       p->prefix, p->height, n, has_bit, has_child);
+			return -EINVAL;
+		}
+
+		if (!p->slot[n] || !p->height)
+			continue;
+
+		ret = validate_layer(p, p->slot[n], n);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int validate_tree(struct reservation_object *resv)
+{
+	if (!resv->shared.top)
+		return 0;
+
+	return validate_layer(NULL, resv->shared.top, 0);
+}
+
+enum direction {
+	forward,
+	backward,
+	random,
+};
+
+static const char *direction_string(enum direction dir)
+{
+	switch (dir) {
+	case forward: return "forward";
+	case backward: return "backward";
+	case random: return "random";
+	default: return "unknown!";
+	}
+}
+
+static int test_fences(u64 stride, enum direction dir)
+{
+	DEFINE_SPINLOCK(lock);
+	struct dma_fence *fences;
+	struct reservation_object resv;
+	struct reservation_shared_iter iter;
+	struct dma_fence **shared, *excl;
+	unsigned int nshared, n;
+	int *order;
+	u64 context;
+	int ret = -EINVAL;
+
+	fences = kmalloc_array(NFENCES, sizeof(*fences), GFP_KERNEL);
+	if (!fences)
+		return -ENOMEM;
+
+	order = kmalloc_array(NFENCES, sizeof(*order), GFP_KERNEL);
+	if (!order) {
+		kfree(fences);
+		return -ENOMEM;
+	}
+
+	pr_info("Testing %d fences with context stride %llu, %s\n",
+		NFENCES, stride, direction_string(dir));
+
+	reservation_object_init(&resv);
+
+	context = 1;
+	for (n = 0; n < NFENCES; n++) {
+		dma_fence_init(&fences[n], &fake_fence_ops, &lock,
+			       context, n);
+		order[n] = dir == backward ? NFENCES - n - 1 : n;
+		context += stride;
+	}
+
+	if (dir == random) {
+		for (n = NFENCES-1; n > 1; n--) {
+			int r = get_random_int() % (n + 1);
+			if (r != n) {
+				int tmp = order[n];
+				order[n] = order[r];
+				order[r] = tmp;
+			}
+		}
+	}
+
+	ww_mutex_lock(&resv.lock, NULL);
+	for (n = 0; n < NFENCES; n++) {
+		if (reservation_object_reserve_shared(&resv) == 0)
+			reservation_object_add_shared_fence(&resv,
+							    &fences[order[n]]);
+	}
+	ww_mutex_unlock(&resv.lock);
+	kfree(order);
+
+	if (validate_tree(&resv)) {
+		pr_err("reservation object has an invalid tree!\n");
+		goto out;
+	}
+
+	if (!reservation_object_has_shared(&resv)) {
+		pr_err("reservation object has no shared fences!\n");
+		goto out;
+	}
+
+	n = 0;
+	reservation_object_for_each_shared(&resv, iter) {
+		if (iter.fence != &fences[n]) {
+			pr_err("fence[%d] iter out of order: found %llx [%d], expected %llx\n",
+			       n,
+			       iter.fence->context,
+			       iter.fence->seqno,
+			       fences[n].context);
+			goto out;
+		}
+		n++;
+	}
+	if (n != NFENCES) {
+		pr_err("iterated over %d shared fences, expected %d\n", nshared, NFENCES);
+		goto out;
+	}
+
+	if (reservation_object_get_fences_rcu(&resv, &excl,
+					      &nshared, &shared)) {
+		pr_err("reservation_object_get_fences_rcu failed\n");
+		goto out;
+	}
+
+	if (excl) {
+		pr_err("reservation_object_get_fences_rcu reported an exclusive fence\n");
+		goto out;
+	}
+
+	if (nshared != NFENCES) {
+		pr_err("reservation_object_get_fences_rcu reported %d shared fences, expected %d\n", nshared, NFENCES);
+		goto out;
+	}
+
+	for (n = 0; n < NFENCES; n++) {
+		if (shared[n] != &fences[n]) {
+			pr_err("fence[%d] iter out of order: found %llx [%d], expected %llx\n",
+			       n,
+			       shared[n]->context,
+			       shared[n]->seqno,
+			       fences[n].context);
+			goto out;
+		}
+		dma_fence_put(shared[n]);
+	}
+	kfree(shared);
+
+	if (!reservation_object_test_signaled_rcu(&resv, false)) {
+		pr_err("reservation object not signaled [exclusive]\n");
+		goto out;
+	}
+
+	if (reservation_object_test_signaled_rcu(&resv, true)) {
+		pr_err("reservation object was signaled [all]\n");
+		goto out;
+	}
+
+	//reservation_object_wait_timeout_rcu(&resv, true, false, 0);
+
+	reservation_object_add_excl_fence(&resv, NULL);
+
+	for (n = 0; n < NFENCES; n++) {
+		if (atomic_read(&fences[n].refcount.refcount) > 1) {
+			pr_err("fence[%d] leaked, refcount now %d\n",
+			       n, atomic_read(&fences[n].refcount.refcount));
+			goto out;
+		}
+	}
+
+	if (reservation_object_has_shared(&resv)) {
+		pr_err("reservation object did not discard shared fences!\n");
+		goto out;
+	}
+
+	if (!reservation_object_test_signaled_rcu(&resv, false)) {
+		pr_err("empty reservation object not signaled [exclusive]\n");
+		goto out;
+	}
+
+	if (!reservation_object_test_signaled_rcu(&resv, true)) {
+		pr_err("empty reservation object not signaled [all]\n");
+		goto out;
+	}
+
+	reservation_object_fini(&resv);
+
+	ret = 0;
+out:
+	kfree(fences);
+	return ret;
+}
+
+static int __init test_reservation_init(void)
+{
+	const u64 max_stride = ~0ull / NFENCES;
+	int strides[] = { NSHARED-1, NSHARED, NSHARED+1 };
+	int ret, n;
+
+	pr_info("Testing reservation objects\n");
+
+	for (n = 0; n < ARRAY_SIZE(strides); n++) {
+		u64 stride;
+
+		for (stride = 1; stride < max_stride; stride *= strides[n]) {
+			enum direction dir;
+
+			for (dir = forward; dir <= random; dir++) {
+				ret = test_fences(stride, dir);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void __exit test_reservation_cleanup(void)
+{
+}
+
+module_init(test_reservation_init);
+module_exit(test_reservation_cleanup);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index ed814e6d0207..a660d36c0392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -178,10 +178,9 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
 		     struct reservation_object *resv,
 		     void *owner)
 {
-	struct reservation_object_list *flist;
+	struct reservation_shared_iter iter;
 	struct dma_fence *f;
 	void *fence_owner;
-	unsigned i;
 	int r = 0;
 
 	if (resv == NULL)
@@ -190,14 +189,11 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
 	/* always sync to the exclusive fence */
 	f = reservation_object_get_excl(resv);
 	r = amdgpu_sync_fence(adev, sync, f);
+	if (r)
+		return r;
 
-	flist = reservation_object_get_list(resv);
-	if (!flist || r)
-		return r;
-
-	for (i = 0; i < flist->shared_count; ++i) {
-		f = rcu_dereference_protected(flist->shared[i],
-					      reservation_object_held(resv));
+	reservation_object_for_each_shared(resv, iter) {
+		f = iter.fence;
 		if (amdgpu_sync_same_dev(adev, f)) {
 			/* VM updates are only interesting
 			 * for other VM updates and moves.
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 7d066a91d778..5473b1b8bb80 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -481,7 +481,7 @@ static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 {
 	struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
 	struct reservation_object *robj = etnaviv_obj->resv;
-	struct reservation_object_list *fobj;
+	struct reservation_shared_iter iter;
 	struct dma_fence *fence;
 	unsigned long off = drm_vma_node_start(&obj->vma_node);
 
@@ -491,17 +491,10 @@ static void etnaviv_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 			off, etnaviv_obj->vaddr, obj->size);
 
 	rcu_read_lock();
-	fobj = rcu_dereference(robj->fence);
-	if (fobj) {
-		unsigned int i, shared_count = fobj->shared_count;
+	reservation_object_for_each_shared(robj, iter)
+		etnaviv_gem_describe_fence(iter.fence, "Shared", m);
 
-		for (i = 0; i < shared_count; i++) {
-			fence = rcu_dereference(fobj->shared[i]);
-			etnaviv_gem_describe_fence(fence, "Shared", m);
-		}
-	}
-
-	fence = rcu_dereference(robj->fence_excl);
+	fence = rcu_dereference(robj->excl);
 	if (fence)
 		etnaviv_gem_describe_fence(fence, "Exclusive", m);
 	rcu_read_unlock();
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
index d2211825e5c8..7fcdf9dc86f8 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c
@@ -1020,9 +1020,9 @@ int etnaviv_gpu_fence_sync_obj(struct etnaviv_gem_object *etnaviv_obj,
 	unsigned int context, bool exclusive)
 {
 	struct reservation_object *robj = etnaviv_obj->resv;
-	struct reservation_object_list *fobj;
+	struct reservation_shared_iter iter;
 	struct dma_fence *fence;
-	int i, ret;
+	int ret;
 
 	if (!exclusive) {
 		ret = reservation_object_reserve_shared(robj);
@@ -1034,8 +1034,7 @@ int etnaviv_gpu_fence_sync_obj(struct etnaviv_gem_object *etnaviv_obj,
 	 * If we have any shared fences, then the exclusive fence
 	 * should be ignored as it will already have been signalled.
 	 */
-	fobj = reservation_object_get_list(robj);
-	if (!fobj || fobj->shared_count == 0) {
+	if (!reservation_object_has_shared(robj)) {
 		/* Wait on any existing exclusive fence which isn't our own */
 		fence = reservation_object_get_excl(robj);
 		if (fence && fence->context != context) {
@@ -1045,12 +1044,11 @@ int etnaviv_gpu_fence_sync_obj(struct etnaviv_gem_object *etnaviv_obj,
 		}
 	}
 
-	if (!exclusive || !fobj)
+	if (!exclusive)
 		return 0;
 
-	for (i = 0; i < fobj->shared_count; i++) {
-		fence = rcu_dereference_protected(fobj->shared[i],
-						reservation_object_held(robj));
+	reservation_object_for_each_shared(robj, iter) {
+		fence = iter.fence;
 		if (fence->context != context) {
 			ret = dma_fence_wait(fence, true);
 			if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ed4465d22dde..d58a6390f242 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3700,7 +3700,7 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 {
 	struct drm_i915_gem_busy *args = data;
 	struct drm_i915_gem_object *obj;
-	struct reservation_object_list *list;
+	struct reservation_shared_iter iter;
 	unsigned int seq;
 	int err;
 
@@ -3730,20 +3730,11 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 	seq = raw_read_seqcount(&obj->resv->seq);
 
 	/* Translate the exclusive fence to the READ *and* WRITE engine */
-	args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
+	args->busy = busy_check_writer(rcu_dereference(obj->resv->excl));
 
 	/* Translate shared fences to READ set of engines */
-	list = rcu_dereference(obj->resv->fence);
-	if (list) {
-		unsigned int shared_count = list->shared_count, i;
-
-		for (i = 0; i < shared_count; ++i) {
-			struct dma_fence *fence =
-				rcu_dereference(list->shared[i]);
-
-			args->busy |= busy_check_reader(fence);
-		}
-	}
+	reservation_object_for_each_shared(obj->resv, iter)
+		args->busy |= busy_check_reader(iter.fence);
 
 	if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
 		goto retry;
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 57db7dbbb618..eeee29968d0e 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -520,9 +520,9 @@ int msm_gem_sync_object(struct drm_gem_object *obj,
 		struct msm_fence_context *fctx, bool exclusive)
 {
 	struct msm_gem_object *msm_obj = to_msm_bo(obj);
-	struct reservation_object_list *fobj;
+	struct reservation_shared_iter iter;
 	struct dma_fence *fence;
-	int i, ret;
+	int ret;
 
 	if (!exclusive) {
 		/* NOTE: _reserve_shared() must happen before _add_shared_fence(),
@@ -535,8 +535,7 @@ int msm_gem_sync_object(struct drm_gem_object *obj,
 			return ret;
 	}
 
-	fobj = reservation_object_get_list(msm_obj->resv);
-	if (!fobj || (fobj->shared_count == 0)) {
+	if (!reservation_object_has_shared(msm_obj->resv)) {
 		fence = reservation_object_get_excl(msm_obj->resv);
 		/* don't need to wait on our own fences, since ring is fifo */
 		if (fence && (fence->context != fctx->context)) {
@@ -546,12 +545,11 @@ int msm_gem_sync_object(struct drm_gem_object *obj,
 		}
 	}
 
-	if (!exclusive || !fobj)
+	if (!exclusive)
 		return 0;
 
-	for (i = 0; i < fobj->shared_count; i++) {
-		fence = rcu_dereference_protected(fobj->shared[i],
-						reservation_object_held(msm_obj->resv));
+	reservation_object_for_each_shared(msm_obj->resv, iter) {
+		fence = iter.fence;
 		if (fence->context != fctx->context) {
 			ret = dma_fence_wait(fence, true);
 			if (ret)
@@ -630,7 +628,7 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 {
 	struct msm_gem_object *msm_obj = to_msm_bo(obj);
 	struct reservation_object *robj = msm_obj->resv;
-	struct reservation_object_list *fobj;
+	struct reservation_shared_iter iter;
 	struct dma_fence *fence;
 	uint64_t off = drm_vma_node_start(&obj->vma_node);
 	const char *madv;
@@ -656,17 +654,10 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
 			off, msm_obj->vaddr, obj->size, madv);
 
 	rcu_read_lock();
-	fobj = rcu_dereference(robj->fence);
-	if (fobj) {
-		unsigned int i, shared_count = fobj->shared_count;
-
-		for (i = 0; i < shared_count; i++) {
-			fence = rcu_dereference(fobj->shared[i]);
-			describe_fence(fence, "Shared", m);
-		}
-	}
+	reservation_object_for_each_shared(robj, iter)
+		describe_fence(iter.fence, "Shared", m);
 
-	fence = rcu_dereference(robj->fence_excl);
+	fence = rcu_dereference(robj->excl);
 	if (fence)
 		describe_fence(fence, "Exclusive", m);
 	rcu_read_unlock();
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index f2f348f0160c..c5b0967f5f6e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -393,21 +393,19 @@ nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 	struct nouveau_fence_chan *fctx = chan->fence;
 	struct dma_fence *fence;
 	struct reservation_object *resv = nvbo->bo.resv;
-	struct reservation_object_list *fobj;
+	struct reservation_shared_iter iter;
 	struct nouveau_fence *f;
-	int ret = 0, i;
+	int ret = 0;
 
 	if (!exclusive) {
 		ret = reservation_object_reserve_shared(resv);
-
 		if (ret)
 			return ret;
 	}
 
-	fobj = reservation_object_get_list(resv);
 	fence = reservation_object_get_excl(resv);
 
-	if (fence && (!exclusive || !fobj || !fobj->shared_count)) {
+	if (fence && (!exclusive || !reservation_object_has_shared(resv))) {
 		struct nouveau_channel *prev = NULL;
 		bool must_wait = true;
 
@@ -426,15 +424,14 @@ nouveau_fence_sync(struct nouveau_bo *nvbo, struct nouveau_channel *chan, bool e
 		return ret;
 	}
 
-	if (!exclusive || !fobj)
+	if (!exclusive)
 		return ret;
 
-	for (i = 0; i < fobj->shared_count && !ret; ++i) {
+	reservation_object_for_each_shared(resv, iter) {
 		struct nouveau_channel *prev = NULL;
 		bool must_wait = true;
 
-		fence = rcu_dereference_protected(fobj->shared[i],
-						reservation_object_held(resv));
+		fence = iter.fence;
 
 		f = nouveau_local_fence(fence, chan->drm);
 		if (f) {
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 201b52b750dd..dead9630e472 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -118,19 +118,22 @@ nouveau_gem_object_unmap(struct nouveau_bo *nvbo, struct nvkm_vma *vma)
 {
 	const bool mapped = nvbo->bo.mem.mem_type != TTM_PL_SYSTEM;
 	struct reservation_object *resv = nvbo->bo.resv;
-	struct reservation_object_list *fobj;
+	struct reservation_shared_iter iter;
 	struct dma_fence *fence = NULL;
 
-	fobj = reservation_object_get_list(resv);
-
 	list_del(&vma->head);
 
-	if (fobj && fobj->shared_count > 1)
-		ttm_bo_wait(&nvbo->bo, false, false);
-	else if (fobj && fobj->shared_count == 1)
-		fence = rcu_dereference_protected(fobj->shared[0],
-						reservation_object_held(resv));
-	else
+	reservation_object_for_each_shared(resv, iter) {
+		if (fence) {
+			ttm_bo_wait(&nvbo->bo, false, false);
+			fence = NULL;
+			break;
+		}
+
+		fence = iter.fence;
+	}
+
+	if (!fence)
 		fence = reservation_object_get_excl(nvbo->bo.resv);
 
 	if (fence && mapped) {
diff --git a/drivers/gpu/drm/qxl/qxl_debugfs.c b/drivers/gpu/drm/qxl/qxl_debugfs.c
index 6911b8c44492..df00b77ac546 100644
--- a/drivers/gpu/drm/qxl/qxl_debugfs.c
+++ b/drivers/gpu/drm/qxl/qxl_debugfs.c
@@ -58,12 +58,13 @@ qxl_debugfs_buffers_info(struct seq_file *m, void *data)
 	struct qxl_bo *bo;
 
 	list_for_each_entry(bo, &qdev->gem.objects, list) {
-		struct reservation_object_list *fobj;
-		int rel;
+		struct reservation_shared_iter iter;
+		unsigned int rel;
 
+		rel = 0;
 		rcu_read_lock();
-		fobj = rcu_dereference(bo->tbo.resv->fence);
-		rel = fobj ? fobj->shared_count : 0;
+		reservation_object_for_each_shared(bo->tbo.resv, iter)
+			rel++;
 		rcu_read_unlock();
 
 		seq_printf(m, "size %ld, pc %d, num releases %d\n",
diff --git a/drivers/gpu/drm/radeon/radeon_sync.c b/drivers/gpu/drm/radeon/radeon_sync.c
index be5d7a38d3aa..72b56ed09df4 100644
--- a/drivers/gpu/drm/radeon/radeon_sync.c
+++ b/drivers/gpu/drm/radeon/radeon_sync.c
@@ -91,10 +91,9 @@ int radeon_sync_resv(struct radeon_device *rdev,
 		     struct reservation_object *resv,
 		     bool shared)
 {
-	struct reservation_object_list *flist;
+	struct reservation_shared_iter iter;
 	struct dma_fence *f;
 	struct radeon_fence *fence;
-	unsigned i;
 	int r = 0;
 
 	/* always sync to the exclusive fence */
@@ -105,14 +104,11 @@ int radeon_sync_resv(struct radeon_device *rdev,
 	else if (f)
 		r = dma_fence_wait(f, true);
 
-	flist = reservation_object_get_list(resv);
-	if (shared || !flist || r)
+	if (shared || !reservation_object_has_shared(resv) || r)
 		return r;
 
-	for (i = 0; i < flist->shared_count; ++i) {
-		f = rcu_dereference_protected(flist->shared[i],
-					      reservation_object_held(resv));
-		fence = to_radeon_fence(f);
+	reservation_object_for_each_shared(resv, iter) {
+		fence = to_radeon_fence(iter.fence);
 		if (fence && fence->rdev == rdev)
 			radeon_sync_fence(sync, fence);
 		else
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index d5063618efa7..5844c4aa6e1c 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -425,19 +425,15 @@ static void ttm_bo_cleanup_memtype_use(struct ttm_buffer_object *bo)
 
 static void ttm_bo_flush_all_fences(struct ttm_buffer_object *bo)
 {
-	struct reservation_object_list *fobj;
+	struct reservation_shared_iter iter;
 	struct dma_fence *fence;
-	int i;
 
-	fobj = reservation_object_get_list(bo->resv);
 	fence = reservation_object_get_excl(bo->resv);
 	if (fence && !fence->ops->signaled)
 		dma_fence_enable_sw_signaling(fence);
 
-	for (i = 0; fobj && i < fobj->shared_count; ++i) {
-		fence = rcu_dereference_protected(fobj->shared[i],
-					reservation_object_held(bo->resv));
-
+	reservation_object_for_each_shared(bo->resv, iter) {
+		fence = iter.fence;
 		if (!fence->ops->signaled)
 			dma_fence_enable_sw_signaling(fence);
 	}
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index 2e313cca08f0..3e93b0d7dacc 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -49,34 +49,47 @@ extern struct ww_class reservation_ww_class;
 extern struct lock_class_key reservation_seqcount_class;
 extern const char reservation_seqcount_string[];
 
-/**
- * struct reservation_object_list - a list of shared fences
- * @rcu: for internal use
- * @shared_count: table of shared fences
- * @shared_max: for growing shared fence table
- * @shared: shared fence table
- */
-struct reservation_object_list {
-	struct rcu_head rcu;
-	u32 shared_count, shared_max;
-	struct dma_fence __rcu *shared[];
+struct reservation_shared_layer;
+
+#define NSHARED 16
+
+struct reservation_shared_layer {
+	union {
+		u64 prefix;
+		struct rcu_head rcu;
+	};
+	unsigned int height;
+	unsigned int bitmap;
+	void *slot[NSHARED];
+	struct reservation_shared_layer *parent;
+};
+
+struct reservation_shared {
+	struct reservation_shared_layer *hint;
+	struct reservation_shared_layer *top;
+	struct reservation_shared_layer *freed;
 };
 
+static inline void reservation_shared_init(struct reservation_shared *shared)
+{
+	memset(shared, 0, sizeof(*shared));
+}
+
+void reservation_shared_destroy(struct reservation_shared *shared);
+
 /**
  * struct reservation_object - a reservation object manages fences for a buffer
  * @lock: update side lock
  * @seq: sequence count for managing RCU read-side synchronization
- * @fence_excl: the exclusive fence, if there is one currently
- * @fence: list of current shared fences
- * @staged: staged copy of shared fences for RCU updates
+ * @excl: the exclusive fence, if there is one currently
+ * @shared: list of current shared fences
  */
 struct reservation_object {
 	struct ww_mutex lock;
 	seqcount_t seq;
 
-	struct dma_fence __rcu *fence_excl;
-	struct reservation_object_list __rcu *fence;
-	struct reservation_object_list *staged;
+	struct dma_fence __rcu *excl;
+	struct reservation_shared shared;
 };
 
 #define reservation_object_held(obj) lockdep_is_held(&(obj)->lock.base)
@@ -93,9 +106,8 @@ reservation_object_init(struct reservation_object *obj)
 	ww_mutex_init(&obj->lock, &reservation_ww_class);
 
 	__seqcount_init(&obj->seq, reservation_seqcount_string, &reservation_seqcount_class);
-	RCU_INIT_POINTER(obj->fence, NULL);
-	RCU_INIT_POINTER(obj->fence_excl, NULL);
-	obj->staged = NULL;
+	RCU_INIT_POINTER(obj->excl, NULL);
+	reservation_shared_init(&obj->shared);
 }
 
 /**
@@ -105,46 +117,18 @@ reservation_object_init(struct reservation_object *obj)
 static inline void
 reservation_object_fini(struct reservation_object *obj)
 {
-	int i;
-	struct reservation_object_list *fobj;
-	struct dma_fence *excl;
-
 	/*
 	 * This object should be dead and all references must have
 	 * been released to it, so no need to be protected with rcu.
 	 */
-	excl = rcu_dereference_protected(obj->fence_excl, 1);
-	if (excl)
-		dma_fence_put(excl);
-
-	fobj = rcu_dereference_protected(obj->fence, 1);
-	if (fobj) {
-		for (i = 0; i < fobj->shared_count; ++i)
-			dma_fence_put(rcu_dereference_protected(fobj->shared[i], 1));
+	dma_fence_put(rcu_dereference_protected(obj->excl, 1));
 
-		kfree(fobj);
-	}
-	kfree(obj->staged);
+	reservation_shared_destroy(&obj->shared);
 
 	ww_mutex_destroy(&obj->lock);
 }
 
 /**
- * reservation_object_get_list - get the reservation object's
- * shared fence list, with update-side lock held
- * @obj: the reservation object
- *
- * Returns the shared fence list.  Does NOT take references to
- * the fence.  The obj->lock must be held.
- */
-static inline struct reservation_object_list *
-reservation_object_get_list(struct reservation_object *obj)
-{
-	return rcu_dereference_protected(obj->fence,
-					 reservation_object_held(obj));
-}
-
-/**
  * reservation_object_get_excl - get the reservation object's
  * exclusive fence, with update-side lock held
  * @obj: the reservation object
@@ -158,7 +142,7 @@ reservation_object_get_list(struct reservation_object *obj)
 static inline struct dma_fence *
 reservation_object_get_excl(struct reservation_object *obj)
 {
-	return rcu_dereference_protected(obj->fence_excl,
+	return rcu_dereference_protected(obj->excl,
 					 reservation_object_held(obj));
 }
 
@@ -181,7 +165,7 @@ reservation_object_get_excl_rcu(struct reservation_object *obj)
 retry:
 	seq = read_seqcount_begin(&obj->seq);
 	rcu_read_lock();
-	fence = rcu_dereference(obj->fence_excl);
+	fence = rcu_dereference(obj->excl);
 	if (read_seqcount_retry(&obj->seq, seq)) {
 		rcu_read_unlock();
 		goto retry;
@@ -191,6 +175,12 @@ reservation_object_get_excl_rcu(struct reservation_object *obj)
 	return fence;
 }
 
+static inline bool
+reservation_object_has_shared(struct reservation_object *obj)
+{
+	return READ_ONCE(obj->shared.top);
+}
+
 int reservation_object_reserve_shared(struct reservation_object *obj);
 void reservation_object_add_shared_fence(struct reservation_object *obj,
 					 struct dma_fence *fence);
@@ -205,9 +195,67 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj,
 
 long reservation_object_wait_timeout_rcu(struct reservation_object *obj,
 					 bool wait_all, bool intr,
-					 unsigned long timeout);
+					 long timeout);
 
 bool reservation_object_test_signaled_rcu(struct reservation_object *obj,
 					  bool test_all);
 
+struct reservation_shared_iter {
+	struct dma_fence *fence;
+	struct reservation_shared_layer *p;
+	u8 stack[16];
+};
+
+static inline void
+__reservation_shared_iter_fill(struct reservation_shared_iter *iter,
+			       struct reservation_shared_layer *p)
+{
+	int h;
+
+	do {
+		int pos = ffs(p->bitmap) - 1;
+
+		h = p->height / ilog2(NSHARED);
+		iter->stack[h] = pos;
+
+		iter->p = p;
+		p = p->slot[pos];
+	} while (h);
+
+	iter->fence = (void *)p;
+}
+
+static inline void
+reservation_shared_iter_init(struct reservation_object *obj,
+			     struct reservation_shared_iter *iter)
+{
+	if (obj->shared.top)
+		__reservation_shared_iter_fill(iter, obj->shared.top);
+	else
+		iter->fence = NULL;
+}
+
+#define fns(x, bit) (ffs((x) & (~(typeof(x))0 << (bit))))
+
+void __reservation_shared_iter_next(struct reservation_shared_iter *iter);
+
+static inline void
+reservation_shared_iter_next(struct reservation_shared_iter *iter)
+{
+	int pos;
+
+	pos = fns(iter->p->bitmap, iter->stack[0] + 1);
+	if (likely(pos)) {
+		iter->stack[0] = --pos;
+		iter->fence = iter->p->slot[pos];
+	} else {
+		__reservation_shared_iter_next(iter);
+	}
+}
+
+#define reservation_object_for_each_shared(obj, __i)			\
+	for (reservation_shared_iter_init(obj, &(__i));			\
+	     __i.fence;							\
+	     reservation_shared_iter_next(&(__i)))
+
 #endif /* _LINUX_RESERVATION_H */
diff --git a/tools/testing/selftests/dma-buf/reservation.sh b/tools/testing/selftests/dma-buf/reservation.sh
new file mode 100644
index 000000000000..17bdea831eaf
--- /dev/null
+++ b/tools/testing/selftests/dma-buf/reservation.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# Runs API tests for reservation_object using test-reservation kernel module
+
+if /sbin/modprobe -q test-reservation; then
+       /sbin/modprobe -q -r test-reservation
+       echo "dma-buf/reservation: ok"
+else
+       echo "dma-buf/reservation: [FAIL]"
+       exit 1
+fi
+
-- 
2.10.2


