[Intel-gfx] [PATCH v10] drm/i915: Squash repeated awaits on the same fence

Fri Apr 28 09:32:58 UTC 2017

On 28/04/2017 08:41, Chris Wilson wrote:
> Track the latest fence waited upon on each context, and only add a new
> asynchronous wait if the new fence is more recent than the recorded
> fence for that context. This requires us to filter out unordered
> timelines, which are noted by DMA_FENCE_NO_CONTEXT. However, in the
> absence of a universal identifier, we have to use our own
> i915->mm.unordered_timeline token.
>
> v2: Throw around the debug crutches
> v3: Inline the likely case of the pre-allocation cache being full.
> v4: Drop the pre-allocation support, we can lose the most recent fence
> in case of allocation failure -- it just means we may emit more awaits
> than strictly necessary but will not break.
> v5: Trim allocation size for leaf nodes, they only need an array of u32
> not pointers.
> v6: Create mock_timeline to tidy selftest writing
> v7: s/intel_timeline_sync_get/intel_timeline_sync_is_later/ (Tvrtko)
> v8: Prune the stale sync points when we idle.
> v9: Include a small benchmark in the kselftests
> v10: Separate the idr implementation into its own compartment.

FYI I am reading v11 and commenting here. Hopefully it works out. :)

>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/Makefile                      |   1 +
>  drivers/gpu/drm/i915/i915_gem.c                    |   1 +
>  drivers/gpu/drm/i915/i915_gem.h                    |   2 +
>  drivers/gpu/drm/i915/i915_gem_request.c            |   9 +
>  drivers/gpu/drm/i915/i915_gem_timeline.c           |  92 +++++-
>  drivers/gpu/drm/i915/i915_gem_timeline.h           |  36 ++
>  drivers/gpu/drm/i915/i915_syncmap.c                | 362 +++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_syncmap.h                |  39 +++
>  drivers/gpu/drm/i915/selftests/i915_gem_timeline.c | 257 +++++++++++++++
>  .../gpu/drm/i915/selftests/i915_mock_selftests.h   |   1 +
>  drivers/gpu/drm/i915/selftests/mock_timeline.c     |  45 +++
>  drivers/gpu/drm/i915/selftests/mock_timeline.h     |  33 ++
>  12 files changed, 860 insertions(+), 18 deletions(-)
>  create mode 100644 drivers/gpu/drm/i915/i915_syncmap.c
>  create mode 100644 drivers/gpu/drm/i915/i915_syncmap.h
>  create mode 100644 drivers/gpu/drm/i915/selftests/i915_gem_timeline.c
>  create mode 100644 drivers/gpu/drm/i915/selftests/mock_timeline.c
>  create mode 100644 drivers/gpu/drm/i915/selftests/mock_timeline.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 2cf04504e494..7b05fb802f4c 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -16,6 +16,7 @@ i915-y := i915_drv.o \
>  	  i915_params.o \
>  	  i915_pci.o \
>            i915_suspend.o \
> +	  i915_syncmap.o \
>  	  i915_sw_fence.o \
>  	  i915_sysfs.o \
>  	  intel_csr.o \
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c1fa3c103f38..f886ef492036 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3214,6 +3214,7 @@ i915_gem_idle_work_handler(struct work_struct *work)
>  		intel_engine_disarm_breadcrumbs(engine);
>  		i915_gem_batch_pool_fini(&engine->batch_pool);
>  	}
> +	i915_gem_timelines_mark_idle(dev_priv);
>
>  	GEM_BUG_ON(!dev_priv->gt.awake);
>  	dev_priv->gt.awake = false;
> diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
> index 5a49487368ca..ee54597465b6 100644
> --- a/drivers/gpu/drm/i915/i915_gem.h
> +++ b/drivers/gpu/drm/i915/i915_gem.h
> @@ -25,6 +25,8 @@
>  #ifndef __I915_GEM_H__
>  #define __I915_GEM_H__
>
> +#include <linux/bug.h>
> +
>  #ifdef CONFIG_DRM_I915_DEBUG_GEM
>  #define GEM_BUG_ON(expr) BUG_ON(expr)
>  #define GEM_WARN_ON(expr) WARN_ON(expr)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 5fa4e52ded06..807fc1b65dd1 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -772,6 +772,11 @@ i915_gem_request_await_dma_fence(struct drm_i915_gem_request *req,
>  		if (fence->context == req->fence.context)
>  			continue;
>
> +		/* Squash repeated waits to the same timelines */
> +		if (fence->context != req->i915->mm.unordered_timeline &&
> +		    intel_timeline_sync_is_later(req->timeline, fence))
> +			continue;
> +
>  		if (dma_fence_is_i915(fence))
>  			ret = i915_gem_request_await_request(req,
>  							     to_request(fence));
> @@ -781,6 +786,10 @@ i915_gem_request_await_dma_fence(struct drm_i915_gem_request *req,
>  							    GFP_KERNEL);
>  		if (ret < 0)
>  			return ret;
> +
> +		/* Record the latest fence used against each timeline */
> +		if (fence->context != req->i915->mm.unordered_timeline)
> +			intel_timeline_sync_set(req->timeline, fence);
>  	} while (--nchild);
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c
> index b596ca7ee058..a28a65db82e9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_gem_timeline.c
> @@ -24,6 +24,31 @@
>
>  #include "i915_drv.h"

#include "i915_syncmap.h"?

I think the over-reliance on i915_drv.h being an include all is hurting 
us in some cases, and even though it is probably very hard to untangle, 
perhaps it is worth making new source files cleaner in that respect.

>
> +static void __intel_timeline_init(struct intel_timeline *tl,
> +				  struct i915_gem_timeline *parent,
> +				  u64 context,
> +				  struct lock_class_key *lockclass,
> +				  const char *lockname)
> +{
> +	tl->fence_context = context;
> +	tl->common = parent;
> +#ifdef CONFIG_DEBUG_SPINLOCK
> +	__raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
> +#else
> +	spin_lock_init(&tl->lock);
> +#endif
> +	init_request_active(&tl->last_request, NULL);
> +	INIT_LIST_HEAD(&tl->requests);
> +	i915_syncmap_init(&tl->sync);
> +}
> +
> +static void __intel_timeline_fini(struct intel_timeline *tl)
> +{
> +	GEM_BUG_ON(!list_empty(&tl->requests));
> +
> +	i915_syncmap_free(&tl->sync);
> +}
> +
>  static int __i915_gem_timeline_init(struct drm_i915_private *i915,
>  				    struct i915_gem_timeline *timeline,
>  				    const char *name,
> @@ -35,6 +60,12 @@ static int __i915_gem_timeline_init(struct drm_i915_private *i915,
>
>  	lockdep_assert_held(&i915->drm.struct_mutex);
>
> +	/*
> +	 * Ideally we want a set of engines on a single leaf as we expect
> +	 * to mostly be tracking synchronisation between engines.
> +	 */
> +	BUILD_BUG_ON(KSYNCMAP < I915_NUM_ENGINES);

Maybe also add BUILD_BUG_ON(!is_power_of_2(KSYNCMAP)) just in case.

> +
>  	timeline->i915 = i915;
>  	timeline->name = kstrdup(name ?: "[kernel]", GFP_KERNEL);
>  	if (!timeline->name)
> @@ -44,19 +75,10 @@ static int __i915_gem_timeline_init(struct drm_i915_private *i915,
>
>  	/* Called during early_init before we know how many engines there are */
>  	fences = dma_fence_context_alloc(ARRAY_SIZE(timeline->engine));
> -	for (i = 0; i < ARRAY_SIZE(timeline->engine); i++) {
> -		struct intel_timeline *tl = &timeline->engine[i];
> -
> -		tl->fence_context = fences++;
> -		tl->common = timeline;
> -#ifdef CONFIG_DEBUG_SPINLOCK
> -		__raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
> -#else
> -		spin_lock_init(&tl->lock);
> -#endif
> -		init_request_active(&tl->last_request, NULL);
> -		INIT_LIST_HEAD(&tl->requests);
> -	}
> +	for (i = 0; i < ARRAY_SIZE(timeline->engine); i++)
> +		__intel_timeline_init(&timeline->engine[i],
> +				      timeline, fences++,
> +				      lockclass, lockname);
>
>  	return 0;
>  }
> @@ -81,18 +103,52 @@ int i915_gem_timeline_init__global(struct drm_i915_private *i915)
>  					&class, "&global_timeline->lock");
>  }
>
> +/**
> + * i915_gem_timelines_mark_idle -- called when the driver idles
> + * @i915 - the drm_i915_private device
> + *
> + * When the driver is completely idle, we know that all of our sync points
> + * have been signaled and our tracking is then entirely redundant. Any request
> + * to wait upon an older sync point will be completed instantly as we know
> + * the fence is signaled and therefore we will not even look them up in the
> + * sync point map.
> + */
> +void i915_gem_timelines_mark_idle(struct drm_i915_private *i915)
> +{
> +	struct i915_gem_timeline *timeline;
> +	int i;
> +
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +
> +	list_for_each_entry(timeline, &i915->gt.timelines, link) {
> +		for (i = 0; i < ARRAY_SIZE(timeline->engine); i++) {
> +			struct intel_timeline *tl = &timeline->engine[i];
> +
> +			/*
> +			 * All known fences are completed so we can scrap
> +			 * the current sync point tracking and start afresh,
> +			 * any attempt to wait upon a previous sync point
> +			 * will be skipped as the fence was signaled.
> +			 */
> +			i915_syncmap_free(&tl->sync);
> +		}
> +	}
> +}
> +
>  void i915_gem_timeline_fini(struct i915_gem_timeline *timeline)
>  {
>  	int i;
>
>  	lockdep_assert_held(&timeline->i915->drm.struct_mutex);
>
> -	for (i = 0; i < ARRAY_SIZE(timeline->engine); i++) {
> -		struct intel_timeline *tl = &timeline->engine[i];
> -
> -		GEM_BUG_ON(!list_empty(&tl->requests));
> -	}
> +	for (i = 0; i < ARRAY_SIZE(timeline->engine); i++)
> +		__intel_timeline_fini(&timeline->engine[i]);
>
>  	list_del(&timeline->link);
>  	kfree(timeline->name);
>  }
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftests/mock_timeline.c"
> +#include "selftests/i915_gem_timeline.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.h b/drivers/gpu/drm/i915/i915_gem_timeline.h
> index 6c53e14cab2a..86ade2890902 100644
> --- a/drivers/gpu/drm/i915/i915_gem_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_gem_timeline.h
> @@ -27,7 +27,9 @@
>
>  #include <linux/list.h>
>
> +#include "i915_utils.h"
>  #include "i915_gem_request.h"
> +#include "i915_syncmap.h"
>
>  struct i915_gem_timeline;
>
> @@ -55,6 +57,15 @@ struct intel_timeline {
>  	 * struct_mutex.
>  	 */
>  	struct i915_gem_active last_request;
> +
> +	/**
> +	 * We track the most recent seqno that we wait on in every context so
> +	 * that we only have to emit a new await and dependency on a more
> +	 * recent sync point. As the contexts may executed out-of-order, we
> +	 * have to track each individually and cannot not rely on an absolute
> +	 * global_seqno.
> +	 */
> +	struct i915_syncmap *sync;
>  	u32 sync_seqno[I915_NUM_ENGINES];
>
>  	struct i915_gem_timeline *common;
> @@ -73,6 +84,31 @@ int i915_gem_timeline_init(struct drm_i915_private *i915,
>  			   struct i915_gem_timeline *tl,
>  			   const char *name);
>  int i915_gem_timeline_init__global(struct drm_i915_private *i915);
> +void i915_gem_timelines_mark_idle(struct drm_i915_private *i915);
>  void i915_gem_timeline_fini(struct i915_gem_timeline *tl);
>
> +static inline int __intel_timeline_sync_set(struct intel_timeline *tl,
> +					    u64 context, u32 seqno)
> +{
> +	return i915_syncmap_set(&tl->sync, context, seqno);
> +}
> +
> +static inline int intel_timeline_sync_set(struct intel_timeline *tl,
> +					  const struct dma_fence *fence)
> +{
> +	return __intel_timeline_sync_set(tl, fence->context, fence->seqno);
> +}
> +
> +static inline bool __intel_timeline_sync_is_later(struct intel_timeline *tl,
> +						  u64 context, u32 seqno)
> +{
> +	return i915_syncmap_is_later(&tl->sync, context, seqno);
> +}
> +
> +static inline bool intel_timeline_sync_is_later(struct intel_timeline *tl,
> +						const struct dma_fence *fence)
> +{
> +	return __intel_timeline_sync_is_later(tl, fence->context, fence->seqno);
> +}
> +
>  #endif
> diff --git a/drivers/gpu/drm/i915/i915_syncmap.c b/drivers/gpu/drm/i915/i915_syncmap.c
> new file mode 100644
> index 000000000000..70762c3772a0
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_syncmap.c
> @@ -0,0 +1,362 @@
> +/*
> + * Copyright © 2017 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/slab.h>
> +
> +#include "i915_syncmap.h"
> +
> +#include "i915_gem.h" /* GEM_BUG_ON() */
> +
> +#define SHIFT ilog2(KSYNCMAP)
> +#define MASK (KSYNCMAP - 1)
> +
> +/*
> + * struct i915_syncmap is a layer of a radixtree that maps a u64 fence
> + * context id to the last u32 fence seqno waited upon from that context.
> + * Unlike lib/radixtree it uses a parent pointer that allows traversal back to
> + * the root. This allows us to access the whole tree via a single pointer
> + * to the most recently used layer. We expect fence contexts to be dense
> + * and most reuse to be on the same i915_gem_context but on neighbouring
> + * engines (i.e. on adjacent contexts) and reuse the same leaf, a very
> + * effective lookup cache. If the new lookup is not on the same leaf, we
> + * expect it to be on the neighbouring branch.
> + *
> + * A leaf holds an array of u32 seqno, and has height 0. The bitmap field
> + * allows us to store whether a particular seqno is valid (i.e. allows us
> + * to distinguish unset from 0).
> + *
> + * A branch holds an array of layer pointers, and has height > 0, and always
> + * has at least 2 layers (either branches or leaves) below it.
> + */
> +
> +struct i915_syncmap {
> +	u64 prefix;
> +	unsigned int height;
> +	unsigned int bitmap;
> +	struct i915_syncmap *parent;
> +	/*
> +	 * Following this header is an array of either seqno or child pointers:
> +	 * union {
> +	 *	u32 seqno[KSYNCMAP];
> +	 *	struct i915_syncmap *child[KSYNMAP];
> +	 * };
> +	 */
> +};
> +
> +/**
> + * i915_syncmap_init -- initialise the #i915_syncmap
> + * @root - pointer to the #i915_syncmap
> + */
> +void i915_syncmap_init(struct i915_syncmap **root)
> +{
> +	BUILD_BUG_ON(KSYNCMAP > BITS_PER_BYTE * sizeof((*root)->bitmap));
> +	*root = NULL;
> +}
> +
> +static inline u32 *__sync_seqno(struct i915_syncmap *p)
> +{
> +	GEM_BUG_ON(p->height);
> +	return (u32 *)(p + 1);
> +}
> +
> +static inline struct i915_syncmap **__sync_child(struct i915_syncmap *p)
> +{
> +	GEM_BUG_ON(!p->height);
> +	return (struct i915_syncmap **)(p + 1);
> +}
> +
> +static inline unsigned int __sync_idx(const struct i915_syncmap *p, u64 id)
> +{
> +	return (id >> p->height) & MASK;
> +}
> +
> +static inline u64 __sync_prefix(const struct i915_syncmap *p, u64 id)
> +{
> +	return id >> p->height >> SHIFT;
> +}
> +
> +static inline u64 __sync_leaf(const struct i915_syncmap *p, u64 id)
> +{
> +	GEM_BUG_ON(p->height);
> +	return id >> SHIFT;
> +}
> +
> +static inline bool seqno_later(u32 a, u32 b)
> +{
> +	return (s32)(a - b) >= 0;
> +}
> +
> +/**
> + * i915_syncmap_is_later -- compare against the last know sync point
> + * @root - pointer to the #i915_syncmap
> + * @id - the context id (other timeline) we are synchronising to
> + * @seqno - the sequence number along the other timeline
> + *
> + * If we have already synchronised this @root with another (@id) then we can
> + * omit any repeated or earlier synchronisation requests. If the two timelines
> + * are already coupled, we can also omit the dependency between the two as that
> + * is already known via the timeline.
> + *
> + * Returns true if the two timelines are already synchronised wrt to @seqno,
> + * false if not and the synchronisation must be emitted.
> + */
> +bool i915_syncmap_is_later(struct i915_syncmap **root, u64 id, u32 seqno)
> +{
> +	struct i915_syncmap *p;
> +	unsigned int idx;
> +
> +	p = *root;
> +	if (!p)
> +		return false;
> +
> +	if (likely(__sync_leaf(p, id) == p->prefix))
> +		goto found;

Are you sure likely is appropriate here?

> +
> +	/* First climb the tree back to a parent branch */
> +	do {
> +		p = p->parent;
> +		if (!p)
> +			return false;
> +
> +		if (__sync_prefix(p, id) == p->prefix)
> +			break;
> +	} while (1);
> +
> +	/* And then descend again until we find our leaf */
> +	do {
> +		if (!p->height)
> +			break;
> +
> +		p = __sync_child(p)[__sync_idx(p, id)];
> +		if (!p)
> +			return false;
> +
> +		if (__sync_prefix(p, id) != p->prefix)
> +			return false;
> +	} while (1);
> +
> +	*root = p;
> +found:
> +	idx = id & MASK;

Would:

	GEM_BUG_ON(p->height);
	idx = __sync_idx(p, id);

be correct (even if more verbose) here instead of idx = id & MASK?

> +	if (!(p->bitmap & BIT(idx)))
> +		return false;

I was thinking briefly whether seqno+bitmap get/set helpers would be 
helpful but I think there is no need. With the __sync_*_prefix algorithm 
is much more readable already.

> +
> +	return seqno_later(__sync_seqno(p)[idx], seqno);
> +}
> +
> +static struct i915_syncmap *
> +__sync_alloc_leaf(struct i915_syncmap *parent, u64 id)
> +{
> +	struct i915_syncmap *p;
> +
> +	p = kmalloc(sizeof(*p) + KSYNCMAP * sizeof(u32), GFP_KERNEL);
> +	if (unlikely(!p))
> +		return NULL;
> +
> +	p->parent = parent;
> +	p->height = 0;
> +	p->bitmap = 0;
> +	p->prefix = __sync_leaf(p, id);
> +	return p;
> +}
> +
> +static noinline int
> +__i915_syncmap_set(struct i915_syncmap **root, u64 id, u32 seqno)
> +{
> +	struct i915_syncmap *p = *root;
> +	unsigned int idx;
> +
> +	if (!p) {
> +		p = __sync_alloc_leaf(NULL, id);
> +		if (unlikely(!p))
> +			return -ENOMEM;
> +
> +		goto found;
> +	}
> +

GEM_BUG_ON(p->prefix == __sync_leaf_prefix(p, id)) ? Or maybe better try 
to handle it rather than expect caller will never do that? By handling 
it I mean not immediately strt climbing the tree just below but check 
for the condition first and goto found.

> +	/* Climb back up the tree until we find a common prefix */
> +	do {
> +		if (!p->parent)
> +			break;
> +
> +		p = p->parent;
> +
> +		if (__sync_prefix(p, id) == p->prefix)
> +			break;
> +	} while (1);
> +
> +	/*
> +	 * No shortcut, we have to descend the tree to find the right layer
> +	 * containing this fence.
> +	 *
> +	 * Each layer in the tree holds 16 (KSYNCMAP) pointers, either fences
> +	 * or lower layers. Leaf nodes (height = 0) contain the fences, all
> +	 * other nodes (height > 0) are internal layers that point to a lower
> +	 * node. Each internal layer has at least 2 descendents.
> +	 *
> +	 * Starting at the top, we check whether the current prefix matches. If
> +	 * it doesn't, we have gone passed our layer and need to insert a join
> +	 * into the tree, and a new leaf node as a descendent as well as the
> +	 * original layer.
> +	 *
> +	 * The matching prefix means we are still following the right branch
> +	 * of the tree. If it has height 0, we have found our leaf and just
> +	 * need to replace the fence slot with ourselves. If the height is
> +	 * not zero, our slot contains the next layer in the tree (unless
> +	 * it is empty, in which case we can add ourselves as a new leaf).
> +	 * As descend the tree the prefix grows (and height decreases).
> +	 */
> +	do {
> +		struct i915_syncmap *next;
> +
> +		if (__sync_prefix(p, id) != p->prefix) {
> +			unsigned int above;
> +
> +			/* insert a join above the current layer */
> +			next = kzalloc(sizeof(*next) + KSYNCMAP * sizeof(next),
> +				       GFP_KERNEL);

Next is above, right? Would common_parent be correct? Or 
lowest_common_parent? Possibly not the name because it is too long, but 
I'm trying to figure out if I got the fls and xor business right.

> +			if (unlikely(!next))
> +				return -ENOMEM;
> +
> +			above = fls64(__sync_prefix(p, id) ^ p->prefix);
> +			above = round_up(above, SHIFT);
> +			next->height = above + p->height;
> +			next->prefix = __sync_prefix(next, id);
> +
> +			if (p->parent)
> +				__sync_child(p->parent)[__sync_idx(p->parent, id)] = next;
> +			next->parent = p->parent;
> +
> +			idx = p->prefix >> (above - SHIFT) & MASK;
> +			__sync_child(next)[idx] = p;
> +			next->bitmap |= BIT(idx);
> +			p->parent = next;
> +
> +			/* ascend to the join */
> +			p = next;
> +		} else {
> +			if (!p->height)
> +				break;
> +		}
> +
> +		/* descend into the next layer */
> +		GEM_BUG_ON(!p->height);
> +		idx = __sync_idx(p, id);
> +		next = __sync_child(p)[idx];
> +		if (unlikely(!next)) {

Why is this one unlikely?

> +			next = __sync_alloc_leaf(p, id);
> +			if (unlikely(!next))
> +				return -ENOMEM;
> +
> +			__sync_child(p)[idx] = next;
> +			p->bitmap |= BIT(idx);
> +
> +			p = next;
> +			break;
> +		}
> +
> +		p = next;
> +	} while (1);
> +
> +found:
> +	GEM_BUG_ON(p->prefix != __sync_leaf(p, id));
> +	idx = id & MASK;
> +	__sync_seqno(p)[idx] = seqno;
> +	p->bitmap |= BIT(idx);

Actually __sync_set_seqno(p, id, seqno) might be useful here and in 
i915_syncmap_set below.

The below looks OK for the moment. Still getting to terms with the above 
loop. Postponing drawing the diagram.. :)
Regards,

Tvrtko

> +	*root = p;
> +	return 0;
> +}
> +
> +/**
> + * i915_syncmap_set -- mark the most recent syncpoint between contexts
> + * @root - pointer to the #i915_syncmap
> + * @id - the context id (other timeline) we have synchronised to
> + * @seqno - the sequence number along the other timeline
> + *
> + * When we synchronise this @root with another (@id), we also know that we have
> + * synchronized with all previous seqno along that timeline. If we then have
> + * a request to synchronise with the same seqno or older, we can omit it,
> + * see i915_syncmap_is_later()
> + *
> + * Returns 0 on success, or a negative error code.
> + */
> +int i915_syncmap_set(struct i915_syncmap **root, u64 id, u32 seqno)
> +{
> +	struct i915_syncmap *p = *root;
> +
> +	/*
> +	 * We expect to be called in sequence following a is_later(id), which
> +	 * should have preloaded the root for us.
> +	 */
> +	if (likely(p && __sync_leaf(p, id) == p->prefix)) {
> +		unsigned int idx = id & MASK;
> +
> +		__sync_seqno(p)[idx] = seqno;
> +		p->bitmap |= BIT(idx);
> +		return 0;
> +	}
> +
> +	return __i915_syncmap_set(root, id, seqno);
> +}
> +
> +static void __sync_free(struct i915_syncmap *p)
> +{
> +	if (p->height) {
> +		unsigned int i;
> +
> +		while ((i = ffs(p->bitmap))) {
> +			p->bitmap &= ~0u << i;
> +			__sync_free(__sync_child(p)[i - 1]);
> +		}
> +	}
> +
> +	kfree(p);
> +}
> +
> +/**
> + * i915_syncmap_free -- free all memory associated with the syncmap
> + * @root - pointer to the #i915_syncmap
> + *
> + * Either when the timeline is to be freed and we no longer need the sync
> + * point tracking, or when the fences are all known to be signaled and the
> + * sync point tracking is redundant, we can free the #i915_syncmap to recover
> + * its allocations.
> + *
> + * Will reinitialise the @root pointer so that the #i915_syncmap is ready for
> + * reuse.
> + */
> +void i915_syncmap_free(struct i915_syncmap **root)
> +{
> +	struct i915_syncmap *p;
> +
> +	p = *root;
> +	if (!p)
> +		return;
> +
> +	while (p->parent)
> +		p = p->parent;
> +
> +	__sync_free(p);
> +	*root = NULL;
> +}
> diff --git a/drivers/gpu/drm/i915/i915_syncmap.h b/drivers/gpu/drm/i915/i915_syncmap.h
> new file mode 100644
> index 000000000000..7ca827d812ae
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_syncmap.h
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright © 2017 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __I915_SYNCMAP_H__
> +#define __I915_SYNCMAP_H__
> +
> +#include <linux/types.h>
> +
> +struct i915_syncmap;
> +
> +void i915_syncmap_init(struct i915_syncmap **root);
> +bool i915_syncmap_is_later(struct i915_syncmap **root, u64 id, u32 seqno);
> +int i915_syncmap_set(struct i915_syncmap **root, u64 id, u32 seqno);
> +void i915_syncmap_free(struct i915_syncmap **root);
> +
> +#define KSYNCMAP 16
> +
> +#endif /* __I915_SYNCMAP_H__ */
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_timeline.c b/drivers/gpu/drm/i915/selftests/i915_gem_timeline.c
> new file mode 100644
> index 000000000000..2058e754c86d
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_timeline.c
> @@ -0,0 +1,257 @@
> +/*
> + * Copyright © 2017 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/random.h>
> +
> +#include "../i915_selftest.h"
> +#include "mock_gem_device.h"
> +#include "mock_timeline.h"
> +
> +static int igt_sync(void *arg)
> +{
> +	const struct {
> +		const char *name;
> +		u32 seqno;
> +		bool expected;
> +		bool set;
> +	} pass[] = {
> +		{ "unset", 0, false, false },
> +		{ "new", 0, false, true },
> +		{ "0a", 0, true, true },
> +		{ "1a", 1, false, true },
> +		{ "1b", 1, true, true },
> +		{ "0b", 0, true, false },
> +		{ "2a", 2, false, true },
> +		{ "4", 4, false, true },
> +		{ "INT_MAX", INT_MAX, false, true },
> +		{ "INT_MAX-1", INT_MAX-1, true, false },
> +		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
> +		{ "INT_MAX", INT_MAX, true, false },
> +		{ "UINT_MAX", UINT_MAX, false, true },
> +		{ "wrap", 0, false, true },
> +		{ "unwrap", UINT_MAX, true, false },
> +		{},
> +	}, *p;
> +	struct intel_timeline *tl;
> +	int order, offset;
> +	int ret;
> +
> +	tl = mock_timeline(0);
> +	if (!tl)
> +		return -ENOMEM;
> +
> +	for (p = pass; p->name; p++) {
> +		for (order = 1; order < 64; order++) {
> +			for (offset = -1; offset <= (order > 1); offset++) {
> +				u64 ctx = BIT_ULL(order) + offset;
> +
> +				if (__intel_timeline_sync_is_later
> +				    (tl, ctx, p->seqno) != p->expected) {
> +					pr_err("1: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
> +					       p->name, ctx, p->seqno, yesno(p->expected));
> +					ret = -EINVAL;
> +					goto out;
> +				}
> +
> +				if (p->set) {
> +					ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
> +					if (ret)
> +						goto out;
> +				}
> +			}
> +		}
> +	}
> +	mock_timeline_destroy(tl);
> +
> +	tl = mock_timeline(0);
> +	if (!tl)
> +		return -ENOMEM;
> +
> +	for (order = 1; order < 64; order++) {
> +		for (offset = -1; offset <= (order > 1); offset++) {
> +			u64 ctx = BIT_ULL(order) + offset;
> +
> +			for (p = pass; p->name; p++) {
> +				if (__intel_timeline_sync_is_later
> +				    (tl, ctx, p->seqno) != p->expected) {
> +					pr_err("2: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
> +					       p->name, ctx, p->seqno, yesno(p->expected));
> +					ret = -EINVAL;
> +					goto out;
> +				}
> +
> +				if (p->set) {
> +					ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
> +					if (ret)
> +						goto out;
> +				}
> +			}
> +		}
> +	}
> +
> +out:
> +	mock_timeline_destroy(tl);
> +	return ret;
> +}
> +
> +static u64 prandom_u64_state(struct rnd_state *rnd)
> +{
> +	u64 x;
> +
> +	x = prandom_u32_state(rnd);
> +	x <<= 32;
> +	x |= prandom_u32_state(rnd);
> +
> +	return x;
> +}
> +
> +static unsigned int random_engine(struct rnd_state *rnd)
> +{
> +	return ((u64)prandom_u32_state(rnd) * I915_NUM_ENGINES) >> 32;
> +}
> +
> +static int bench_sync(void *arg)
> +{
> +	struct rnd_state prng;
> +	struct intel_timeline *tl;
> +	unsigned long end_time, count;
> +	ktime_t kt;
> +
> +	tl = mock_timeline(0);
> +	if (!tl)
> +		return -ENOMEM;
> +
> +	prandom_seed_state(&prng, i915_selftest.random_seed);
> +	count = 0;
> +	kt = ktime_get();
> +	end_time = jiffies + HZ/10;
> +	do {
> +		u64 id = prandom_u64_state(&prng);
> +
> +		__intel_timeline_sync_set(tl, id, 0);
> +		count++;
> +	} while (!time_after(jiffies, end_time));
> +	kt = ktime_sub(ktime_get(), kt);
> +	pr_info("%s: %lu random insertions, %lluns/insert\n",
> +		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
> +
> +	prandom_seed_state(&prng, i915_selftest.random_seed);
> +	end_time = count;
> +	kt = ktime_get();
> +	while (end_time--) {
> +		u64 id = prandom_u64_state(&prng);
> +
> +		if (!__intel_timeline_sync_is_later(tl, id, 0)) {
> +			mock_timeline_destroy(tl);
> +			pr_err("Lookup of %llu failed\n", id);
> +			return -EINVAL;
> +		}
> +	}
> +	kt = ktime_sub(ktime_get(), kt);
> +	pr_info("%s: %lu random lookups, %lluns/lookup\n",
> +		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
> +
> +	mock_timeline_destroy(tl);
> +
> +	tl = mock_timeline(0);
> +	if (!tl)
> +		return -ENOMEM;
> +
> +	count = 0;
> +	kt = ktime_get();
> +	end_time = jiffies + HZ/10;
> +	do {
> +		__intel_timeline_sync_set(tl, count++, 0);
> +	} while (!time_after(jiffies, end_time));
> +	kt = ktime_sub(ktime_get(), kt);
> +	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
> +		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
> +
> +	end_time = count;
> +	kt = ktime_get();
> +	while (end_time--) {
> +		if (!__intel_timeline_sync_is_later(tl, end_time, 0)) {
> +			pr_err("Lookup of %lu failed\n", end_time);
> +			mock_timeline_destroy(tl);
> +			return -EINVAL;
> +		}
> +	}
> +	kt = ktime_sub(ktime_get(), kt);
> +	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
> +		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
> +
> +	mock_timeline_destroy(tl);
> +
> +	tl = mock_timeline(0);
> +	if (!tl)
> +		return -ENOMEM;
> +
> +	prandom_seed_state(&prng, i915_selftest.random_seed);
> +	count = 0;
> +	kt = ktime_get();
> +	end_time = jiffies + HZ/10;
> +	do {
> +		u32 id = random_engine(&prng);
> +		u32 seqno = prandom_u32_state(&prng);
> +
> +		if (!__intel_timeline_sync_is_later(tl, id, seqno))
> +			__intel_timeline_sync_set(tl, id, seqno);
> +
> +		count++;
> +	} while (!time_after(jiffies, end_time));
> +	kt = ktime_sub(ktime_get(), kt);
> +	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
> +		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
> +	mock_timeline_destroy(tl);
> +
> +	tl = mock_timeline(0);
> +	if (!tl)
> +		return -ENOMEM;
> +
> +	count = 0;
> +	kt = ktime_get();
> +	end_time = jiffies + HZ/10;
> +	do {
> +		if (!__intel_timeline_sync_is_later(tl, count & 7, count >> 4))
> +			__intel_timeline_sync_set(tl, count & 7, count >> 4);
> +
> +		count++;
> +	} while (!time_after(jiffies, end_time));
> +	kt = ktime_sub(ktime_get(), kt);
> +	pr_info("%s: %lu cyclic insert/lookups, %lluns/op\n",
> +		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
> +	mock_timeline_destroy(tl);
> +
> +	return 0;
> +}
> +
> +int i915_gem_timeline_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(igt_sync),
> +		SUBTEST(bench_sync),
> +	};
> +
> +	return i915_subtests(tests, NULL);
> +}
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index be9a9ebf5692..8d0f50c25df8 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -12,6 +12,7 @@ selftest(sanitycheck, i915_mock_sanitycheck) /* keep first (igt selfcheck) */
>  selftest(scatterlist, scatterlist_mock_selftests)
>  selftest(uncore, intel_uncore_mock_selftests)
>  selftest(breadcrumbs, intel_breadcrumbs_mock_selftests)
> +selftest(timelines, i915_gem_timeline_mock_selftests)
>  selftest(requests, i915_gem_request_mock_selftests)
>  selftest(objects, i915_gem_object_mock_selftests)
>  selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.c b/drivers/gpu/drm/i915/selftests/mock_timeline.c
> new file mode 100644
> index 000000000000..47b1f47c5812
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/selftests/mock_timeline.c
> @@ -0,0 +1,45 @@
> +/*
> + * Copyright © 2017 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include "mock_timeline.h"
> +
> +struct intel_timeline *mock_timeline(u64 context)
> +{
> +	static struct lock_class_key class;
> +	struct intel_timeline *tl;
> +
> +	tl = kzalloc(sizeof(*tl), GFP_KERNEL);
> +	if (!tl)
> +		return NULL;
> +
> +	__intel_timeline_init(tl, NULL, context, &class, "mock");
> +
> +	return tl;
> +}
> +
> +void mock_timeline_destroy(struct intel_timeline *tl)
> +{
> +	__intel_timeline_fini(tl);
> +	kfree(tl);
> +}
> diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.h b/drivers/gpu/drm/i915/selftests/mock_timeline.h
> new file mode 100644
> index 000000000000..c27ff4639b8b
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/selftests/mock_timeline.h
> @@ -0,0 +1,33 @@
> +/*
> + * Copyright © 2017 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __MOCK_TIMELINE__
> +#define __MOCK_TIMELINE__
> +
> +#include "../i915_gem_timeline.h"
> +
> +struct intel_timeline *mock_timeline(u64 context);
> +void mock_timeline_destroy(struct intel_timeline *tl);
> +
> +#endif /* !__MOCK_TIMELINE__ */
>