[Intel-gfx] [PATCH 21/22] drm/i915: Enable lockless lookup of request tracking via RCU
Daniel Vetter
daniel at ffwll.ch
Thu Jul 28 10:23:40 UTC 2016
On Wed, Jul 27, 2016 at 12:14:59PM +0100, Chris Wilson wrote:
> If we enable RCU for the requests (providing a grace period where we can
> inspect a "dead" request before it is freed), we can allow callers to
> carefully perform lockless lookup of an active request.
>
> However, by enabling deferred freeing of requests, we can potentially
> hog a lot of memory when dealing with tens of thousands of requests per
> second - with a quick insertion of a synchronize_rcu() inside our
> shrinker callback, that issue disappears.
>
> v2: Currently, it is our responsibility to handle reclaim i.e. to avoid
> hogging memory with the delayed slab frees. At the moment, we wait for a
> grace period in the shrinker, and block for all RCU callbacks on oom.
> Suggested alternatives focus on flushing our RCU callback when we have a
> certain number of outstanding request frees, and blocking on that flush
> after a second high watermark. (So rather than wait for the system to
> run out of memory, we stop issuing requests - both are nondeterministic.)
>
> Paul E. McKenney wrote:
>
> Another approach is synchronize_rcu() after some largish number of
> requests. The advantage of this approach is that it throttles the
> production of callbacks at the source. The corresponding disadvantage
> is that it slows things up.
>
> Another approach is to use call_rcu(), but if the previous call_rcu()
> is still in flight, block waiting for it. Yet another approach is
> the get_state_synchronize_rcu() / cond_synchronize_rcu() pair. The
> idea is to do something like this:
>
> cond_synchronize_rcu(cookie);
> cookie = get_state_synchronize_rcu();
>
> You would of course do an initial get_state_synchronize_rcu() to
> get things going. This would not block unless there was less than
> one grace period's worth of time between invocations. But this
> assumes a busy system, where there is almost always a grace period
> in flight. But you can make that happen as follows:
>
> cond_synchronize_rcu(cookie);
> cookie = get_state_synchronize_rcu();
> call_rcu(&my_rcu_head, noop_function);
>
> Note that you need additional code to make sure that the old callback
> has completed before doing a new one. Setting and clearing a flag
> with appropriate memory ordering control suffices (e.g,. smp_load_acquire()
> and smp_store_release()).
>
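For reference (my sketch, not part of the patch): the cookie-plus-flag
throttle Paul describes could look roughly like the following. All names
below are invented for illustration, callers are assumed to be serialised
(e.g. under struct_mutex), and the cookie would be initialised with an
initial get_state_synchronize_rcu() as Paul notes:

	struct request_free_throttle {
		unsigned long cookie;	/* from get_state_synchronize_rcu() */
		struct rcu_head rcu;
		int rcu_in_flight;	/* cleared by the noop callback */
	};

	static void throttle_rcu_done(struct rcu_head *rcu)
	{
		struct request_free_throttle *t =
			container_of(rcu, struct request_free_throttle, rcu);

		/* Pairs with the smp_load_acquire() in request_free_throttle(). */
		smp_store_release(&t->rcu_in_flight, 0);
	}

	/* Call this after every largish batch of request frees. */
	static void request_free_throttle(struct request_free_throttle *t)
	{
		/* Block only if less than one grace period has elapsed since
		 * the previous batch; on a busy system this is usually a no-op.
		 */
		cond_synchronize_rcu(t->cookie);
		t->cookie = get_state_synchronize_rcu();

		/* Keep at most one noop callback in flight so that a grace
		 * period is almost always in progress.
		 */
		if (smp_load_acquire(&t->rcu_in_flight))
			return;
		t->rcu_in_flight = 1;
		call_rcu(&t->rcu, throttle_rcu_done);
	}

The approach this patch takes instead (synchronize_rcu_expedited() in the
shrinker, rcu_barrier() on oom) only pays the cost once we are actually
under memory pressure.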
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
> Cc: "Goel, Akash" <akash.goel at intel.com>
> Cc: Josh Triplett <josh at joshtriplett.org>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 7 +-
> drivers/gpu/drm/i915/i915_gem_request.c | 2 +-
> drivers/gpu/drm/i915/i915_gem_request.h | 114 +++++++++++++++++++++++++++++--
> drivers/gpu/drm/i915/i915_gem_shrinker.c | 15 ++--
> 4 files changed, 126 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 54d8a3863d11..0c546f8099d9 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4421,7 +4421,9 @@ i915_gem_load_init(struct drm_device *dev)
> dev_priv->requests =
> kmem_cache_create("i915_gem_request",
> sizeof(struct drm_i915_gem_request), 0,
> - SLAB_HWCACHE_ALIGN,
> + SLAB_HWCACHE_ALIGN |
> + SLAB_RECLAIM_ACCOUNT |
> + SLAB_DESTROY_BY_RCU,
> NULL);
>
> INIT_LIST_HEAD(&dev_priv->context_list);
> @@ -4457,6 +4459,9 @@ void i915_gem_load_cleanup(struct drm_device *dev)
> kmem_cache_destroy(dev_priv->requests);
> kmem_cache_destroy(dev_priv->vmas);
> kmem_cache_destroy(dev_priv->objects);
> +
> + /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
> + rcu_barrier();
> }
>
> int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 3395c955a532..bcc1369c0693 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -190,7 +190,7 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
> prefetchw(next);
>
> INIT_LIST_HEAD(&active->link);
> - active->__request = NULL;
> + RCU_INIT_POINTER(active->__request, NULL);
>
> active->retire(active, request);
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 2eec0cac1e9f..bb03f4440b0f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -183,6 +183,12 @@ i915_gem_request_get(struct drm_i915_gem_request *req)
> return to_request(fence_get(&req->fence));
> }
>
> +static inline struct drm_i915_gem_request *
> +i915_gem_request_get_rcu(struct drm_i915_gem_request *req)
> +{
> + return to_request(fence_get_rcu(&req->fence));
> +}
> +
> static inline void
> i915_gem_request_put(struct drm_i915_gem_request *req)
> {
> @@ -286,7 +292,7 @@ typedef void (*i915_gem_retire_fn)(struct i915_gem_active *,
> struct drm_i915_gem_request *);
>
> struct i915_gem_active {
> - struct drm_i915_gem_request *__request;
> + struct drm_i915_gem_request __rcu *__request;
> struct list_head link;
> i915_gem_retire_fn retire;
> };
> @@ -323,13 +329,19 @@ i915_gem_active_set(struct i915_gem_active *active,
> struct drm_i915_gem_request *request)
> {
> list_move(&active->link, &request->active_list);
> - active->__request = request;
> + rcu_assign_pointer(active->__request, request);
> }
>
> static inline struct drm_i915_gem_request *
> __i915_gem_active_peek(const struct i915_gem_active *active)
> {
> - return active->__request;
> + /* Inside the error capture (running with the driver in an unknown
> + * state), we want to bend the rules slightly (a lot).
> + *
> + * Work is in progress to make it safer, in the meantime this keeps
> + * the known issue from spamming the logs.
> + */
> + return rcu_dereference_protected(active->__request, 1);
> }
>
> /**
> @@ -345,7 +357,29 @@ i915_gem_active_peek(const struct i915_gem_active *active, struct mutex *mutex)
> {
> struct drm_i915_gem_request *request;
>
> - request = active->__request;
> + request = rcu_dereference_protected(active->__request,
> + lockdep_is_held(mutex));
> + if (!request || i915_gem_request_completed(request))
> + return NULL;
> +
> + return request;
> +}
> +
> +/**
> + * i915_gem_active_peek_rcu - report the active request being monitored
> + * @active - the active tracker
> + *
> + * i915_gem_active_peek_rcu() returns the current request being tracked if
> + * still active, or NULL. It does not obtain a reference on the request
> + * for the caller, and inspection of the request is only valid under
> + * the RCU lock.
> + */
> +static inline struct drm_i915_gem_request *
> +i915_gem_active_peek_rcu(const struct i915_gem_active *active)
> +{
> + struct drm_i915_gem_request *request;
> +
> + request = rcu_dereference(active->__request);
> if (!request || i915_gem_request_completed(request))
> return NULL;
>
> @@ -366,6 +400,72 @@ i915_gem_active_get(const struct i915_gem_active *active, struct mutex *mutex)
> }
>
> /**
> + * i915_gem_active_get_rcu - return a reference to the active request
> + * @active - the active tracker
> + *
> + * i915_gem_active_get() returns a reference to the active request, or NULL
> + * if the active tracker is idle. The caller must hold the RCU read lock.
> + */
> +static inline struct drm_i915_gem_request *
> +i915_gem_active_get_rcu(const struct i915_gem_active *active)
> +{
> + /* Performing a lockless retrieval of the active request is super
> + * tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing
> + * slab of request objects will not be freed whilst we hold the
> + * RCU read lock. It does not guarantee that the request itself
> + * will not be freed and then *reused*. Viz,
> + *
> + * Thread A Thread B
> + *
> + * req = active.request
> + * retire(req) -> free(req);
> + * (req is now first on the slab freelist)
> + * active.request = NULL
> + *
> + * req = new submission on a new object
> + * ref(req)
> + *
> + * To prevent the request from being reused whilst the caller
> + * uses it, we take a reference like normal. Whilst acquiring
> + * the reference we check that it is not in a destroyed state
> + * (refcnt == 0). That prevents the request being reallocated
> + * whilst the caller holds on to it. To check that the request
> + * was not reallocated as we acquired the reference we have to
> + * check that our request remains the active request across
> + * the lookup, in the same manner as a seqlock. The visibility
> + * of the pointer versus the reference counting is controlled
> + * by using RCU barriers (rcu_dereference and rcu_assign_pointer).
> + *
> + * In the middle of all that, we inspect whether the request is
> + * complete. Retiring is lazy so the request may be completed long
> + * before the active tracker is updated. Querying whether the
> + * request is complete is far cheaper (as it involves no locked
> + * instructions setting cachelines to exclusive) than acquiring
> + * the reference, so we do it first. The RCU read lock ensures the
> + * pointer dereference is valid, but does not ensure that the
> + * seqno nor HWS is the right one! However, if the request was
> + * reallocated, that means the active tracker's request was complete.
> + * If the new request is also complete, then both are and we can
> + * just report the active tracker is idle. If the new request is
> + * incomplete, then we acquire a reference on it and check that
> + * it remained the active request.
> + */
> + do {
> + struct drm_i915_gem_request *request;
> +
> + request = rcu_dereference(active->__request);
> + if (!request || i915_gem_request_completed(request))
> + return NULL;
> +
> + request = i915_gem_request_get_rcu(request);
I think we have a race here still: the issue is that kref_get_unless_zero()
is an unordered atomic, and rcu_dereference() is only an
smp_read_barrier_depends(), which doesn't prevent the re-read of
active->__request from happening before the atomic_add_unless().

Well, at least until I opened memory-barriers.txt and learned that
atomic_add_unless() is a full smp_mb() on both sides on success. That's a
bit too tricky for my taste; what about the following comment:
/* When request_get_rcu() succeeds the underlying
 * atomic_add_unless() has a full smp_mb() on both sides.
 * This ensures that the rcu_dereference() below can't be
 * reordered before the refcount increase has happened,
 * which prevents the request from being reused.
 */
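To spell that out against the loop body (the code is unchanged from the
hunk above, only the annotations are mine):

	request = rcu_dereference(active->__request);
	if (!request || i915_gem_request_completed(request))
		return NULL;

	/* (1) On success this is atomic_add_unless() on the fence
	 * refcount, i.e. a full smp_mb() on both sides.
	 */
	request = i915_gem_request_get_rcu(request);

	/* (2) Thanks to (1) this re-read cannot be hoisted above the
	 * refcount increase, so a request that was recycled in between
	 * is caught here, and we drop the reference and retry.
	 */
	if (!request || request == rcu_dereference(active->__request))
		return request;

	i915_gem_request_put(request);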
I couldn't poke any other holes into this, and we're reusing the fence RCU
functions where appropriate. With the comment added:
Reviewed-by: Daniel Vetter <daniel.vetter at ffwll.ch>
> + if (!request || request == rcu_dereference(active->__request))
> + return request;
> +
> + i915_gem_request_put(request);
> + } while (1);
> +}
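As a caller-side illustration (my sketch, not code from this series;
'active' stands for whichever i915_gem_active tracker the caller is
watching), the lockless lookup then reads roughly:

	struct drm_i915_gem_request *rq;
	bool busy = false;

	rcu_read_lock();
	rq = i915_gem_active_get_rcu(active);
	rcu_read_unlock();

	if (rq) {
		/* The reference prevents rq from being recycled, so it can
		 * be inspected (or waited upon) outside the RCU read lock.
		 */
		busy = !i915_gem_request_completed(rq);
		i915_gem_request_put(rq);
	}

i.e. the RCU read side only has to cover the pointer chase and the
reference acquisition; once the reference is held the request is stable.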
> +
> +/**
> * __i915_gem_active_is_busy - report whether the active tracker is assigned
> * @active - the active tracker
> *
> @@ -433,7 +533,8 @@ i915_gem_active_retire(struct i915_gem_active *active,
> struct drm_i915_gem_request *request;
> int ret;
>
> - request = active->__request;
> + request = rcu_dereference_protected(active->__request,
> + lockdep_is_held(mutex));
> if (!request)
> return 0;
>
> @@ -442,7 +543,8 @@ i915_gem_active_retire(struct i915_gem_active *active,
> return ret;
>
> list_del_init(&active->link);
> - active->__request = NULL;
> + RCU_INIT_POINTER(active->__request, NULL);
> +
> active->retire(active, request);
>
> return 0;
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index 33f8dcb9b8c4..a1a805fcdffa 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -191,6 +191,8 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
> intel_runtime_pm_put(dev_priv);
>
> i915_gem_retire_requests(dev_priv);
> + /* expedite the RCU grace period to free some request slabs */
> + synchronize_rcu_expedited();
>
> return count;
> }
> @@ -211,10 +213,15 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
> */
> unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv)
> {
> - return i915_gem_shrink(dev_priv, -1UL,
> - I915_SHRINK_BOUND |
> - I915_SHRINK_UNBOUND |
> - I915_SHRINK_ACTIVE);
> + unsigned long freed;
> +
> + freed = i915_gem_shrink(dev_priv, -1UL,
> + I915_SHRINK_BOUND |
> + I915_SHRINK_UNBOUND |
> + I915_SHRINK_ACTIVE);
> + rcu_barrier(); /* wait until our RCU delayed slab frees are completed */
> +
> + return freed;
> }
>
> static bool i915_gem_shrinker_lock(struct drm_device *dev, bool *unlock)
> --
> 2.8.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch