[Intel-gfx] [PATCH 1/6] drm/i915/gt: Protect signaler walk with RCU
Matthew Auld
matthew.auld at intel.com
Thu Feb 20 13:16:13 UTC 2020
On 20/02/2020 12:52, Chris Wilson wrote:
> Quoting Matthew Auld (2020-02-20 12:47:28)
>> On 20/02/2020 07:50, Chris Wilson wrote:
>>> While we know that the waiters cannot disappear as we walk our list
>>> (only that they might be added), the same cannot be said for our
>>> signalers, as they may be completed by the HW and retired while we
>>> process this request. Ergo we need to use RCU to protect the list
>>> iteration and remember to mark the removals with list_del_rcu().
>>>
>>> v2: Mark the deps as safe-for-rcu
>>>
>>> Fixes: 793c22617367 ("drm/i915/gt: Protect execlists_hold/unhold from new waiters")
>>> Fixes: 32ff621fd744 ("drm/i915/gt: Allow temporary suspension of inflight requests")
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
>>> Cc: Matthew Auld <matthew.auld at intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/gt/intel_lrc.c   | 16 ++++++++++------
>>>  drivers/gpu/drm/i915/i915_scheduler.c |  7 ++++---
>>>  2 files changed, 14 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> index ba31cbe8c68e..47561dc29304 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> @@ -1668,9 +1668,9 @@ last_active(const struct intel_engine_execlists *execlists)
>>>                                       wait_link)
>>>
>>>  #define for_each_signaler(p__, rq__) \
>>> -        list_for_each_entry_lockless(p__, \
>>> -                                     &(rq__)->sched.signalers_list, \
>>> -                                     signal_link)
>>> +        list_for_each_entry_rcu(p__, \
>>> +                                &(rq__)->sched.signalers_list, \
>>> +                                signal_link)
>>>
>>>  static void defer_request(struct i915_request *rq, struct list_head * const pl)
>>>  {
>>> @@ -2533,11 +2533,13 @@ static bool execlists_hold(struct intel_engine_cs *engine,
>>>  static bool hold_request(const struct i915_request *rq)
>>>  {
>>>          struct i915_dependency *p;
>>> +        bool result = false;
>>>
>>>          /*
>>>           * If one of our ancestors is on hold, we must also be on hold,
>>>           * otherwise we will bypass it and execute before it.
>>>           */
>>> +        rcu_read_lock();
>>>          for_each_signaler(p, rq) {
>>>                  const struct i915_request *s =
>>>                          container_of(p->signaler, typeof(*s), sched);
>>> @@ -2545,11 +2547,13 @@ static bool hold_request(const struct i915_request *rq)
>>>                  if (s->engine != rq->engine)
>>>                          continue;
>>>
>>> -                if (i915_request_on_hold(s))
>>> -                        return true;
>>> +                result = i915_request_on_hold(s);
>>> +                if (result)
>>> +                        break;
>>>          }
>>> +        rcu_read_unlock();
>>>
>>> -        return false;
>>> +        return result;
>>>  }
>>>
>>>  static void __execlists_unhold(struct i915_request *rq)
>>> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
>>> index e19a37a83397..59f70b674665 100644
>>> --- a/drivers/gpu/drm/i915/i915_scheduler.c
>>> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
>>> @@ -486,7 +486,7 @@ void i915_sched_node_fini(struct i915_sched_node *node)
>>>          list_for_each_entry_safe(dep, tmp, &node->signalers_list, signal_link) {
>>>                  GEM_BUG_ON(!list_empty(&dep->dfs_link));
>>>
>>> -                list_del(&dep->wait_link);
>>> +                list_del_rcu(&dep->wait_link);
>>>                  if (dep->flags & I915_DEPENDENCY_ALLOC)
>>>                          i915_dependency_free(dep);
>>>          }
>>> @@ -497,7 +497,7 @@ void i915_sched_node_fini(struct i915_sched_node *node)
>>>                  GEM_BUG_ON(dep->signaler != node);
>>>                  GEM_BUG_ON(!list_empty(&dep->dfs_link));
>>>
>>> -                list_del(&dep->signal_link);
>>> +                list_del_rcu(&dep->signal_link);
>>>                  if (dep->flags & I915_DEPENDENCY_ALLOC)
>>>                          i915_dependency_free(dep);
>>>          }
>>> @@ -526,7 +526,8 @@ static struct i915_global_scheduler global = { {
>>>  int __init i915_global_scheduler_init(void)
>>>  {
>>>          global.slab_dependencies = KMEM_CACHE(i915_dependency,
>>> -                                              SLAB_HWCACHE_ALIGN);
>>> +                                              SLAB_HWCACHE_ALIGN |
>>> +                                              SLAB_TYPESAFE_BY_RCU);
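As a crib for anyone following along, the patch sets up the standard
RCU list pattern, roughly as in the sketch below. This is a minimal
illustration, not the i915 code: the struct, list and lock names are
made up, and it defers the free with kfree_rcu(), whereas the patch
instead makes the slab type-stable with SLAB_TYPESAFE_BY_RCU (on which
see below).

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct dep {                            /* illustrative, not i915_dependency */
        struct list_head link;
        bool on_hold;
        struct rcu_head rcu;
};

static LIST_HEAD(signalers);            /* stand-in for sched.signalers_list */
static DEFINE_SPINLOCK(dep_lock);       /* stand-in for the writer's lock */

/*
 * Reader side: no lock, just rcu_read_lock(). Every entry we can
 * still reach stays valid memory while we look at it, even if the
 * writer unlinks it concurrently. Note the unlock happens only after
 * the loop, hence a break rather than an early return.
 */
static bool any_signaler_on_hold(void)
{
        struct dep *d;
        bool result = false;

        rcu_read_lock();
        list_for_each_entry_rcu(d, &signalers, link) {
                if (READ_ONCE(d->on_hold)) {
                        result = true;
                        break;
                }
        }
        rcu_read_unlock();

        return result;
}

/*
 * Writer side: unlink with list_del_rcu() so a concurrent reader
 * standing on this entry can still follow its ->next pointer, then
 * defer the free until all current readers are done.
 */
static void remove_dep(struct dep *d)
{
        spin_lock(&dep_lock);
        list_del_rcu(&d->link);
        spin_unlock(&dep_lock);

        kfree_rcu(d, rcu);
}

This is also why hold_request() above trades its early returns for a
break: every exit path has to pass through rcu_read_unlock().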
>>
>> So the claim is that we should be fine if the node is re-used and then
>> re-initialised, even though there might exist a minuscule window in
>> which hold_request() could still see the old entry?
>
> Yes. That is my claim. The saving grace here is that the on-hold
> transitions must all go through engine->active.lock, which we are
> already holding. Ergo hold_request() is safe.
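Right, and to spell out why that window is tolerable:
SLAB_TYPESAFE_BY_RCU does not delay the kfree at all. It only
guarantees the memory keeps its type for at least a grace period, so a
reader inside rcu_read_lock() that dereferences a stale pointer is
touching a live i915_dependency, just possibly a recycled one. The
usual contract, roughly as described in the slab documentation, is to
revalidate the object under a per-object lock or reference. A minimal
sketch with made-up names (struct obj, slot and lookup() are all
hypothetical):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct obj {
        spinlock_t lock;
        int key;
};

static struct kmem_cache *obj_cache;
static struct obj __rcu *slot;          /* a one-entry "table" */

static void obj_ctor(void *ptr)
{
        struct obj *o = ptr;

        /*
         * With SLAB_TYPESAFE_BY_RCU the lock is initialised once per
         * object, when the backing slab is created, never on reuse:
         * a reader may be spinning on it at the very moment the
         * object is freed and handed out again.
         */
        spin_lock_init(&o->lock);
}

static int __init obj_cache_init(void)
{
        obj_cache = kmem_cache_create("obj", sizeof(struct obj), 0,
                                      SLAB_TYPESAFE_BY_RCU, obj_ctor);
        return obj_cache ? 0 : -ENOMEM;
}

static struct obj *lookup(int key)
{
        struct obj *o = rcu_dereference(slot);

        return o && o->key == key ? o : NULL;
}

static struct obj *lookup_validated(int key)
{
        struct obj *o;

        rcu_read_lock();
        o = lookup(key);
        if (o) {
                spin_lock(&o->lock);
                if (o->key != key) {
                        /* Recycled under us: not the object we found. */
                        spin_unlock(&o->lock);
                        o = NULL;
                }
        }
        rcu_read_unlock();

        /*
         * A real user must also make the freeing path take this lock
         * (or hold a reference) so a validated object cannot be freed
         * while we still hold it.
         */
        return o;                       /* validated and locked, or NULL */
}

hold_request() gets away without per-object revalidation because the
only state it acts on, whether a request on this engine is on hold,
can only change under engine->active.lock, which its caller already
holds; a recycled entry cannot feed it a transition it would otherwise
have missed.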
Reviewed-by: Matthew Auld <matthew.auld at intel.com>
> -Chris
>