[Intel-gfx] [PATCH 2/2] drm/i915: Use rcu instead of stop_machine in set_wedged
Daniel Vetter
daniel.vetter at ffwll.ch
Tue Oct 10 14:09:06 UTC 2017
On Tue, Oct 10, 2017 at 3:39 PM, Chris Wilson <chris at chris-wilson.co.uk> wrote:
> Style nits:
>
> Quoting Daniel Vetter (2017-10-09 17:44:01)
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 82a10036fb38..8d7d8d1f78db 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -3057,49 +3057,71 @@ static void nop_submit_request(struct drm_i915_gem_request *request)
>>
>> spin_lock_irqsave(&request->engine->timeline->lock, flags);
>> __i915_gem_request_submit(request);
>> - intel_engine_init_global_seqno(request->engine, request->global_seqno);
>> spin_unlock_irqrestore(&request->engine->timeline->lock, flags);
>
> This reduced to i915_gem_request_submit().
>
> Hah, you can just engine->submit_request = i915_gem_request_submit, and
> keep the nop_submit_request for the second phase! That may make the
> diff neater?
We still need to call dma_fence_set_error, and we need to start doing
that before we start calling ->cancel_requests, or we might miss some
requests.
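
Roughly what I have in mind for the two phases, with your
i915_gem_request_submit() simplification folded in (just a sketch here,
modulo the comment-style fixups):

static void nop_submit_request(struct drm_i915_gem_request *request)
{
	/* Phase 1: mark the fence as failed and submit, but do not
	 * advance the global seqno yet, so nothing completes before
	 * every in-flight request carries the -EIO error.
	 */
	dma_fence_set_error(&request->fence, -EIO);
	i915_gem_request_submit(request);
}

static void nop_complete_submit_request(struct drm_i915_gem_request *request)
{
	unsigned long flags;

	/* Phase 2: mark the fence as failed and also roll the global
	 * seqno forward so the request completes immediately.
	 */
	dma_fence_set_error(&request->fence, -EIO);

	spin_lock_irqsave(&request->engine->timeline->lock, flags);
	__i915_gem_request_submit(request);
	intel_engine_init_global_seqno(request->engine,
				       request->global_seqno);
	spin_unlock_irqrestore(&request->engine->timeline->lock, flags);
}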
I'll do all the other suggestions and resubmit.
-Daniel
>> -static int __i915_gem_set_wedged_BKL(void *data)
>> +void i915_gem_set_wedged(struct drm_i915_private *i915)
>> {
>> - struct drm_i915_private *i915 = data;
>> struct intel_engine_cs *engine;
>> enum intel_engine_id id;
>>
>> + /* First, stop submission to hw, but do not yet complete requests by
>> + * rolling the global seqno forward (since this would complete requests
>> + * for which we haven't set the fence error to EIO yet).
>> + */
>
> I'm doing a quiet fixup of all my comments to follow
> /*
> * The core convention, which you normally use anyway.
> */
>
>> for_each_engine(engine, i915, id)
>> - engine_set_wedged(engine);
>> + engine->submit_request = nop_submit_request;
>>
>> - set_bit(I915_WEDGED, &i915->gpu_error.flags);
>> - wake_up_all(&i915->gpu_error.reset_queue);
>> + /* Make sure no one is running the old callback before we proceed with
>> + * cancelling requests and resetting the completion tracking. Otherwise
>> + * we might submit a request to the hardware which never completes.
>> + */
>> + synchronize_rcu();
>>
>> - return 0;
>> -}
>> + for_each_engine(engine, i915, id) {
>> + /* Mark all executing requests as skipped */
>> + engine->cancel_requests(engine);
>>
>> -void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
>> -{
>> - stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
>> + /* Only once we've force-cancelled all in-flight requests can we
>> + * start to complete all requests.
>> + */
>> + engine->submit_request = nop_complete_submit_request;
>> + }
>> +
>> + /* Make sure no request can slip through without getting completed by
>> + * either this call here to intel_engine_init_global_seqno, or the one
>> + * in nop_complete_submit_request.
>> + */
>> + synchronize_rcu();
>> +
>> + for_each_engine(engine, i915, id) {
>> + unsigned long flags;
>> +
>> + /* Mark all pending requests as complete so that any concurrent
>> + * (lockless) lookup doesn't try and wait upon the request as we
>> + * reset it.
>> + */
>> + spin_lock_irqsave(&engine->timeline->lock, flags);
>> + intel_engine_init_global_seqno(engine,
>> + intel_engine_last_submit(engine));
>> + spin_unlock_irqrestore(&engine->timeline->lock, flags);
>> + }
>> +
>> + set_bit(I915_WEDGED, &i915->gpu_error.flags);
>> + wake_up_all(&i915->gpu_error.reset_queue);
>> }
>>
>> bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
>> index b100b38f1dd2..ef78a85cb845 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_request.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
>> @@ -556,7 +556,9 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>> switch (state) {
>> case FENCE_COMPLETE:
>> trace_i915_gem_request_submit(request);
>> + rcu_read_lock();
>
> This trick needs a comment.
>
> /*
> * We need to serialize use of the submit_request() callback with its
> * hotplugging performed during an emergency i915_gem_set_wedged().
> * We use the RCU mechanism to mark the critical section in order to
> * force i915_gem_set_wedged() to wait until the submit_request() is
> * completed before proceeding.
> */
> -Chris
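
For the record, the serialisation your suggested comment describes is
the usual RCU publish-and-wait pattern, roughly like this (names as in
the patch above, sketch only):

	/* Reader side, in submit_notify(): sample and call the
	 * engine->submit_request pointer inside an RCU read-side
	 * critical section.
	 */
	rcu_read_lock();
	request->engine->submit_request(request);
	rcu_read_unlock();

	/* Writer side, in i915_gem_set_wedged(): swap in the nop
	 * callback on every engine, then wait until every reader that
	 * might still be executing the old callback has left its
	 * critical section.
	 */
	for_each_engine(engine, i915, id)
		engine->submit_request = nop_submit_request;
	synchronize_rcu();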
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch