[Intel-gfx] [PATCH 05/21] drm/i915: Separate GPU hang waitqueue from advance

Tue Jun 7 12:11:20 UTC 2016

On 06/06/2016 18:30, Tvrtko Ursulin wrote:
>
> On 03/06/16 17:08, Chris Wilson wrote:
>> Currently __i915_wait_request uses a per-engine wait_queue_t for the dual
>> purpose of waking after the GPU advances or for waking after an error.
>> In the future, we may add even more wake sources and require greater
>> separation, but for now we can conceptually simplify wakeups by
>> separating
>> the two sources. In particular, this allows us to use different
>> wait-queues
>> (e.g. one on the engine advancement, a global one for errors and one on
>> each requests) without any hassle.
>
> + Arun
>
> I think this will conflict with the TDR work where one of the features
> is to make reset handling per engine. So I am not sure how beneficial in
> general, or painful for the TDR series, this patch might be.

Thanks Tvrtko.
Chris has give some comments on a related tdr patch based on these 
changes. I am looking into how to update my changes based on this.

Tdr code need access to struct_mutex so if a waiter is holding it then 
we should be able to ask it to try again so that we can proceed with 
recovery, similarly when an engine reset is in progress.

regards
Arun

>
> Regards,
>
> Tvrtko
>
>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h |  6 ++++++
>>   drivers/gpu/drm/i915/i915_gem.c |  5 +++++
>>   drivers/gpu/drm/i915/i915_irq.c | 19 ++++---------------
>>   3 files changed, 15 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index ceccc6d6b119..e399e97965e0 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1401,6 +1401,12 @@ struct i915_gpu_error {
>>   #define I915_WEDGED            (1 << 31)
>>
>>       /**
>> +     * Waitqueue to signal when a hang is detected. Used to for waiters
>> +     * to release the struct_mutex for the reset to procede.
>> +     */
>> +    wait_queue_head_t wait_queue;
>> +
>> +    /**
>>        * Waitqueue to signal when the reset has completed. Used by
>> clients
>>        * that wait for dev_priv->mm.wedged to settle.
>>        */
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index 03256f096ab6..de4fb39312a4 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -1234,6 +1234,7 @@ int __i915_wait_request(struct
>> drm_i915_gem_request *req,
>>       const bool irq_test_in_progress =
>>           ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) &
>> intel_engine_flag(engine);
>>       int state = interruptible ? TASK_INTERRUPTIBLE :
>> TASK_UNINTERRUPTIBLE;
>> +    DEFINE_WAIT(reset);
>>       DEFINE_WAIT(wait);
>>       unsigned long timeout_expire;
>>       s64 before = 0; /* Only to silence a compiler warning. */
>> @@ -1278,6 +1279,7 @@ int __i915_wait_request(struct
>> drm_i915_gem_request *req,
>>           goto out;
>>       }
>>
>> +    add_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
>>       for (;;) {
>>           struct timer_list timer;
>>
>> @@ -1329,6 +1331,8 @@ int __i915_wait_request(struct
>> drm_i915_gem_request *req,
>>               destroy_timer_on_stack(&timer);
>>           }
>>       }
>> +    remove_wait_queue(&dev_priv->gpu_error.wait_queue, &reset);
>> +
>>       if (!irq_test_in_progress)
>>           engine->irq_put(engine);
>>
>> @@ -5026,6 +5030,7 @@ i915_gem_load_init(struct drm_device *dev)
>>                 i915_gem_retire_work_handler);
>>       INIT_DELAYED_WORK(&dev_priv->mm.idle_work,
>>                 i915_gem_idle_work_handler);
>> +    init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
>>       init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
>>
>>       dev_priv->relative_constants_mode =
>> I915_EXEC_CONSTANTS_REL_GENERAL;
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c
>> b/drivers/gpu/drm/i915/i915_irq.c
>> index 83cab14639b2..30127b94f26e 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -2488,11 +2488,8 @@ static irqreturn_t gen8_irq_handler(int irq,
>> void *arg)
>>       return ret;
>>   }
>>
>> -static void i915_error_wake_up(struct drm_i915_private *dev_priv,
>> -                   bool reset_completed)
>> +static void i915_error_wake_up(struct drm_i915_private *dev_priv)
>>   {
>> -    struct intel_engine_cs *engine;
>> -
>>       /*
>>        * Notify all waiters for GPU completion events that reset state
>> has
>>        * been changed, and that they need to restart their wait after
>> @@ -2501,18 +2498,10 @@ static void i915_error_wake_up(struct
>> drm_i915_private *dev_priv,
>>        */
>>
>>       /* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
>> -    for_each_engine(engine, dev_priv)
>> -        wake_up_all(&engine->irq_queue);
>> +    wake_up_all(&dev_priv->gpu_error.wait_queue);
>>
>>       /* Wake up intel_crtc_wait_for_pending_flips, holding
>> crtc->mutex. */
>>       wake_up_all(&dev_priv->pending_flip_queue);
>> -
>> -    /*
>> -     * Signal tasks blocked in i915_gem_wait_for_error that the pending
>> -     * reset state is cleared.
>> -     */
>> -    if (reset_completed)
>> -        wake_up_all(&dev_priv->gpu_error.reset_queue);
>>   }
>>
>>   /**
>> @@ -2577,7 +2566,7 @@ static void i915_reset_and_wakeup(struct
>> drm_i915_private *dev_priv)
>>            * Note: The wake_up also serves as a memory barrier so that
>>            * waiters see the update value of the reset counter atomic_t.
>>            */
>> -        i915_error_wake_up(dev_priv, true);
>> +        wake_up_all(&dev_priv->gpu_error.reset_queue);
>>       }
>>   }
>>
>> @@ -2713,7 +2702,7 @@ void i915_handle_error(struct drm_i915_private
>> *dev_priv,
>>            * ensure that the waiters see the updated value of the reset
>>            * counter atomic_t.
>>            */
>> -        i915_error_wake_up(dev_priv, false);
>> +        i915_error_wake_up(dev_priv);
>>       }
>>
>>       i915_reset_and_wakeup(dev_priv);
>>
>