[Intel-gfx] [PATCH v7 3/5] drm/i915: No TLB invalidation on wedged or suspended GT

Fri Oct 6 17:30:57 UTC 2023

On 10/6/2023 03:23, Tvrtko Ursulin wrote:
>
>
> On 05/10/2023 20:35, Jonathan Cavitt wrote:
>> ...
>> +static bool intel_gt_is_enabled(const struct intel_gt *gt)
>> +{
>> +    /* Check if GT is wedged or suspended */
>> +    if (intel_gt_is_wedged(gt) || !intel_irqs_enabled(gt->i915))
>> +        return false;
>> +    return true;
>> +}
>> +
>>   static int guc_send_invalidate_tlb(struct intel_guc *guc, u32 type)
>>   {
>>       struct intel_guc_tlb_wait _wq, *wq = &_wq;
>> @@ -4763,7 +4786,8 @@ static int guc_send_invalidate_tlb(struct 
>> intel_guc *guc, u32 type)
>>       };
>>       u32 size = ARRAY_SIZE(action);
>>   -    if (!intel_guc_ct_enabled(&guc->ct))
>> +    if (!intel_gt_is_enabled(gt) ||
>> +        !intel_guc_ct_enabled(&guc->ct))
>
> IMO this reads confused but I leave it to the GuC experts to decide 
> what makes sense. Not only that it reads confused but it does inspire 
> confidence that it closes any race, since this state can still change 
> just after this check, and then the invalidation request gets 
> submitted (contrary to what the commit says?). Only thing it does 
> below is skip the wait and the time out error message. Again, I leave 
> it for people who know the GuC state transition flows to bless this part.
>
> Regards,
>
> Tvrtko
Regarding confused naming, I personally still think that 
intel_gt_is_enabled() is a bad name. Even the comment inside the 
function does not mention 'enable', it says 'wedged or suspended'. One 
could certainly argue that the GT is also not currently enabled if GuC 
is in use but the CT channel is down.

Regarding race conditions, the only things that can take the CT channel 
down are driver shutdown, suspend and GT reset. On the submission side, 
the assumption is that the scheduling levels of the driver are not going 
to call in to the submission backend without suitable locking held to 
ensure those operations cannot occur concurrently. Is the same not true 
here? We have to guard against the situation where the call starts from 
a 'bad' state, e.g. wedged. But the lowest level of code can't be 
expected to take higher level locks. From all the way down here, we have 
no idea what the upper levels may or may not be doing and what locks may 
or may not have been acquired, and therefore what locks may or may not 
be safe to acquire.

John.

>
>>           return -EINVAL;
>>         init_waitqueue_head(&_wq.wq);
>> @@ -4806,7 +4830,8 @@ static int guc_send_invalidate_tlb(struct 
>> intel_guc *guc, u32 type)
>>        * can be queued in CT buffer.
>>        */
>>   #define OUTSTANDING_GUC_TIMEOUT_PERIOD  (HZ * 2)
>> -    if (!must_wait_woken(&wait, OUTSTANDING_GUC_TIMEOUT_PERIOD)) {
>> +    if (intel_gt_is_enabled(gt) &&
>> +        !must_wait_woken(&wait, OUTSTANDING_GUC_TIMEOUT_PERIOD)) {
>>           gt_err(gt,
>>                  "TLB invalidation response timed out for seqno 
>> %u\n", seqno);
>>           err = -ETIME;
>> diff --git a/drivers/gpu/drm/i915/i915_driver.c 
>> b/drivers/gpu/drm/i915/i915_driver.c
>> index ccbb2834cde07..0c9d9826d2f41 100644
>> --- a/drivers/gpu/drm/i915/i915_driver.c
>> +++ b/drivers/gpu/drm/i915/i915_driver.c
>> @@ -72,6 +72,7 @@
>>   #include "gt/intel_gt.h"
>>   #include "gt/intel_gt_pm.h"
>>   #include "gt/intel_rc6.h"
>> +#include "gt/intel_tlb.h"
>>     #include "pxp/intel_pxp.h"
>>   #include "pxp/intel_pxp_debugfs.h"
>> @@ -1093,6 +1094,9 @@ static int i915_drm_suspend(struct drm_device 
>> *dev)
>>       intel_dp_mst_suspend(dev_priv);
>>         intel_runtime_pm_disable_interrupts(dev_priv);
>> +
>> +    intel_gt_tlb_suspend_all(dev_priv);
>> +
>>       intel_hpd_cancel_work(dev_priv);
>>         intel_suspend_encoders(dev_priv);
>> @@ -1264,6 +1268,8 @@ static int i915_drm_resume(struct drm_device *dev)
>>         intel_gvt_resume(dev_priv);
>>   +    intel_gt_tlb_resume_all(dev_priv);
>> +
>>       enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
>>         return 0;