[Intel-gfx] [PATCH] drm/i915: Improve user experience and driver robustness under SIGINT or similar

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Tue Jun 14 07:55:48 UTC 2022


Final call to raise objections.

Regards,

Tvrtko

On 07/06/2022 09:36, Tvrtko Ursulin wrote:
> 
> On 27/05/2022 13:07, Andrzej Hajda wrote:
>> On 27.05.2022 09:24, Tvrtko Ursulin wrote:
>>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>>
>>> We have long standing customer complaints that pressing Ctrl-C (or to 
>>> the
>>> effect of) causes engine resets with otherwise well behaving programs.
>>>
>>> Not only is logging engine resets during normal operation not desirable
>>> since it creates support incidents, but more fundamentally we should 
>>> avoid
>>> going the engine reset path when we can since any engine reset 
>>> introduces
>>> a chance of harming an innocent context.
>>>
>>> Reason for this undesirable behaviour is that the driver currently does
>>> not distinguish between banned contexts and non-persistent contexts 
>>> which
>>> have been closed.
>>>
>>> To fix this we add the distinction between the two reasons for revoking
>>> contexts, which then allows the strict timeout only be applied to 
>>> banned,
>>> while innocent contexts (well behaving) can preempt cleanly and exit
>>> without triggering the engine reset path.
>>>
>>> Note that the added context exiting category applies both to closed non-
>>> persistent context, and any exiting context when hangcheck has been
>>> disabled by the user.
>>>
>>> At the same time we rename the backend operation from 'ban' to 'revoke'
>>> which more accurately describes the actual semantics. (There is no 
>>> ban at
>>> the backend level since banning is a concept driven by the scheduling
>>> frontend. Backends are simply able to revoke a running context so that
>>> is the more appropriate name chosen.)
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/gem/i915_gem_context.c   | 23 +++++++++++------
>>>   drivers/gpu/drm/i915/gt/intel_context.c       | 24 ++++++++++++++++++
>>>   drivers/gpu/drm/i915/gt/intel_context.h       | 25 +++++++++++++------
>>>   drivers/gpu/drm/i915/gt/intel_context_types.h |  4 ++-
>>>   .../drm/i915/gt/intel_execlists_submission.c  |  6 ++---
>>>   .../gpu/drm/i915/gt/intel_ring_submission.c   |  7 +++---
>>>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 15 ++++++-----
>>>   drivers/gpu/drm/i915/i915_request.c           |  2 +-
>>>   8 files changed, 77 insertions(+), 29 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
>>> b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> index ab4c5ab28e4d..6b171c89b1b3 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> @@ -1367,7 +1367,8 @@ static struct intel_engine_cs 
>>> *active_engine(struct intel_context *ce)
>>>       return engine;
>>>   }
>>> -static void kill_engines(struct i915_gem_engines *engines, bool ban)
>>> +static void
>>> +kill_engines(struct i915_gem_engines *engines, bool exit, bool 
>>> persistent)
>>>   {
>>>       struct i915_gem_engines_iter it;
>>>       struct intel_context *ce;
>>> @@ -1381,9 +1382,15 @@ static void kill_engines(struct 
>>> i915_gem_engines *engines, bool ban)
>>>        */
>>>       for_each_gem_engine(ce, engines, it) {
>>>           struct intel_engine_cs *engine;
>>> +        bool skip = false;
>>> -        if (ban && intel_context_ban(ce, NULL))
>>> -            continue;
>>> +        if (exit)
>>> +            skip = intel_context_set_exiting(ce);
>>> +        else if (!persistent)
>>> +            skip = intel_context_exit_nonpersistent(ce, NULL);
>>> +
>>> +        if (skip)
>>> +            continue; /* Already marked. */
>>
>> why not:
>>      if (exit && intel_context_set_exiting(ce))
>>          continue;
>>      else if (!persistent && intel_context_exit_nonpersistent(ce, NULL)
>>          continue;
> 
> Just so I can put the "already marked" comment on single line.
> 
>>
>> Anyway:
>> Reviewed-by: Andrzej Hajda <andrzej.hajda at intel.com>
> 
> Thanks!
> 
> John, Daniel - you had reservations against the older version of this 
> patch AFAIR. This time round I believe I conceptually simplified it by 
> doing a clean separation of contexts which should not be scheduled any 
> more becuase they want it so, versus the ones we banned. That is, the 
> patch stops abusing the banned status for contexts which haven't been 
> (banned). This allows to only apply the strict preempt timeout to 
> banned, while there is no reason to add any new timeout values for the 
> rest.
> 
> Any objections to this version?
> 
> Regards,
> 
> Tvrtko
> 
>>
>> Regards
>> Andrzej
>>
>>>           /*
>>>            * Check the current active state of this context; if we
>>> @@ -1395,7 +1402,7 @@ static void kill_engines(struct 
>>> i915_gem_engines *engines, bool ban)
>>>           engine = active_engine(ce);
>>>           /* First attempt to gracefully cancel the context */
>>> -        if (engine && !__cancel_engine(engine) && ban)
>>> +        if (engine && !__cancel_engine(engine) && (exit || 
>>> !persistent))
>>>               /*
>>>                * If we are unable to send a preemptive pulse to bump
>>>                * the context from the GPU, we have to resort to a full
>>> @@ -1407,8 +1414,6 @@ static void kill_engines(struct 
>>> i915_gem_engines *engines, bool ban)
>>>   static void kill_context(struct i915_gem_context *ctx)
>>>   {
>>> -    bool ban = (!i915_gem_context_is_persistent(ctx) ||
>>> -            !ctx->i915->params.enable_hangcheck);
>>>       struct i915_gem_engines *pos, *next;
>>>       spin_lock_irq(&ctx->stale.lock);
>>> @@ -1421,7 +1426,8 @@ static void kill_context(struct 
>>> i915_gem_context *ctx)
>>>           spin_unlock_irq(&ctx->stale.lock);
>>> -        kill_engines(pos, ban);
>>> +        kill_engines(pos, !ctx->i915->params.enable_hangcheck,
>>> +                 i915_gem_context_is_persistent(ctx));
>>>           spin_lock_irq(&ctx->stale.lock);
>>>           GEM_BUG_ON(i915_sw_fence_signaled(&pos->fence));
>>> @@ -1467,7 +1473,8 @@ static void engines_idle_release(struct 
>>> i915_gem_context *ctx,
>>>   kill:
>>>       if (list_empty(&engines->link)) /* raced, already closed */
>>> -        kill_engines(engines, true);
>>> +        kill_engines(engines, true,
>>> +                 i915_gem_context_is_persistent(ctx));
>>>       i915_sw_fence_commit(&engines->fence);
>>>   }
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>>> b/drivers/gpu/drm/i915/gt/intel_context.c
>>> index 4070cb5711d8..654a092ed3d6 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>> @@ -601,6 +601,30 @@ u64 intel_context_get_avg_runtime_ns(struct 
>>> intel_context *ce)
>>>       return avg;
>>>   }
>>> +bool intel_context_ban(struct intel_context *ce, struct i915_request 
>>> *rq)
>>> +{
>>> +    bool ret = intel_context_set_banned(ce);
>>> +
>>> +    trace_intel_context_ban(ce);
>>> +
>>> +    if (ce->ops->revoke)
>>> +        ce->ops->revoke(ce, rq,
>>> +                INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +bool intel_context_exit_nonpersistent(struct intel_context *ce,
>>> +                      struct i915_request *rq)
>>> +{
>>> +    bool ret = intel_context_set_exiting(ce);
>>> +
>>> +    if (ce->ops->revoke)
>>> +        ce->ops->revoke(ce, rq, ce->engine->props.preempt_timeout_ms);
>>> +
>>> +    return ret;
>>> +}
>>> +
>>>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>>>   #include "selftest_context.c"
>>>   #endif
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>>> b/drivers/gpu/drm/i915/gt/intel_context.h
>>> index b7d3214d2cdd..8e2d70630c49 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>> @@ -25,6 +25,8 @@
>>>                ##__VA_ARGS__);                    \
>>>   } while (0)
>>> +#define INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS (1)
>>> +
>>>   struct i915_gem_ww_ctx;
>>>   void intel_context_init(struct intel_context *ce,
>>> @@ -309,18 +311,27 @@ static inline bool 
>>> intel_context_set_banned(struct intel_context *ce)
>>>       return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
>>>   }
>>> -static inline bool intel_context_ban(struct intel_context *ce,
>>> -                     struct i915_request *rq)
>>> +bool intel_context_ban(struct intel_context *ce, struct i915_request 
>>> *rq);
>>> +
>>> +static inline bool intel_context_is_schedulable(const struct 
>>> intel_context *ce)
>>>   {
>>> -    bool ret = intel_context_set_banned(ce);
>>> +    return !test_bit(CONTEXT_EXITING, &ce->flags) &&
>>> +           !test_bit(CONTEXT_BANNED, &ce->flags);
>>> +}
>>> -    trace_intel_context_ban(ce);
>>> -    if (ce->ops->ban)
>>> -        ce->ops->ban(ce, rq);
>>> +static inline bool intel_context_is_exiting(const struct 
>>> intel_context *ce)
>>> +{
>>> +    return test_bit(CONTEXT_EXITING, &ce->flags);
>>> +}
>>> -    return ret;
>>> +static inline bool intel_context_set_exiting(struct intel_context *ce)
>>> +{
>>> +    return test_and_set_bit(CONTEXT_EXITING, &ce->flags);
>>>   }
>>> +bool intel_context_exit_nonpersistent(struct intel_context *ce,
>>> +                      struct i915_request *rq);
>>> +
>>>   static inline bool
>>>   intel_context_force_single_submission(const struct intel_context *ce)
>>>   {
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
>>> b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>> index 09f82545789f..d2d75d9c0c8d 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>> @@ -40,7 +40,8 @@ struct intel_context_ops {
>>>       int (*alloc)(struct intel_context *ce);
>>> -    void (*ban)(struct intel_context *ce, struct i915_request *rq);
>>> +    void (*revoke)(struct intel_context *ce, struct i915_request *rq,
>>> +               unsigned int preempt_timeout_ms);
>>>       int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx 
>>> *ww, void **vaddr);
>>>       int (*pin)(struct intel_context *ce, void *vaddr);
>>> @@ -122,6 +123,7 @@ struct intel_context {
>>>   #define CONTEXT_GUC_INIT        10
>>>   #define CONTEXT_PERMA_PIN        11
>>>   #define CONTEXT_IS_PARKING        12
>>> +#define CONTEXT_EXITING            13
>>>       struct {
>>>           u64 timeout_us;
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
>>> b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> index a4510b5c0c3d..ad72e2c5c4e7 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> @@ -480,9 +480,9 @@ __execlists_schedule_in(struct i915_request *rq)
>>>       if (unlikely(intel_context_is_closed(ce) &&
>>>                !intel_engine_has_heartbeat(engine)))
>>> -        intel_context_set_banned(ce);
>>> +        intel_context_set_exiting(ce);
>>> -    if (unlikely(intel_context_is_banned(ce) || bad_request(rq)))
>>> +    if (unlikely(!intel_context_is_schedulable(ce) || bad_request(rq)))
>>>           reset_active(rq, engine);
>>>       if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
>>> @@ -1243,7 +1243,7 @@ static unsigned long 
>>> active_preempt_timeout(struct intel_engine_cs *engine,
>>>       /* Force a fast reset for terminated contexts (ignoring sysfs!) */
>>>       if (unlikely(intel_context_is_banned(rq->context) || 
>>> bad_request(rq)))
>>> -        return 1;
>>> +        return INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS;
>>>       return READ_ONCE(engine->props.preempt_timeout_ms);
>>>   }
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c 
>>> b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>> index f8f279a195c0..d5d6f1fadcae 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
>>> @@ -598,8 +598,9 @@ static void ring_context_reset(struct 
>>> intel_context *ce)
>>>       clear_bit(CONTEXT_VALID_BIT, &ce->flags);
>>>   }
>>> -static void ring_context_ban(struct intel_context *ce,
>>> -                 struct i915_request *rq)
>>> +static void ring_context_revoke(struct intel_context *ce,
>>> +                struct i915_request *rq,
>>> +                unsigned int preempt_timeout_ms)
>>>   {
>>>       struct intel_engine_cs *engine;
>>> @@ -634,7 +635,7 @@ static const struct intel_context_ops 
>>> ring_context_ops = {
>>>       .cancel_request = ring_context_cancel_request,
>>> -    .ban = ring_context_ban,
>>> +    .revoke = ring_context_revoke,
>>>       .pre_pin = ring_context_pre_pin,
>>>       .pin = ring_context_pin,
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> index 5a1dfacf24ea..e62ea35513ea 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> @@ -2790,7 +2790,9 @@ static void 
>>> __guc_context_set_preemption_timeout(struct intel_guc *guc,
>>>       __guc_context_set_context_policies(guc, &policy, true);
>>>   }
>>> -static void guc_context_ban(struct intel_context *ce, struct 
>>> i915_request *rq)
>>> +static void
>>> +guc_context_revoke(struct intel_context *ce, struct i915_request *rq,
>>> +           unsigned int preempt_timeout_ms)
>>>   {
>>>       struct intel_guc *guc = ce_to_guc(ce);
>>>       struct intel_runtime_pm *runtime_pm =
>>> @@ -2829,7 +2831,8 @@ static void guc_context_ban(struct 
>>> intel_context *ce, struct i915_request *rq)
>>>            * gets kicked off the HW ASAP.
>>>            */
>>>           with_intel_runtime_pm(runtime_pm, wakeref) {
>>> -            __guc_context_set_preemption_timeout(guc, guc_id, 1);
>>> +            __guc_context_set_preemption_timeout(guc, guc_id,
>>> +                                 preempt_timeout_ms);
>>>               __guc_context_sched_disable(guc, ce, guc_id);
>>>           }
>>>       } else {
>>> @@ -2837,7 +2840,7 @@ static void guc_context_ban(struct 
>>> intel_context *ce, struct i915_request *rq)
>>>               with_intel_runtime_pm(runtime_pm, wakeref)
>>>                   __guc_context_set_preemption_timeout(guc,
>>>                                        ce->guc_id.id,
>>> -                                     1);
>>> +                                     preempt_timeout_ms);
>>>           spin_unlock_irqrestore(&ce->guc_state.lock, flags);
>>>       }
>>>   }
>>> @@ -3190,7 +3193,7 @@ static const struct intel_context_ops 
>>> guc_context_ops = {
>>>       .unpin = guc_context_unpin,
>>>       .post_unpin = guc_context_post_unpin,
>>> -    .ban = guc_context_ban,
>>> +    .revoke = guc_context_revoke,
>>>       .cancel_request = guc_context_cancel_request,
>>> @@ -3439,7 +3442,7 @@ static const struct intel_context_ops 
>>> virtual_guc_context_ops = {
>>>       .unpin = guc_virtual_context_unpin,
>>>       .post_unpin = guc_context_post_unpin,
>>> -    .ban = guc_context_ban,
>>> +    .revoke = guc_context_revoke,
>>>       .cancel_request = guc_context_cancel_request,
>>> @@ -3528,7 +3531,7 @@ static const struct intel_context_ops 
>>> virtual_parent_context_ops = {
>>>       .unpin = guc_parent_context_unpin,
>>>       .post_unpin = guc_context_post_unpin,
>>> -    .ban = guc_context_ban,
>>> +    .revoke = guc_context_revoke,
>>>       .cancel_request = guc_context_cancel_request,
>>> diff --git a/drivers/gpu/drm/i915/i915_request.c 
>>> b/drivers/gpu/drm/i915/i915_request.c
>>> index 73d5195146b0..c3937640b119 100644
>>> --- a/drivers/gpu/drm/i915/i915_request.c
>>> +++ b/drivers/gpu/drm/i915/i915_request.c
>>> @@ -611,7 +611,7 @@ bool __i915_request_submit(struct i915_request 
>>> *request)
>>>           goto active;
>>>       }
>>> -    if (unlikely(intel_context_is_banned(request->context)))
>>> +    if (unlikely(!intel_context_is_schedulable(request->context)))
>>>           i915_request_set_error_once(request, -EIO);
>>>       if (unlikely(fatal_error(request->fence.error)))
>>


More information about the Intel-gfx mailing list