[Intel-gfx] [PATCH v2] drm/i915/guc: Fix revocation of non-persistent contexts

Ceraolo Spurio, Daniele daniele.ceraolospurio at intel.com
Tue Oct 4 15:13:54 UTC 2022



On 10/4/2022 4:14 AM, Tvrtko Ursulin wrote:
>
> On 03/10/2022 13:16, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>
>> Patch which added graceful exit for non-persistent contexts missed the
>> fact it is not enough to set the exiting flag on a context and let the
>> backend handle it from there.
>>
>> GuC backend cannot handle it because it runs independently in the
>> firmware and driver might not see the requests ever again. Patch also
>> missed the fact some usages of intel_context_is_banned in the GuC 
>> backend
>> needed replacing with newly introduced intel_context_is_schedulable.
>>
>> Fix the first issue by calling into backend revoke when we know this is
>> the last chance to do it. Fix the second issue by replacing
>> intel_context_is_banned with intel_context_is_schedulable, which should
>> always be safe since latter is a superset of the former.
>>
>> v2:
>>   * Just call ce->ops->revoke unconditionally. (Andrzej)
>
> CI is happy - could I get some acks for the GuC backend changes please?

I think we still need to have a longer conversation on the revoking 
times, but in the meantime this fixes the immediate concerns, so:

Acked-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>

Daniele

>
> Regards,
>
> Tvrtko
>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> Fixes: 45c64ecf97ee ("drm/i915: Improve user experience and driver 
>> robustness under SIGINT or similar")
>> Cc: Andrzej Hajda <andrzej.hajda at intel.com>
>> Cc: John Harrison <John.C.Harrison at Intel.com>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
>> Cc: <stable at vger.kernel.org> # v6.0+
>> Reviewed-by: Andrzej Hajda <andrzej.hajda at intel.com>
>> ---
>>   drivers/gpu/drm/i915/gem/i915_gem_context.c   |  8 +-----
>>   drivers/gpu/drm/i915/gt/intel_context.c       |  5 ++--
>>   drivers/gpu/drm/i915/gt/intel_context.h       |  3 +--
>>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 26 +++++++++----------
>>   4 files changed, 17 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
>> b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>> index 0bcde53c50c6..1e29b1e6d186 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>> @@ -1387,14 +1387,8 @@ kill_engines(struct i915_gem_engines *engines, 
>> bool exit, bool persistent)
>>        */
>>       for_each_gem_engine(ce, engines, it) {
>>           struct intel_engine_cs *engine;
>> -        bool skip = false;
>>   -        if (exit)
>> -            skip = intel_context_set_exiting(ce);
>> -        else if (!persistent)
>> -            skip = intel_context_exit_nonpersistent(ce, NULL);
>> -
>> -        if (skip)
>> +        if ((exit || !persistent) && intel_context_revoke(ce))
>>               continue; /* Already marked. */
>>             /*
>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c 
>> b/drivers/gpu/drm/i915/gt/intel_context.c
>> index 654a092ed3d6..e94365b08f1e 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>> @@ -614,13 +614,12 @@ bool intel_context_ban(struct intel_context 
>> *ce, struct i915_request *rq)
>>       return ret;
>>   }
>>   -bool intel_context_exit_nonpersistent(struct intel_context *ce,
>> -                      struct i915_request *rq)
>> +bool intel_context_revoke(struct intel_context *ce)
>>   {
>>       bool ret = intel_context_set_exiting(ce);
>>         if (ce->ops->revoke)
>> -        ce->ops->revoke(ce, rq, ce->engine->props.preempt_timeout_ms);
>> +        ce->ops->revoke(ce, NULL, 
>> ce->engine->props.preempt_timeout_ms);
>>         return ret;
>>   }
>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
>> b/drivers/gpu/drm/i915/gt/intel_context.h
>> index 8e2d70630c49..be09fb2e883a 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_context.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
>> @@ -329,8 +329,7 @@ static inline bool 
>> intel_context_set_exiting(struct intel_context *ce)
>>       return test_and_set_bit(CONTEXT_EXITING, &ce->flags);
>>   }
>>   -bool intel_context_exit_nonpersistent(struct intel_context *ce,
>> -                      struct i915_request *rq);
>> +bool intel_context_revoke(struct intel_context *ce);
>>     static inline bool
>>   intel_context_force_single_submission(const struct intel_context *ce)
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> index 0ef295a94060..88a4476b8e92 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> @@ -685,7 +685,7 @@ static int __guc_add_request(struct intel_guc 
>> *guc, struct i915_request *rq)
>>        * Corner case where requests were sitting in the priority list 
>> or a
>>        * request resubmitted after the context was banned.
>>        */
>> -    if (unlikely(intel_context_is_banned(ce))) {
>> +    if (unlikely(!intel_context_is_schedulable(ce))) {
>>           i915_request_put(i915_request_mark_eio(rq));
>>           intel_engine_signal_breadcrumbs(ce->engine);
>>           return 0;
>> @@ -871,15 +871,15 @@ static int guc_wq_item_append(struct intel_guc 
>> *guc,
>>                     struct i915_request *rq)
>>   {
>>       struct intel_context *ce = request_to_scheduling_context(rq);
>> -    int ret = 0;
>> +    int ret;
>>   -    if (likely(!intel_context_is_banned(ce))) {
>> -        ret = __guc_wq_item_append(rq);
>> +    if (unlikely(!intel_context_is_schedulable(ce)))
>> +        return 0;
>>   -        if (unlikely(ret == -EBUSY)) {
>> -            guc->stalled_request = rq;
>> -            guc->submission_stall_reason = STALL_MOVE_LRC_TAIL;
>> -        }
>> +    ret = __guc_wq_item_append(rq);
>> +    if (unlikely(ret == -EBUSY)) {
>> +        guc->stalled_request = rq;
>> +        guc->submission_stall_reason = STALL_MOVE_LRC_TAIL;
>>       }
>>         return ret;
>> @@ -898,7 +898,7 @@ static bool multi_lrc_submit(struct i915_request 
>> *rq)
>>        * submitting all the requests generated in parallel.
>>        */
>>       return test_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, 
>> &rq->fence.flags) ||
>> -        intel_context_is_banned(ce);
>> +           !intel_context_is_schedulable(ce);
>>   }
>>     static int guc_dequeue_one_context(struct intel_guc *guc)
>> @@ -967,7 +967,7 @@ static int guc_dequeue_one_context(struct 
>> intel_guc *guc)
>>           struct intel_context *ce = 
>> request_to_scheduling_context(last);
>>             if (unlikely(!ctx_id_mapped(guc, ce->guc_id.id) &&
>> -                 !intel_context_is_banned(ce))) {
>> +                 intel_context_is_schedulable(ce))) {
>>               ret = try_context_registration(ce, false);
>>               if (unlikely(ret == -EPIPE)) {
>>                   goto deadlk;
>> @@ -1577,7 +1577,7 @@ static void guc_reset_state(struct 
>> intel_context *ce, u32 head, bool scrub)
>>   {
>>       struct intel_engine_cs *engine = __context_to_physical_engine(ce);
>>   -    if (intel_context_is_banned(ce))
>> +    if (!intel_context_is_schedulable(ce))
>>           return;
>>         GEM_BUG_ON(!intel_context_is_pinned(ce));
>> @@ -4518,12 +4518,12 @@ static void guc_handle_context_reset(struct 
>> intel_guc *guc,
>>   {
>>       trace_intel_context_reset(ce);
>>   -    if (likely(!intel_context_is_banned(ce))) {
>> +    if (likely(intel_context_is_schedulable(ce))) {
>>           capture_error_state(guc, ce);
>>           guc_context_replay(ce);
>>       } else {
>>           drm_info(&guc_to_gt(guc)->i915->drm,
>> -             "Ignoring context reset notification of banned context 
>> 0x%04X on %s",
>> +             "Ignoring context reset notification of exiting context 
>> 0x%04X on %s",
>>                ce->guc_id.id, ce->engine->name);
>>       }
>>   }



More information about the dri-devel mailing list