[PATCH] drm/amdgpu: Skip execution of pending reset jobs

Christian König ckoenig.leichtzumerken at gmail.com
Fri Nov 10 15:10:58 UTC 2023


Am 10.11.23 um 16:07 schrieb Lazar, Lijo:
>
>
> On 11/10/2023 8:18 PM, Christian König wrote:
>> Am 09.11.23 um 08:38 schrieb Lijo Lazar:
>>> cancel_work is not backported to all custom kernels.
>>
>> Well, this is a pretty clear NAK to pushing this upstream. We absolutely 
>> can't add workarounds for older kernels.
>>
>> You could keep this in the backported kernel, but why should 
>> cancel_work not be available?
>>
>
> As you know, there are vendor-maintained kernels, and not all users 
> necessarily upgrade to a kernel which has a backport of this, as 
> that could be a total disruption of their current environment.

Yeah, so what? The documented rule for upstreaming is that the code 
should be written for this kernel version and this alone!

This has clear precedence over vendors backporting the driver. No 
backporting aid whatsoever is allowed in the upstream kernel and this 
counts for all drivers in there.

We can certainly do this in the internal backporting branch, but pushing 
it to amd-staging-drm-next is a complete no-go.

Regards,
Christian.

>
> Thanks,
> Lijo
>
>> Regards,
>> Christian.
>>
>>>   Add a workaround to
>>> skip execution of already queued recovery jobs, if the device is 
>>> already
>>> reset.
>>>
>>> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  9 +++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 16 ++++++++++++++++
>>>   3 files changed, 30 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index bebc73c6822c..c66524e2a56a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -5411,6 +5411,8 @@ static inline void 
>>> amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>>>   {
>>>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>>> +    amdgpu_reset_domain_clear_pending(adev->reset_domain);
>>> +
>>>   #if defined(CONFIG_DEBUG_FS)
>>>       if (!amdgpu_sriov_vf(adev))
>>>           cancel_work(&adev->reset_work);
>>> @@ -5452,6 +5454,9 @@ int amdgpu_device_gpu_recover(struct 
>>> amdgpu_device *adev,
>>>       bool audio_suspended = false;
>>>       bool gpu_reset_for_dev_remove = false;
>>> +    if (amdgpu_reset_domain_in_drain_mode(adev->reset_domain))
>>> +        return 0;
>>> +
>>>       gpu_reset_for_dev_remove =
>>>               test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, 
>>> &reset_context->flags) &&
>>>                   test_bit(AMDGPU_NEED_FULL_RESET, 
>>> &reset_context->flags);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> index 4baa300121d8..3ece7267d6ea 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> @@ -120,6 +120,14 @@ void amdgpu_reset_destroy_reset_domain(struct 
>>> kref *ref)
>>>       kvfree(reset_domain);
>>>   }
>>> +static void amdgpu_reset_domain_cancel_all_work(struct work_struct 
>>> *work)
>>> +{
>>> +    struct amdgpu_reset_domain *reset_domain =
>>> +        container_of(work, struct amdgpu_reset_domain, clear);
>>> +
>>> +    reset_domain->drain = false;
>>> +}
>>> +
>>>   struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum 
>>> amdgpu_reset_domain_type type,
>>>                                    char *wq_name)
>>>   {
>>> @@ -142,6 +150,7 @@ struct amdgpu_reset_domain 
>>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>>       }
>>> +    INIT_WORK(&reset_domain->clear, 
>>> amdgpu_reset_domain_cancel_all_work);
>>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>>       atomic_set(&reset_domain->reset_res, 0);
>>>       init_rwsem(&reset_domain->sem);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> index b0335a1c5e90..70059eea7e2f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>>> @@ -87,6 +87,8 @@ struct amdgpu_reset_domain {
>>>       struct rw_semaphore sem;
>>>       atomic_t in_gpu_reset;
>>>       atomic_t reset_res;
>>> +    struct work_struct clear;
>>> +    bool drain;
>>>   };
>>>   #ifdef CONFIG_DEV_COREDUMP
>>> @@ -137,6 +139,20 @@ static inline bool 
>>> amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma
>>>       return queue_work(domain->wq, work);
>>>   }
>>> +static inline void amdgpu_reset_domain_clear_pending(struct 
>>> amdgpu_reset_domain *domain)
>>> +{
>>> +    domain->drain = true;
>>> +    /* queue one more work to the domain queue. Till this work is 
>>> finished,
>>> +     * domain is in drain mode.
>>> +     */
>>> +    queue_work(domain->wq, &domain->clear);
>>> +}
>>> +
>>> +static inline bool amdgpu_reset_domain_in_drain_mode(struct 
>>> amdgpu_reset_domain *domain)
>>> +{
>>> +    return domain->drain;
>>> +}
>>> +
>>>   void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
>>> *reset_domain);
>>>   void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain 
>>> *reset_domain);
>>



More information about the amd-gfx mailing list