[PATCH] drm/amdgpu: Skip execution of pending reset jobs

Lazar, Lijo lijo.lazar at amd.com
Fri Nov 10 15:07:50 UTC 2023



On 11/10/2023 8:18 PM, Christian König wrote:
> Am 09.11.23 um 08:38 schrieb Lijo Lazar:
>> cancel_work is not backported to all custom kernels.
> 
> Well, this is a pretty clear NAK to pushing this upstream. We absolutely 
> can't add workarounds for older kernels.
> 
> You could keep this in the backported kernel, but why should cancel_work 
> not be available?
> 

As you know, there are vendor-maintained kernels, and not all users 
necessarily upgrade to a kernel that has a backport of this, as 
that could totally disrupt their current environment.

Thanks,
Lijo

> Regards,
> Christian.
> 
>>   Add a workaround to
>> skip execution of already queued recovery jobs, if the device is already
>> reset.
>>
>> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  9 +++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 16 ++++++++++++++++
>>   3 files changed, 30 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index bebc73c6822c..c66524e2a56a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5411,6 +5411,8 @@ static inline void 
>> amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>>   {
>>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> +    amdgpu_reset_domain_clear_pending(adev->reset_domain);
>> +
>>   #if defined(CONFIG_DEBUG_FS)
>>       if (!amdgpu_sriov_vf(adev))
>>           cancel_work(&adev->reset_work);
>> @@ -5452,6 +5454,9 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>>       bool audio_suspended = false;
>>       bool gpu_reset_for_dev_remove = false;
>> +    if (amdgpu_reset_domain_in_drain_mode(adev->reset_domain))
>> +        return 0;
>> +
>>       gpu_reset_for_dev_remove =
>>               test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, 
>> &reset_context->flags) &&
>>                   test_bit(AMDGPU_NEED_FULL_RESET, 
>> &reset_context->flags);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> index 4baa300121d8..3ece7267d6ea 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>> @@ -120,6 +120,14 @@ void amdgpu_reset_destroy_reset_domain(struct 
>> kref *ref)
>>       kvfree(reset_domain);
>>   }
>> +static void amdgpu_reset_domain_cancel_all_work(struct work_struct 
>> *work)
>> +{
>> +    struct amdgpu_reset_domain *reset_domain =
>> +        container_of(work, struct amdgpu_reset_domain, clear);
>> +
>> +    reset_domain->drain = false;
>> +}
>> +
>>   struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum 
>> amdgpu_reset_domain_type type,
>>                                    char *wq_name)
>>   {
>> @@ -142,6 +150,7 @@ struct amdgpu_reset_domain 
>> *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>>       }
>> +    INIT_WORK(&reset_domain->clear, 
>> amdgpu_reset_domain_cancel_all_work);
>>       atomic_set(&reset_domain->in_gpu_reset, 0);
>>       atomic_set(&reset_domain->reset_res, 0);
>>       init_rwsem(&reset_domain->sem);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> index b0335a1c5e90..70059eea7e2f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
>> @@ -87,6 +87,8 @@ struct amdgpu_reset_domain {
>>       struct rw_semaphore sem;
>>       atomic_t in_gpu_reset;
>>       atomic_t reset_res;
>> +    struct work_struct clear;
>> +    bool drain;
>>   };
>>   #ifdef CONFIG_DEV_COREDUMP
>> @@ -137,6 +139,20 @@ static inline bool 
>> amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma
>>       return queue_work(domain->wq, work);
>>   }
>> +static inline void amdgpu_reset_domain_clear_pending(struct 
>> amdgpu_reset_domain *domain)
>> +{
>> +    domain->drain = true;
>> +    /* queue one more work to the domain queue. Till this work is 
>> finished,
>> +     * domain is in drain mode.
>> +     */
>> +    queue_work(domain->wq, &domain->clear);
>> +}
>> +
>> +static inline bool amdgpu_reset_domain_in_drain_mode(struct 
>> amdgpu_reset_domain *domain)
>> +{
>> +    return domain->drain;
>> +}
>> +
>>   void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
>> *reset_domain);
>>   void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain 
>> *reset_domain);
> 


More information about the amd-gfx mailing list