[PATCH] drm/amdgpu: Skip execution of pending reset jobs

Christian König ckoenig.leichtzumerken at gmail.com
Fri Nov 10 14:48:10 UTC 2023


Am 09.11.23 um 08:38 schrieb Lijo Lazar:
> cancel_work is not backported to all custom kernels.

Well, this is a pretty clear NAK to pushing this upstream. We absolutely 
can't add workarounds for older kernels.

You could keep this in the backported kernel, but why should cancel_work 
not be available?

Regards,
Christian.

>   Add a workaround to
> skip execution of already queued recovery jobs, if the device is already
> reset.
>
> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 16 ++++++++++++++++
>   3 files changed, 30 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bebc73c6822c..c66524e2a56a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5411,6 +5411,8 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   
> +	amdgpu_reset_domain_clear_pending(adev->reset_domain);
> +
>   #if defined(CONFIG_DEBUG_FS)
>   	if (!amdgpu_sriov_vf(adev))
>   		cancel_work(&adev->reset_work);
> @@ -5452,6 +5454,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	bool audio_suspended = false;
>   	bool gpu_reset_for_dev_remove = false;
>   
> +	if (amdgpu_reset_domain_in_drain_mode(adev->reset_domain))
> +		return 0;
> +
>   	gpu_reset_for_dev_remove =
>   			test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
>   				test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 4baa300121d8..3ece7267d6ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -120,6 +120,14 @@ void amdgpu_reset_destroy_reset_domain(struct kref *ref)
>   	kvfree(reset_domain);
>   }
>   
> +static void amdgpu_reset_domain_cancel_all_work(struct work_struct *work)
> +{
> +	struct amdgpu_reset_domain *reset_domain =
> +		container_of(work, struct amdgpu_reset_domain, clear);
> +
> +	reset_domain->drain = false;
> +}
> +
>   struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
>   							     char *wq_name)
>   {
> @@ -142,6 +150,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>   
>   	}
>   
> +	INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work);
>   	atomic_set(&reset_domain->in_gpu_reset, 0);
>   	atomic_set(&reset_domain->reset_res, 0);
>   	init_rwsem(&reset_domain->sem);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index b0335a1c5e90..70059eea7e2f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -87,6 +87,8 @@ struct amdgpu_reset_domain {
>   	struct rw_semaphore sem;
>   	atomic_t in_gpu_reset;
>   	atomic_t reset_res;
> +	struct work_struct clear;
> +	bool drain;
>   };
>   
>   #ifdef CONFIG_DEV_COREDUMP
> @@ -137,6 +139,20 @@ static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma
>   	return queue_work(domain->wq, work);
>   }
>   
> +static inline void amdgpu_reset_domain_clear_pending(struct amdgpu_reset_domain *domain)
> +{
> +	domain->drain = true;
> +	/* queue one more work to the domain queue. Till this work is finished,
> +	 * domain is in drain mode.
> +	 */
> +	queue_work(domain->wq, &domain->clear);
> +}
> +
> +static inline bool amdgpu_reset_domain_in_drain_mode(struct amdgpu_reset_domain *domain)
> +{
> +	return domain->drain;
> +}
> +
>   void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
>   
>   void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);



More information about the amd-gfx mailing list