[PATCH] drm/amdgpu: Skip execution of pending reset jobs

Kamal, Asad Asad.Kamal at amd.com
Fri Nov 10 12:33:24 UTC 2023


[AMD Official Use Only - General]

Reviewed-by: Asad Kamal <asad.kamal at amd.com>

Thanks & Regards
Asad

-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Lazar, Lijo
Sent: Friday, November 10, 2023 4:19 PM
To: amd-gfx at lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
Subject: Re: [PATCH] drm/amdgpu: Skip execution of pending reset jobs

<Ping>

On 11/9/2023 1:08 PM, Lijo Lazar wrote:
> cancel_work is not backported to all custom kernels. Add a workaround
> to skip execution of already queued recovery jobs, if the device is
> already reset.
>
> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 16 ++++++++++++++++
>   3 files changed, 30 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bebc73c6822c..c66524e2a56a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5411,6 +5411,8 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>   {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>
> +     amdgpu_reset_domain_clear_pending(adev->reset_domain);
> +
>   #if defined(CONFIG_DEBUG_FS)
>       if (!amdgpu_sriov_vf(adev))
>               cancel_work(&adev->reset_work);
> @@ -5452,6 +5454,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>       bool audio_suspended = false;
>       bool gpu_reset_for_dev_remove = false;
>
> +     if (amdgpu_reset_domain_in_drain_mode(adev->reset_domain))
> +             return 0;
> +
>       gpu_reset_for_dev_remove =
>                       test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
>                               test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 4baa300121d8..3ece7267d6ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -120,6 +120,14 @@ void amdgpu_reset_destroy_reset_domain(struct kref *ref)
>       kvfree(reset_domain);
>   }
>
> +static void amdgpu_reset_domain_cancel_all_work(struct work_struct *work)
> +{
> +     struct amdgpu_reset_domain *reset_domain =
> +             container_of(work, struct amdgpu_reset_domain, clear);
> +
> +     reset_domain->drain = false;
> +}
> +
>   struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
>                                                            char *wq_name)
>   {
> @@ -142,6 +150,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>
>       }
>
> +     INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work);
>       atomic_set(&reset_domain->in_gpu_reset, 0);
>       atomic_set(&reset_domain->reset_res, 0);
>       init_rwsem(&reset_domain->sem);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index b0335a1c5e90..70059eea7e2f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -87,6 +87,8 @@ struct amdgpu_reset_domain {
>       struct rw_semaphore sem;
>       atomic_t in_gpu_reset;
>       atomic_t reset_res;
> +     struct work_struct clear;
> +     bool drain;
>   };
>
>   #ifdef CONFIG_DEV_COREDUMP
> @@ -137,6 +139,20 @@ static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma
>       return queue_work(domain->wq, work);
>   }
>
> +static inline void amdgpu_reset_domain_clear_pending(struct amdgpu_reset_domain *domain)
> +{
> +     domain->drain = true;
> +     /* queue one more work to the domain queue. Till this work is finished,
> +      * domain is in drain mode.
> +      */
> +     queue_work(domain->wq, &domain->clear);
> +}
> +
> +static inline bool amdgpu_reset_domain_in_drain_mode(struct amdgpu_reset_domain *domain)
> +{
> +     return domain->drain;
> +}
> +
>   void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
>
>   void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);


More information about the amd-gfx mailing list