[PATCH] drm/amdgpu: reset asic after system-wide suspend aborted

Thu Nov 18 08:00:57 UTC 2021

On 11/18/2021 12:32 PM, Prike Liang wrote:
> Do ASIC reset at the moment Sx suspend aborted behind of amdgpu suspend
> to keep AMDGPU in a clean reset state and that can avoid re-initialize
> device improperly error.
> 
> Signed-off-by: Prike Liang <Prike.Liang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 19 +++++++++++++++++++
>   3 files changed, 24 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b85b67a..8bd9833 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1053,6 +1053,7 @@ struct amdgpu_device {
>   	bool				in_s3;
>   	bool				in_s4;
>   	bool				in_s0ix;
> +	bool				pm_completed;

PM framework maintains separate flags, why not use the same?

         dev->power.is_suspended = false;
         dev->power.is_noirq_suspended = false;
         dev->power.is_late_suspended = false;

BTW, Alex posted a patch which does similar thing, though it tries reset 
if suspend fails.

https://lore.kernel.org/all/DM6PR12MB26195F8E099407B4B6966FEBE4999@DM6PR12MB2619.namprd12.prod.outlook.com/

If that reset also failed, then no point in another reset, or keep it as 
part of resume.

Thanks,
Lijo

>   
>   	atomic_t 			in_gpu_reset;
>   	enum pp_mp1_state               mp1_state;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ec42a6f..a12ed54 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3983,6 +3983,10 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
>   	if (adev->in_s0ix)
>   		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
>   
> +	if (!adev->pm_completed) {
> +		dev_warn(adev->dev, "suspend aborted will do asic reset\n");
> +		amdgpu_asic_reset(adev);
> +	}
>   	/* post card */
>   	if (amdgpu_device_need_post(adev)) {
>   		r = amdgpu_device_asic_init(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index eee3cf8..9f90017 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2168,6 +2168,23 @@ static int amdgpu_pmops_suspend(struct device *dev)
>   	return r;
>   }
>   
> +/*
> + * Actually the PM suspend whether is completed should be confirmed
> + * by checking the sysfs sys/power/suspend_stats/failed_suspend.However,
> + * in this function only check the AMDGPU device whether is suspended
> + * completely in the system-wide suspend process.
> + */
> +static int amdgpu_pmops_noirq_suspend(struct device *dev)
> +{
> +	struct drm_device *drm_dev = dev_get_drvdata(dev);
> +	struct amdgpu_device *adev = drm_to_adev(drm_dev);
> +
> +	dev_dbg(dev, "amdgpu suspend completely.\n");
> +	adev->pm_completed = true;
> +
> +	return 0;
> +}
> +
>   static int amdgpu_pmops_resume(struct device *dev)
>   {
>   	struct drm_device *drm_dev = dev_get_drvdata(dev);
> @@ -2181,6 +2198,7 @@ static int amdgpu_pmops_resume(struct device *dev)
>   	r = amdgpu_device_resume(drm_dev, true);
>   	if (amdgpu_acpi_is_s0ix_active(adev))
>   		adev->in_s0ix = false;
> +	adev->pm_completed = false;
>   	return r;
>   }
>   
> @@ -2397,6 +2415,7 @@ static const struct dev_pm_ops amdgpu_pm_ops = {
>   	.runtime_suspend = amdgpu_pmops_runtime_suspend,
>   	.runtime_resume = amdgpu_pmops_runtime_resume,
>   	.runtime_idle = amdgpu_pmops_runtime_idle,
> +	.suspend_noirq = amdgpu_pmops_noirq_suspend,
>   };
>   
>   static int amdgpu_flush(struct file *f, fl_owner_t id)
>