[PATCH 2/2] drm/amdgpu: fix pm notifier handling
Mario Limonciello
mario.limonciello at amd.com
Thu May 1 21:18:48 UTC 2025
On 5/1/2025 3:09 PM, Alex Deucher wrote:
> Set the s3/s0ix and s4 flags in the pm notifier so that we can skip
> the resource evictions properly in pm prepare based on whether
> we are suspending or hibernating. Drop the eviction as processes
> are not frozen at this time, so we can end up getting stuck trying
> to evict VRAM while applications continue to submit work, which
> causes the buffers to get pulled back into VRAM.
>
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4178
> Fixes: 2965e6355dcd ("drm/amd: Add Suspend/Hibernate notification callback support")
> Cc: Mario Limonciello <mario.limonciello at amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
Rather than removing the callbacks (and thus introducing the "other"
bugs with memory pressure), I've sent an alternative series. LMK what
you think of this instead.
https://lore.kernel.org/amd-gfx/20250501211734.2434369-1-superm1@kernel.org/T/#m6cdc075af911868667ea6fc849bcd196477d6c20
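
For context, a minimal sketch of the pm notifier mechanism this patch builds
on is below. It is illustrative only: the module and flag names are
hypothetical and this is not the actual amdgpu code, but PM_* events and
register_pm_notifier() are the real kernel API.

/*
 * demo_pm_notifier.c: standalone sketch of recording suspend vs. hibernate
 * state from a pm notifier, mirroring what the patch moves into
 * amdgpu_device_pm_notifier().
 */
#include <linux/module.h>
#include <linux/suspend.h>

static bool in_suspend_like;    /* suspend-to-idle / suspend-to-RAM */
static bool in_hibernate;       /* hibernation (S4) */

static int demo_pm_notifier(struct notifier_block *nb, unsigned long mode,
                            void *data)
{
        switch (mode) {
        case PM_HIBERNATION_PREPARE:
                /* Tasks are not frozen yet; only record state here. */
                in_hibernate = true;
                break;
        case PM_POST_HIBERNATION:
                in_hibernate = false;
                break;
        case PM_SUSPEND_PREPARE:
                in_suspend_like = true;
                break;
        case PM_POST_SUSPEND:
                in_suspend_like = false;
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block demo_pm_nb = {
        .notifier_call = demo_pm_notifier,
};

static int __init demo_init(void)
{
        return register_pm_notifier(&demo_pm_nb);
}

static void __exit demo_exit(void)
{
        unregister_pm_notifier(&demo_pm_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The PM_SUSPEND_PREPARE / PM_HIBERNATION_PREPARE notifications are delivered
before user space is frozen, which is why evicting VRAM from the notifier
(as the old code did) can race with applications that are still submitting
work; deferring the actual eviction to the pm prepare callback, which runs
after the freezer, avoids that.
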
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 25 +++++++++++-----------
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 22 ++-----------------
> 2 files changed, 15 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 71d95f16067a4..d232e4a26d7bf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4974,28 +4974,29 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
> * @data: data
> *
> * This function is called when the system is about to suspend or hibernate.
> - * It is used to evict resources from the device before the system goes to
> - * sleep while there is still access to swap.
> + * It is used to set the appropriate flags so that eviction can be optimized
> + * in the pm prepare callback.
> */
> static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
> void *data)
> {
> struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
> - int r;
>
> switch (mode) {
> case PM_HIBERNATION_PREPARE:
> adev->in_s4 = true;
> - fallthrough;
> + break;
> + case PM_POST_HIBERNATION:
> + adev->in_s4 = false;
> + break;
> case PM_SUSPEND_PREPARE:
> - r = amdgpu_device_evict_resources(adev);
> - /*
> - * This is considered non-fatal at this time because
> - * amdgpu_device_prepare() will also fatally evict resources.
> - * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
> - */
> - if (r)
> - drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
> + if (amdgpu_acpi_is_s0ix_active(adev))
> + adev->in_s0ix = true;
> + else if (amdgpu_acpi_is_s3_active(adev))
> + adev->in_s3 = true;
> + break;
> + case PM_POST_SUSPEND:
> + adev->in_s0ix = adev->in_s3 = false;
> break;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index cec041a556013..6599fb6313220 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2572,10 +2572,6 @@ static int amdgpu_pmops_suspend(struct device *dev)
> struct drm_device *drm_dev = dev_get_drvdata(dev);
> struct amdgpu_device *adev = drm_to_adev(drm_dev);
>
> - if (amdgpu_acpi_is_s0ix_active(adev))
> - adev->in_s0ix = true;
> - else if (amdgpu_acpi_is_s3_active(adev))
> - adev->in_s3 = true;
> if (!adev->in_s0ix && !adev->in_s3) {
> /* don't allow going deep first time followed by s2idle the next time */
> if (adev->last_suspend_state != PM_SUSPEND_ON &&
> @@ -2608,7 +2604,6 @@ static int amdgpu_pmops_resume(struct device *dev)
> {
> struct drm_device *drm_dev = dev_get_drvdata(dev);
> struct amdgpu_device *adev = drm_to_adev(drm_dev);
> - int r;
>
> if (!adev->in_s0ix && !adev->in_s3)
> return 0;
> @@ -2617,12 +2612,7 @@ static int amdgpu_pmops_resume(struct device *dev)
> if (!pci_device_is_present(adev->pdev))
> adev->no_hw_access = true;
>
> - r = amdgpu_device_resume(drm_dev, true);
> - if (amdgpu_acpi_is_s0ix_active(adev))
> - adev->in_s0ix = false;
> - else
> - adev->in_s3 = false;
> - return r;
> + return amdgpu_device_resume(drm_dev, true);
> }
>
> static int amdgpu_pmops_freeze(struct device *dev)
> @@ -2643,13 +2633,8 @@ static int amdgpu_pmops_freeze(struct device *dev)
> static int amdgpu_pmops_thaw(struct device *dev)
> {
> struct drm_device *drm_dev = dev_get_drvdata(dev);
> - struct amdgpu_device *adev = drm_to_adev(drm_dev);
> - int r;
>
> - r = amdgpu_device_resume(drm_dev, true);
> - adev->in_s4 = false;
> -
> - return r;
> + return amdgpu_device_resume(drm_dev, true);
> }
>
> static int amdgpu_pmops_poweroff(struct device *dev)
> @@ -2662,9 +2647,6 @@ static int amdgpu_pmops_poweroff(struct device *dev)
> static int amdgpu_pmops_restore(struct device *dev)
> {
> struct drm_device *drm_dev = dev_get_drvdata(dev);
> - struct amdgpu_device *adev = drm_to_adev(drm_dev);
> -
> - adev->in_s4 = false;
>
> return amdgpu_device_resume(drm_dev, true);
> }