[PATCH v5 2/2] drm/amd: Add Suspend/Hibernate notification callback support
Lazar, Lijo
lijo.lazar at amd.com
Thu Nov 28 04:48:08 UTC 2024
On 11/28/2024 8:56 AM, Mario Limonciello wrote:
> From: Mario Limonciello <mario.limonciello at amd.com>
>
> As part of the suspend sequence VRAM needs to be evicted on dGPUs.
> In order to make suspend/resume more reliable we moved this into
> the pmops prepare() callback so that the suspend sequence would fail
> but the system could remain operational under high memory usage suspend.
>
> Another class of issues exist though where due to memory fragementation
> there isn't a large enough contiguous space and swap isn't accessible.
>
> Add support for a suspend/hibernate notification callback that could
> evict VRAM before tasks are frozen. This should allow paging out to swap
> if necessary.
>
> Link: https://github.com/ROCm/ROCK-Kernel-Driver/issues/174
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3476
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2362
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3781
> Signed-off-by: Mario Limonciello <mario.limonciello at amd.com>
Series is -
Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>
Thanks,
Lijo
> ---
> v5:
> * Keep s4 set all the way through prepare.
> v4:
> * Make non fatal, drop patch 3
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 +++++++++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 -
> 3 files changed, 46 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d8bc6da500161..79ec4ab8ecfc5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -879,6 +879,7 @@ struct amdgpu_device {
> bool need_swiotlb;
> bool accel_working;
> struct notifier_block acpi_nb;
> + struct notifier_block pm_nb;
> struct amdgpu_i2c_chan *i2c_bus[AMDGPU_MAX_I2C_BUS];
> struct debugfs_blob_wrapper debugfs_vbios_blob;
> struct debugfs_blob_wrapper debugfs_discovery_blob;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 996e9c78384dd..dc6abeba61948 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -191,6 +191,8 @@ void amdgpu_set_init_level(struct amdgpu_device *adev,
> }
>
> static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
> +static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
> + void *data);
>
> /**
> * DOC: pcie_replay_count
> @@ -4553,6 +4555,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>
> amdgpu_device_check_iommu_direct_map(adev);
>
> + adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
> + r = register_pm_notifier(&adev->pm_nb);
> + if (r)
> + goto failed;
> +
> return 0;
>
> release_ras_con:
> @@ -4617,6 +4624,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
> drain_workqueue(adev->mman.bdev.wq);
> adev->shutdown = true;
>
> + unregister_pm_notifier(&adev->pm_nb);
> +
> /* make sure IB test finished before entering exclusive mode
> * to avoid preemption on IB test
> */
> @@ -4748,6 +4757,41 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
> /*
> * Suspend & resume.
> */
> +/**
> + * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
> + * @nb: notifier block
> + * @mode: suspend mode
> + * @data: data
> + *
> + * This function is called when the system is about to suspend or hibernate.
> + * It is used to evict resources from the device before the system goes to
> + * sleep while there is still access to swap.
> + */
> +static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
> + void *data)
> +{
> + struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
> + int r;
> +
> + switch (mode) {
> + case PM_HIBERNATION_PREPARE:
> + adev->in_s4 = true;
> + fallthrough;
> + case PM_SUSPEND_PREPARE:
> + r = amdgpu_device_evict_resources(adev);
> + /*
> + * This is considered non-fatal at this time because
> + * amdgpu_device_prepare() will also fatally evict resources.
> + * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
> + */
> + if (r)
> + drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
> + break;
> + }
> +
> + return NOTIFY_DONE;
> +}
> +
> /**
> * amdgpu_device_prepare - prepare for device suspend
> *
> @@ -4787,7 +4831,7 @@ int amdgpu_device_prepare(struct drm_device *dev)
> return 0;
>
> unprepare:
> - adev->in_s0ix = adev->in_s3 = false;
> + adev->in_s0ix = adev->in_s3 = adev->in_s4 = false;
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 38686203bea63..5e5ab0b2a7196 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2552,7 +2552,6 @@ static int amdgpu_pmops_freeze(struct device *dev)
> struct amdgpu_device *adev = drm_to_adev(drm_dev);
> int r;
>
> - adev->in_s4 = true;
> r = amdgpu_device_suspend(drm_dev, true);
> adev->in_s4 = false;
> if (r)
More information about the amd-gfx
mailing list