[PATCH 2/2] drm/amdgpu: fix pm notifier handling

Fri May 2 19:16:13 UTC 2025

On Fri, May 2, 2025 at 12:14 PM Mario Limonciello
<mario.limonciello at amd.com> wrote:
>
> On 5/2/2025 8:13 AM, Alex Deucher wrote:
> > On Fri, May 2, 2025 at 8:56 AM Alex Deucher <alexdeucher at gmail.com> wrote:
> >>
> >> On Thu, May 1, 2025 at 5:19 PM Mario Limonciello
> >> <mario.limonciello at amd.com> wrote:
> >>>
> >>> On 5/1/2025 3:09 PM, Alex Deucher wrote:
> >>>> Set the s3/s0ix and s4 flags in the pm notifier so that we can skip
> >>>> the resource evictions properly in pm prepare based on whether
> >>>> we are suspending or hibernating.  Drop the eviction as processes
> >>>> are not frozen at this time, we we can end up getting stuck trying
> >>>> to evict VRAM while applications continue to submit work which
> >>>> causes the buffers to get pulled back into VRAM.
> >>>>
> >>>> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4178
> >>>> Fixes: 2965e6355dcd ("drm/amd: Add Suspend/Hibernate notification callback support")
> >>>> Cc: Mario Limonciello <mario.limonciello at amd.com>
> >>>> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> >>>
> >>> Rather than removing the callbacks (and thus introducing the "other"
> >>> bugs with memory pressure), I've sent an alternative series.  LMK what
> >>> you think of this instead.
> >>>
> >>> https://lore.kernel.org/amd-gfx/20250501211734.2434369-1-superm1@kernel.org/T/#m6cdc075af911868667ea6fc849bcd196477d6c20
> >>
> >> Series looks good to me, but I think a variant of this patch (with the
> >> evictions still in place) still makes sense so that we can properly
> >> set the suspend and hibernate flags in amdgpu so that we can optimize
> >> the evictions for suspend on APUs.  Will respin.
> >
> > I think this set still makes sense for stable, but then if your patch
> > series lands, we can apply the attached patch on top of that.
>
> OK.  Let me re-look through your series once more and I'll leave
> comments or a tag.

Thanks.

>
> Regarding the patch to re-introduced it you attached, I suppose the
> drm_warn() message doesn't need to talk about freezing processes as a
> solution to eviction failure because freezing is implied with the new
> notifier in use.

Will fix.

Thanks

Alex

>
> >
> > Alex
> >
> >>
> >> Alex
> >>
> >>>
> >>>> ---
> >>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 25 +++++++++++-----------
> >>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 22 ++-----------------
> >>>>    2 files changed, 15 insertions(+), 32 deletions(-)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>> index 71d95f16067a4..d232e4a26d7bf 100644
> >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>> @@ -4974,28 +4974,29 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
> >>>>     * @data: data
> >>>>     *
> >>>>     * This function is called when the system is about to suspend or hibernate.
> >>>> - * It is used to evict resources from the device before the system goes to
> >>>> - * sleep while there is still access to swap.
> >>>> + * It is used to set the appropriate flags so that eviction can be optimized
> >>>> + * in the pm prepare callback.
> >>>>     */
> >>>>    static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
> >>>>                                     void *data)
> >>>>    {
> >>>>        struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
> >>>> -     int r;
> >>>>
> >>>>        switch (mode) {
> >>>>        case PM_HIBERNATION_PREPARE:
> >>>>                adev->in_s4 = true;
> >>>> -             fallthrough;
> >>>> +             break;
> >>>> +     case PM_POST_HIBERNATION:
> >>>> +             adev->in_s4 = false;
> >>>> +             break;
> >>>>        case PM_SUSPEND_PREPARE:
> >>>> -             r = amdgpu_device_evict_resources(adev);
> >>>> -             /*
> >>>> -              * This is considered non-fatal at this time because
> >>>> -              * amdgpu_device_prepare() will also fatally evict resources.
> >>>> -              * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
> >>>> -              */
> >>>> -             if (r)
> >>>> -                     drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
> >>>> +             if (amdgpu_acpi_is_s0ix_active(adev))
> >>>> +                     adev->in_s0ix = true;
> >>>> +             else if (amdgpu_acpi_is_s3_active(adev))
> >>>> +                     adev->in_s3 = true;
> >>>> +             break;
> >>>> +     case PM_POST_SUSPEND:
> >>>> +             adev->in_s0ix = adev->in_s3 = false;
> >>>>                break;
> >>>>        }
> >>>>
> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>>> index cec041a556013..6599fb6313220 100644
> >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>>> @@ -2572,10 +2572,6 @@ static int amdgpu_pmops_suspend(struct device *dev)
> >>>>        struct drm_device *drm_dev = dev_get_drvdata(dev);
> >>>>        struct amdgpu_device *adev = drm_to_adev(drm_dev);
> >>>>
> >>>> -     if (amdgpu_acpi_is_s0ix_active(adev))
> >>>> -             adev->in_s0ix = true;
> >>>> -     else if (amdgpu_acpi_is_s3_active(adev))
> >>>> -             adev->in_s3 = true;
> >>>>        if (!adev->in_s0ix && !adev->in_s3) {
> >>>>                /* don't allow going deep first time followed by s2idle the next time */
> >>>>                if (adev->last_suspend_state != PM_SUSPEND_ON &&
> >>>> @@ -2608,7 +2604,6 @@ static int amdgpu_pmops_resume(struct device *dev)
> >>>>    {
> >>>>        struct drm_device *drm_dev = dev_get_drvdata(dev);
> >>>>        struct amdgpu_device *adev = drm_to_adev(drm_dev);
> >>>> -     int r;
> >>>>
> >>>>        if (!adev->in_s0ix && !adev->in_s3)
> >>>>                return 0;
> >>>> @@ -2617,12 +2612,7 @@ static int amdgpu_pmops_resume(struct device *dev)
> >>>>        if (!pci_device_is_present(adev->pdev))
> >>>>                adev->no_hw_access = true;
> >>>>
> >>>> -     r = amdgpu_device_resume(drm_dev, true);
> >>>> -     if (amdgpu_acpi_is_s0ix_active(adev))
> >>>> -             adev->in_s0ix = false;
> >>>> -     else
> >>>> -             adev->in_s3 = false;
> >>>> -     return r;
> >>>> +     return amdgpu_device_resume(drm_dev, true);
> >>>>    }
> >>>>
> >>>>    static int amdgpu_pmops_freeze(struct device *dev)
> >>>> @@ -2643,13 +2633,8 @@ static int amdgpu_pmops_freeze(struct device *dev)
> >>>>    static int amdgpu_pmops_thaw(struct device *dev)
> >>>>    {
> >>>>        struct drm_device *drm_dev = dev_get_drvdata(dev);
> >>>> -     struct amdgpu_device *adev = drm_to_adev(drm_dev);
> >>>> -     int r;
> >>>>
> >>>> -     r = amdgpu_device_resume(drm_dev, true);
> >>>> -     adev->in_s4 = false;
> >>>> -
> >>>> -     return r;
> >>>> +     return amdgpu_device_resume(drm_dev, true);
> >>>>    }
> >>>>
> >>>>    static int amdgpu_pmops_poweroff(struct device *dev)
> >>>> @@ -2662,9 +2647,6 @@ static int amdgpu_pmops_poweroff(struct device *dev)
> >>>>    static int amdgpu_pmops_restore(struct device *dev)
> >>>>    {
> >>>>        struct drm_device *drm_dev = dev_get_drvdata(dev);
> >>>> -     struct amdgpu_device *adev = drm_to_adev(drm_dev);
> >>>> -
> >>>> -     adev->in_s4 = false;
> >>>>
> >>>>        return amdgpu_device_resume(drm_dev, true);
> >>>>    }
> >>>
>