[PATCH 2/2] drm/amdgpu: fix pm notifier handling

Fri May 2 16:13:55 UTC 2025

On 5/2/2025 8:13 AM, Alex Deucher wrote:
> On Fri, May 2, 2025 at 8:56 AM Alex Deucher <alexdeucher at gmail.com> wrote:
>>
>> On Thu, May 1, 2025 at 5:19 PM Mario Limonciello
>> <mario.limonciello at amd.com> wrote:
>>>
>>> On 5/1/2025 3:09 PM, Alex Deucher wrote:
>>>> Set the s3/s0ix and s4 flags in the pm notifier so that we can skip
>>>> the resource evictions properly in pm prepare based on whether
>>>> we are suspending or hibernating.  Drop the eviction as processes
>>>> are not frozen at this time, we we can end up getting stuck trying
>>>> to evict VRAM while applications continue to submit work which
>>>> causes the buffers to get pulled back into VRAM.
>>>>
>>>> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4178
>>>> Fixes: 2965e6355dcd ("drm/amd: Add Suspend/Hibernate notification callback support")
>>>> Cc: Mario Limonciello <mario.limonciello at amd.com>
>>>> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
>>>
>>> Rather than removing the callbacks (and thus introducing the "other"
>>> bugs with memory pressure), I've sent an alternative series.  LMK what
>>> you think of this instead.
>>>
>>> https://lore.kernel.org/amd-gfx/20250501211734.2434369-1-superm1@kernel.org/T/#m6cdc075af911868667ea6fc849bcd196477d6c20
>>
>> Series looks good to me, but I think a variant of this patch (with the
>> evictions still in place) still makes sense so that we can properly
>> set the suspend and hibernate flags in amdgpu so that we can optimize
>> the evictions for suspend on APUs.  Will respin.
> 
> I think this set still makes sense for stable, but then if your patch
> series lands, we can apply the attached patch on top of that.

OK.  Let me re-look through your series once more and I'll leave 
comments or a tag.

Regarding the patch to re-introduced it you attached, I suppose the 
drm_warn() message doesn't need to talk about freezing processes as a 
solution to eviction failure because freezing is implied with the new 
notifier in use.

> 
> Alex
> 
>>
>> Alex
>>
>>>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 25 +++++++++++-----------
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 22 ++-----------------
>>>>    2 files changed, 15 insertions(+), 32 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 71d95f16067a4..d232e4a26d7bf 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -4974,28 +4974,29 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
>>>>     * @data: data
>>>>     *
>>>>     * This function is called when the system is about to suspend or hibernate.
>>>> - * It is used to evict resources from the device before the system goes to
>>>> - * sleep while there is still access to swap.
>>>> + * It is used to set the appropriate flags so that eviction can be optimized
>>>> + * in the pm prepare callback.
>>>>     */
>>>>    static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
>>>>                                     void *data)
>>>>    {
>>>>        struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
>>>> -     int r;
>>>>
>>>>        switch (mode) {
>>>>        case PM_HIBERNATION_PREPARE:
>>>>                adev->in_s4 = true;
>>>> -             fallthrough;
>>>> +             break;
>>>> +     case PM_POST_HIBERNATION:
>>>> +             adev->in_s4 = false;
>>>> +             break;
>>>>        case PM_SUSPEND_PREPARE:
>>>> -             r = amdgpu_device_evict_resources(adev);
>>>> -             /*
>>>> -              * This is considered non-fatal at this time because
>>>> -              * amdgpu_device_prepare() will also fatally evict resources.
>>>> -              * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
>>>> -              */
>>>> -             if (r)
>>>> -                     drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
>>>> +             if (amdgpu_acpi_is_s0ix_active(adev))
>>>> +                     adev->in_s0ix = true;
>>>> +             else if (amdgpu_acpi_is_s3_active(adev))
>>>> +                     adev->in_s3 = true;
>>>> +             break;
>>>> +     case PM_POST_SUSPEND:
>>>> +             adev->in_s0ix = adev->in_s3 = false;
>>>>                break;
>>>>        }
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index cec041a556013..6599fb6313220 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -2572,10 +2572,6 @@ static int amdgpu_pmops_suspend(struct device *dev)
>>>>        struct drm_device *drm_dev = dev_get_drvdata(dev);
>>>>        struct amdgpu_device *adev = drm_to_adev(drm_dev);
>>>>
>>>> -     if (amdgpu_acpi_is_s0ix_active(adev))
>>>> -             adev->in_s0ix = true;
>>>> -     else if (amdgpu_acpi_is_s3_active(adev))
>>>> -             adev->in_s3 = true;
>>>>        if (!adev->in_s0ix && !adev->in_s3) {
>>>>                /* don't allow going deep first time followed by s2idle the next time */
>>>>                if (adev->last_suspend_state != PM_SUSPEND_ON &&
>>>> @@ -2608,7 +2604,6 @@ static int amdgpu_pmops_resume(struct device *dev)
>>>>    {
>>>>        struct drm_device *drm_dev = dev_get_drvdata(dev);
>>>>        struct amdgpu_device *adev = drm_to_adev(drm_dev);
>>>> -     int r;
>>>>
>>>>        if (!adev->in_s0ix && !adev->in_s3)
>>>>                return 0;
>>>> @@ -2617,12 +2612,7 @@ static int amdgpu_pmops_resume(struct device *dev)
>>>>        if (!pci_device_is_present(adev->pdev))
>>>>                adev->no_hw_access = true;
>>>>
>>>> -     r = amdgpu_device_resume(drm_dev, true);
>>>> -     if (amdgpu_acpi_is_s0ix_active(adev))
>>>> -             adev->in_s0ix = false;
>>>> -     else
>>>> -             adev->in_s3 = false;
>>>> -     return r;
>>>> +     return amdgpu_device_resume(drm_dev, true);
>>>>    }
>>>>
>>>>    static int amdgpu_pmops_freeze(struct device *dev)
>>>> @@ -2643,13 +2633,8 @@ static int amdgpu_pmops_freeze(struct device *dev)
>>>>    static int amdgpu_pmops_thaw(struct device *dev)
>>>>    {
>>>>        struct drm_device *drm_dev = dev_get_drvdata(dev);
>>>> -     struct amdgpu_device *adev = drm_to_adev(drm_dev);
>>>> -     int r;
>>>>
>>>> -     r = amdgpu_device_resume(drm_dev, true);
>>>> -     adev->in_s4 = false;
>>>> -
>>>> -     return r;
>>>> +     return amdgpu_device_resume(drm_dev, true);
>>>>    }
>>>>
>>>>    static int amdgpu_pmops_poweroff(struct device *dev)
>>>> @@ -2662,9 +2647,6 @@ static int amdgpu_pmops_poweroff(struct device *dev)
>>>>    static int amdgpu_pmops_restore(struct device *dev)
>>>>    {
>>>>        struct drm_device *drm_dev = dev_get_drvdata(dev);
>>>> -     struct amdgpu_device *adev = drm_to_adev(drm_dev);
>>>> -
>>>> -     adev->in_s4 = false;
>>>>
>>>>        return amdgpu_device_resume(drm_dev, true);
>>>>    }
>>>