[PATCH v3 2/2] drm/amdgpu: clean up the suspend_complete

Lazar, Lijo lijo.lazar at amd.com
Fri Oct 25 04:48:59 UTC 2024



On 10/25/2024 8:54 AM, Liang, Prike wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
> 
>> From: Lazar, Lijo <Lijo.Lazar at amd.com>
>> Sent: Thursday, October 24, 2024 4:40 PM
>> To: Liang, Prike <Prike.Liang at amd.com>; amd-gfx at lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>
>> Subject: Re: [PATCH v3 2/2] drm/amdgpu: clean up the suspend_complete
>>
>>
>>
>> On 10/24/2024 1:51 PM, Prike Liang wrote:
>>> To check the status of S3 suspend completion, use the PM core
>>> pm_suspend_global_flags bit(1) to detect S3 abort events. Therefore,
>>> clean up the AMDGPU driver's private flag suspend_complete.
>>>
>>> Signed-off-by: Prike Liang <Prike.Liang at amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  2 --
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  2 --
>>>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  4 ++--
>>>  drivers/gpu/drm/amd/amdgpu/soc15.c      |  7 ++-----
>>>  drivers/gpu/drm/amd/amdgpu/soc21.c      | 16 ++++++----------
>>>  5 files changed, 10 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 48c9b9b06905..9b35763ae0a7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1111,8 +1111,6 @@ struct amdgpu_device {
>>>     bool                            in_s3;
>>>     bool                            in_s4;
>>>     bool                            in_s0ix;
>>> -   /* indicate amdgpu suspension status */
>>> -   bool                            suspend_complete;
>>>
>>>     enum pp_mp1_state               mp1_state;
>>>     struct amdgpu_doorbell_index doorbell_index; diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 680e44fdee6e..78972151b970 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -2501,7 +2501,6 @@ static int amdgpu_pmops_suspend(struct device *dev)
>>>     struct drm_device *drm_dev = dev_get_drvdata(dev);
>>>     struct amdgpu_device *adev = drm_to_adev(drm_dev);
>>>
>>> -   adev->suspend_complete = false;
>>>     if (amdgpu_acpi_is_s0ix_active(adev))
>>>             adev->in_s0ix = true;
>>>     else if (amdgpu_acpi_is_s3_active(adev)) @@ -2516,7 +2515,6 @@
>>> static int amdgpu_pmops_suspend_noirq(struct device *dev)
>>>     struct drm_device *drm_dev = dev_get_drvdata(dev);
>>>     struct amdgpu_device *adev = drm_to_adev(drm_dev);
>>>
>>> -   adev->suspend_complete = true;
>>>     if (amdgpu_acpi_should_gpu_reset(adev))
>>>             return amdgpu_asic_reset(adev);
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index b4c4b9916289..6ffcee3008ba 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -3276,8 +3276,8 @@ static int gfx_v9_0_cp_gfx_start(struct amdgpu_device
>> *adev)
>>>      * confirmed that the APU gfx10/gfx11 needn't such update.
>>>      */
>>>     if (adev->flags & AMD_IS_APU &&
>>> -                   adev->in_s3 && !adev->suspend_complete) {
>>> -           DRM_INFO(" Will skip the CSB packet resubmit\n");
>>> +                   adev->in_s3 && !pm_resume_via_firmware()) {
>>> +           DRM_INFO("Will skip the CSB packet resubmit\n");
>>>             return 0;
>>>     }
>>>     r = amdgpu_ring_alloc(ring, gfx_v9_0_get_csb_size(adev) + 4 + 3);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> b/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> index 12ff6cf568dc..d9d11131a744 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> @@ -584,13 +584,10 @@ static bool soc15_need_reset_on_resume(struct
>> amdgpu_device *adev)
>>>      *    performing pm core test.
>>>      */
>>>     if (adev->flags & AMD_IS_APU && adev->in_s3 &&
>>> -                   !pm_resume_via_firmware()) {
>>> -           adev->suspend_complete = false;
>>> +                   !pm_resume_via_firmware())
>>>             return true;
>>> -   } else {
>>> -           adev->suspend_complete = true;
>>> +   else
>>>             return false;
>>> -   }
>>>  }
>>>
>>>  static int soc15_asic_reset(struct amdgpu_device *adev) diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/soc21.c
>>> b/drivers/gpu/drm/amd/amdgpu/soc21.c
>>> index c4b950e75133..8d5844d7a10f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/soc21.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
>>> @@ -897,22 +897,18 @@ static int soc21_common_suspend(struct
>>> amdgpu_ip_block *ip_block)
>>>
>>>  static bool soc21_need_reset_on_resume(struct amdgpu_device *adev)  {
>>> -   u32 sol_reg1, sol_reg2;
>>> +   u32 sol_reg;
>>>
>>>     /* Will reset for the following suspend abort cases.
>>>      * 1) Only reset dGPU side.
>>>      * 2) S3 suspend got aborted and TOS is active.
>>>      */
>>> -   if (!(adev->flags & AMD_IS_APU) && adev->in_s3 &&
>>> -       !adev->suspend_complete) {
>>> -           sol_reg1 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81);
>>> -           msleep(100);
>>> -           sol_reg2 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81);
>>> -
>>> -           return (sol_reg1 != sol_reg2);
>>> -   }
>>>
>>> -   return false;
>>> +   sol_reg = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81);
>>
>> Actually, two samples are taken intentionally to make sure that FW is not hung
>> before deciding on reset.
>>
> However, the original two samples cannot filter out the case where the dGPU is completely suspended. When the dGPU is in competition to suspend, the SOL remains at zero until the PSP resumes. Therefore, the original two SOL samples checking will wrongly reset the case of the dGPU being completely suspended. Personally, one SOL check is sufficient to determine if the dGPU device has been completely suspended.
> 

If the dGPU is completely suspended (D3), then those two samples will be
0. In that case, there won't be any reset (sol_reg1 != sol_reg2).

If it's ticking, then SOS is active - then a reset is possible. If it's
non-zero, but not ticking, FW is hung. There is no point in trying a
reset as FW needs to be active to do so. That case will eventually end
up in a different failure.

Thanks,
Lijo

> Thanks,
> Prike
> 
>> Thanks,
>> Lijo
>>
>>> +   if (!(adev->flags & AMD_IS_APU) && sol_reg)
>>> +           return true;
>>> +   else
>>> +           return false;
>>>  }
>>>
>>>  static int soc21_common_resume(struct amdgpu_ip_block *ip_block)


More information about the amd-gfx mailing list