[PATCH] drm/amdgpu: Fix crash on device remove/driver unload

Lazar, Lijo lijo.lazar at amd.com
Thu Sep 16 15:51:52 UTC 2021



On 9/16/2021 9:15 PM, Andrey Grodzovsky wrote:
> 
> On 2021-09-16 4:20 a.m., Lazar, Lijo wrote:
>> A minor comment below.
>>
>> On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:
>>> Crash:
>>> BUG: unable to handle page fault for address: 00000000000010e1
>>> RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
>>> Call Trace:
>>> pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
>>> amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
>>> amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
>>> vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
>>> amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
>>> amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
>>> amdgpu_pci_remove+0x27/0x40 [amdgpu]
>>> pci_device_remove+0x3e/0xb0
>>> device_release_driver_internal+0x103/0x1d0
>>> device_release_driver+0x12/0x20
>>> pci_stop_bus_device+0x79/0xa0
>>> pci_stop_and_remove_bus_device_locked+0x1b/0x30
>>> remove_store+0x7b/0x90
>>> dev_attr_store+0x17/0x30
>>> sysfs_kf_write+0x4b/0x60
>>> kernfs_fop_write_iter+0x151/0x1e0
>>>
>>> Why:
>>> VCE/UVD had dependency on SMC block for their suspend but
>>> SMC block is the first to do HW fini due to some constraints
>>>
>>> How:
>>> Since the original patch was dealing with suspend issues
>>> move the SMC block dependency back into suspend hooks as
>>> was done in V1 of the original patches.
>>> Keep flushing idle work both in suspend and HW fini seuqnces
>>> since it's essential in both cases.
>>>
>>> Fixes:
>>> 2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on 
>>> UVD/VCE suspend
>>> ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE 
>>> on suspend
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------
>>>   drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------
>>>   drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------
>>>   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------
>>>   drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----
>>>   drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------
>>>   drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++++++++++++++-------------
>>>   7 files changed, 105 insertions(+), 90 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c 
>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>> index 7232241e3bfb..0fef925b6602 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>> @@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>>> +
>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>> +        uvd_v3_1_stop(adev);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int uvd_v3_1_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>> -        uvd_v3_1_stop(adev);
>>> -
>>> -    return 0;
>>> -}
>>> -
>>> -static int uvd_v3_1_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = uvd_v3_1_hw_fini(adev);
>>
>> "cosmetic change" comment - hw_fini is supposed to be the final tear 
>> down call. So instead of suspend calling hw_fini, perhaps it makes 
>> sense to read the other way - hw_fini just suspends the hardware?
>>
>> Thanks,
>> Lijo
> 
> 
> Not sure what you mean ?

Now it is - suspend()-> calls hw_fini()

What I meant is hw_fini() -> calls suspend() and that is read as "to do 
hw_fini() only suspend the hardware and nothing extra is needed".

In short implementation stays in suspend() and hw_fini() calls suspend().

Thanks,
Lijo

> 
> Andrey
> 
> 
>>
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c 
>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>> index 52d6de969f46..c108b8381795 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>> @@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>>> +
>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>> +        uvd_v4_2_stop(adev);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int uvd_v4_2_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>> -        uvd_v4_2_stop(adev);
>>> -
>>> -    return 0;
>>> -}
>>> -
>>> -static int uvd_v4_2_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = uvd_v4_2_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>> index db6d06758e4d..563493d1f830 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>> @@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>>> +
>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>> +        uvd_v5_0_stop(adev);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int uvd_v5_0_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>> -        uvd_v5_0_stop(adev);
>>> -
>>> -    return 0;
>>> -}
>>> -
>>> -static int uvd_v5_0_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = uvd_v5_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>> index b6e82d75561f..1fd9ca21a091 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>> @@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>>> +
>>> +    if (!amdgpu_sriov_vf(adev))
>>> +        uvd_v7_0_stop(adev);
>>> +    else {
>>> +        /* full access mode, so don't touch any UVD register */
>>> +        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int uvd_v7_0_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    if (!amdgpu_sriov_vf(adev))
>>> -        uvd_v7_0_stop(adev);
>>> -    else {
>>> -        /* full access mode, so don't touch any UVD register */
>>> -        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>>> -    }
>>> -
>>> -    return 0;
>>> -}
>>> -
>>> -static int uvd_v7_0_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = uvd_v7_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>> index 84e488f189f5..67eb01fef789 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>> @@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->vce.idle_work);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int vce_v2_0_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    return 0;
>>> -}
>>> -
>>> -static int vce_v2_0_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = vce_v2_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>> index 2a18c1e089fd..142e291983b4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>> @@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
>>>       int r;
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->vce.idle_work);
>>> +
>>> +    r = vce_v3_0_wait_for_idle(handle);
>>> +    if (r)
>>> +        return r;
>>> +
>>> +    vce_v3_0_stop(adev);
>>> +    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>>> +}
>>> +
>>> +static int vce_v3_0_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    r = vce_v3_0_wait_for_idle(handle);
>>> -    if (r)
>>> -        return r;
>>> -
>>> -    vce_v3_0_stop(adev);
>>> -    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>>> -}
>>> -
>>> -static int vce_v3_0_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = vce_v3_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>> index 044cf9d74b85..226b79254db8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>> @@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   -    /*
>>> -     * Proper cleanups before halting the HW engine:
>>> -     *   - cancel the delayed idle work
>>> -     *   - enable powergating
>>> -     *   - enable clockgating
>>> -     *   - disable dpm
>>> -     *
>>> -     * TODO: to align with the VCN implementation, move the
>>> -     * jobs for clockgating/powergating/dpm setting to
>>> -     * ->set_powergating_state().
>>> -     */
>>>       cancel_delayed_work_sync(&adev->vce.idle_work);
>>>   -    if (adev->pm.dpm_enabled) {
>>> -        amdgpu_dpm_enable_vce(adev, false);
>>> -    } else {
>>> -        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>>> -        amdgpu_device_ip_set_powergating_state(adev, 
>>> AMD_IP_BLOCK_TYPE_VCE,
>>> -                               AMD_PG_STATE_GATE);
>>> -        amdgpu_device_ip_set_clockgating_state(adev, 
>>> AMD_IP_BLOCK_TYPE_VCE,
>>> -                               AMD_CG_STATE_GATE);
>>> -    }
>>> -
>>>       if (!amdgpu_sriov_vf(adev)) {
>>>           /* vce_v4_0_wait_for_idle(handle); */
>>>           vce_v4_0_stop(adev);
>>> @@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
>>>           drm_dev_exit(idx);
>>>       }
>>>   +    /*
>>> +     * Proper cleanups before halting the HW engine:
>>> +     *   - cancel the delayed idle work
>>> +     *   - enable powergating
>>> +     *   - enable clockgating
>>> +     *   - disable dpm
>>> +     *
>>> +     * TODO: to align with the VCN implementation, move the
>>> +     * jobs for clockgating/powergating/dpm setting to
>>> +     * ->set_powergating_state().
>>> +     */
>>> +    cancel_delayed_work_sync(&adev->vce.idle_work);
>>> +
>>> +    if (adev->pm.dpm_enabled) {
>>> +        amdgpu_dpm_enable_vce(adev, false);
>>> +    } else {
>>> +        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>>> +        amdgpu_device_ip_set_powergating_state(adev, 
>>> AMD_IP_BLOCK_TYPE_VCE,
>>> +                               AMD_PG_STATE_GATE);
>>> +        amdgpu_device_ip_set_clockgating_state(adev, 
>>> AMD_IP_BLOCK_TYPE_VCE,
>>> +                               AMD_CG_STATE_GATE);
>>> +    }
>>> +
>>>       r = vce_v4_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>>


More information about the amd-gfx mailing list