[PATCH] drm/amdgpu: Fix crash on device remove/driver unload
Andrey Grodzovsky
andrey.grodzovsky at amd.com
Thu Sep 16 15:45:36 UTC 2021
On 2021-09-16 4:20 a.m., Lazar, Lijo wrote:
> A minor comment below.
>
> On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:
>> Crash:
>> BUG: unable to handle page fault for address: 00000000000010e1
>> RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
>> Call Trace:
>> pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
>> amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
>> amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
>> vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
>> amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
>> amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
>> amdgpu_pci_remove+0x27/0x40 [amdgpu]
>> pci_device_remove+0x3e/0xb0
>> device_release_driver_internal+0x103/0x1d0
>> device_release_driver+0x12/0x20
>> pci_stop_bus_device+0x79/0xa0
>> pci_stop_and_remove_bus_device_locked+0x1b/0x30
>> remove_store+0x7b/0x90
>> dev_attr_store+0x17/0x30
>> sysfs_kf_write+0x4b/0x60
>> kernfs_fop_write_iter+0x151/0x1e0
>>
>> Why:
>> VCE/UVD had a dependency on the SMC block for their suspend, but the
>> SMC block is the first to do HW fini due to some constraints.
>>
>> How:
>> Since the original patch was dealing with suspend issues, move the
>> SMC block dependency back into the suspend hooks, as was done in V1
>> of the original patches. Keep flushing the idle work in both the
>> suspend and HW fini sequences, since it's essential in both cases.
>> (A distilled sketch of the resulting split follows the diffstat below.)
>>
>> Fixes: 2178d3c189b9 ("drm/amdgpu: add missing cleanups for more ASICs on UVD/VCE suspend")
>> Fixes: ee6679aaa61c ("drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend")
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------
>> drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------
>> drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------
>> drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------
>> drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----
>> drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------
>> drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++++++++++++++-------------
>> 7 files changed, 105 insertions(+), 90 deletions(-)
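For context, here is a distilled sketch (illustrative only, not part of the patch) of the split the hunks below implement, using vce_v4_0 as the representative block. HW fini only flushes the idle work and halts the engine, while the SMC-dependent powergating/clockgating/dpm teardown stays in the suspend hook, which still runs while the SMC block is alive. The VCPU-BO save and SR-IOV details are elided.

    static int vce_v4_0_hw_fini(void *handle)
    {
            struct amdgpu_device *adev = (struct amdgpu_device *)handle;

            /* No SMC access here, so this path stays safe on device remove,
             * where the SMC block has already done its HW fini. */
            cancel_delayed_work_sync(&adev->vce.idle_work);

            if (!amdgpu_sriov_vf(adev)) {
                    /* vce_v4_0_wait_for_idle(handle); */
                    vce_v4_0_stop(adev);
            }
            /* ... SR-IOV path unchanged: don't touch VCE registers ... */

            return 0;
    }

    static int vce_v4_0_suspend(void *handle)
    {
            int r;
            struct amdgpu_device *adev = (struct amdgpu_device *)handle;

            /* ... save the VCPU BO contents under drm_dev_enter() (unchanged) ... */

            /* SMC-dependent cleanup moved back here: suspend runs while the
             * SMC block is still alive, unlike HW fini on device remove. */
            cancel_delayed_work_sync(&adev->vce.idle_work);

            if (adev->pm.dpm_enabled) {
                    amdgpu_dpm_enable_vce(adev, false);
            } else {
                    amdgpu_asic_set_vce_clocks(adev, 0, 0);
                    amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
                                                           AMD_PG_STATE_GATE);
                    amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
                                                           AMD_CG_STATE_GATE);
            }

            r = vce_v4_0_hw_fini(adev);
            if (r)
                    return r;

            /* ... remainder of the original suspend sequence (unchanged) ... */
            return 0;
    }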
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>> index 7232241e3bfb..0fef925b6602 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>> @@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
>> {
>> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> + cancel_delayed_work_sync(&adev->uvd.idle_work);
>> +
>> + if (RREG32(mmUVD_STATUS) != 0)
>> + uvd_v3_1_stop(adev);
>> +
>> + return 0;
>> +}
>> +
>> +static int uvd_v3_1_suspend(void *handle)
>> +{
>> + int r;
>> + struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>> /*
>> * Proper cleanups before halting the HW engine:
>> * - cancel the delayed idle work
>> @@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
>> AMD_CG_STATE_GATE);
>> }
>> - if (RREG32(mmUVD_STATUS) != 0)
>> - uvd_v3_1_stop(adev);
>> -
>> - return 0;
>> -}
>> -
>> -static int uvd_v3_1_suspend(void *handle)
>> -{
>> - int r;
>> - struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>> r = uvd_v3_1_hw_fini(adev);
>
> "cosmetic change" comment - hw_fini is supposed to be the final tear
> down call. So instead of suspend calling hw_fini, perhaps it makes
> sense to read the other way - hw_fini just suspends the hardware?
>
> Thanks,
> Lijo
Not sure what you mean?
Andrey
>
>> if (r)
>> return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>> index 52d6de969f46..c108b8381795 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>> @@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
>> {
>> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> + cancel_delayed_work_sync(&adev->uvd.idle_work);
>> +
>> + if (RREG32(mmUVD_STATUS) != 0)
>> + uvd_v4_2_stop(adev);
>> +
>> + return 0;
>> +}
>> +
>> +static int uvd_v4_2_suspend(void *handle)
>> +{
>> + int r;
>> + struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>> /*
>> * Proper cleanups before halting the HW engine:
>> * - cancel the delayed idle work
>> @@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
>> AMD_CG_STATE_GATE);
>> }
>> - if (RREG32(mmUVD_STATUS) != 0)
>> - uvd_v4_2_stop(adev);
>> -
>> - return 0;
>> -}
>> -
>> -static int uvd_v4_2_suspend(void *handle)
>> -{
>> - int r;
>> - struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>> r = uvd_v4_2_hw_fini(adev);
>> if (r)
>> return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>> index db6d06758e4d..563493d1f830 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>> @@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
>> {
>> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> + cancel_delayed_work_sync(&adev->uvd.idle_work);
>> +
>> + if (RREG32(mmUVD_STATUS) != 0)
>> + uvd_v5_0_stop(adev);
>> +
>> + return 0;
>> +}
>> +
>> +static int uvd_v5_0_suspend(void *handle)
>> +{
>> + int r;
>> + struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>> /*
>> * Proper cleanups before halting the HW engine:
>> * - cancel the delayed idle work
>> @@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
>> AMD_CG_STATE_GATE);
>> }
>> - if (RREG32(mmUVD_STATUS) != 0)
>> - uvd_v5_0_stop(adev);
>> -
>> - return 0;
>> -}
>> -
>> -static int uvd_v5_0_suspend(void *handle)
>> -{
>> - int r;
>> - struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>> r = uvd_v5_0_hw_fini(adev);
>> if (r)
>> return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>> index b6e82d75561f..1fd9ca21a091 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>> @@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
>> {
>> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> + cancel_delayed_work_sync(&adev->uvd.idle_work);
>> +
>> + if (!amdgpu_sriov_vf(adev))
>> + uvd_v7_0_stop(adev);
>> + else {
>> + /* full access mode, so don't touch any UVD register */
>> + DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int uvd_v7_0_suspend(void *handle)
>> +{
>> + int r;
>> + struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>> /*
>> * Proper cleanups before halting the HW engine:
>> * - cancel the delayed idle work
>> @@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
>> AMD_CG_STATE_GATE);
>> }
>> - if (!amdgpu_sriov_vf(adev))
>> - uvd_v7_0_stop(adev);
>> - else {
>> - /* full access mode, so don't touch any UVD register */
>> - DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>> - }
>> -
>> - return 0;
>> -}
>> -
>> -static int uvd_v7_0_suspend(void *handle)
>> -{
>> - int r;
>> - struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>> r = uvd_v7_0_hw_fini(adev);
>> if (r)
>> return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>> index 84e488f189f5..67eb01fef789 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>> @@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)
>> {
>> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> + cancel_delayed_work_sync(&adev->vce.idle_work);
>> +
>> + return 0;
>> +}
>> +
>> +static int vce_v2_0_suspend(void *handle)
>> +{
>> + int r;
>> + struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>> +
>> /*
>> * Proper cleanups before halting the HW engine:
>> * - cancel the delayed idle work
>> @@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
>> AMD_CG_STATE_GATE);
>> }
>> - return 0;
>> -}
>> -
>> -static int vce_v2_0_suspend(void *handle)
>> -{
>> - int r;
>> - struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>> r = vce_v2_0_hw_fini(adev);
>> if (r)
>> return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>> index 2a18c1e089fd..142e291983b4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>> @@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
>> int r;
>> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> + cancel_delayed_work_sync(&adev->vce.idle_work);
>> +
>> + r = vce_v3_0_wait_for_idle(handle);
>> + if (r)
>> + return r;
>> +
>> + vce_v3_0_stop(adev);
>> + return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>> +}
>> +
>> +static int vce_v3_0_suspend(void *handle)
>> +{
>> + int r;
>> + struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>> /*
>> * Proper cleanups before halting the HW engine:
>> * - cancel the delayed idle work
>> @@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
>> AMD_CG_STATE_GATE);
>> }
>> - r = vce_v3_0_wait_for_idle(handle);
>> - if (r)
>> - return r;
>> -
>> - vce_v3_0_stop(adev);
>> - return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>> -}
>> -
>> -static int vce_v3_0_suspend(void *handle)
>> -{
>> - int r;
>> - struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>> r = vce_v3_0_hw_fini(adev);
>> if (r)
>> return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>> index 044cf9d74b85..226b79254db8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>> @@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)
>> {
>> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> - /*
>> - * Proper cleanups before halting the HW engine:
>> - * - cancel the delayed idle work
>> - * - enable powergating
>> - * - enable clockgating
>> - * - disable dpm
>> - *
>> - * TODO: to align with the VCN implementation, move the
>> - * jobs for clockgating/powergating/dpm setting to
>> - * ->set_powergating_state().
>> - */
>> cancel_delayed_work_sync(&adev->vce.idle_work);
>> - if (adev->pm.dpm_enabled) {
>> - amdgpu_dpm_enable_vce(adev, false);
>> - } else {
>> - amdgpu_asic_set_vce_clocks(adev, 0, 0);
>> - amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
>> - AMD_PG_STATE_GATE);
>> - amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
>> - AMD_CG_STATE_GATE);
>> - }
>> -
>> if (!amdgpu_sriov_vf(adev)) {
>> /* vce_v4_0_wait_for_idle(handle); */
>> vce_v4_0_stop(adev);
>> @@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
>> drm_dev_exit(idx);
>> }
>> + /*
>> + * Proper cleanups before halting the HW engine:
>> + * - cancel the delayed idle work
>> + * - enable powergating
>> + * - enable clockgating
>> + * - disable dpm
>> + *
>> + * TODO: to align with the VCN implementation, move the
>> + * jobs for clockgating/powergating/dpm setting to
>> + * ->set_powergating_state().
>> + */
>> + cancel_delayed_work_sync(&adev->vce.idle_work);
>> +
>> + if (adev->pm.dpm_enabled) {
>> + amdgpu_dpm_enable_vce(adev, false);
>> + } else {
>> + amdgpu_asic_set_vce_clocks(adev, 0, 0);
>> + amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
>> + AMD_PG_STATE_GATE);
>> + amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
>> + AMD_CG_STATE_GATE);
>> + }
>> +
>> r = vce_v4_0_hw_fini(adev);
>> if (r)
>> return r;
>>