[PATCH] drm/amdgpu: Fix crash on device remove/driver unload

Lazar, Lijo lijo.lazar at amd.com
Thu Sep 16 08:20:48 UTC 2021


A minor comment below.

On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:
> Crash:
> BUG: unable to handle page fault for address: 00000000000010e1
> RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
> Call Trace:
> pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
> amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
> amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
> vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
> amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
> amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
> amdgpu_pci_remove+0x27/0x40 [amdgpu]
> pci_device_remove+0x3e/0xb0
> device_release_driver_internal+0x103/0x1d0
> device_release_driver+0x12/0x20
> pci_stop_bus_device+0x79/0xa0
> pci_stop_and_remove_bus_device_locked+0x1b/0x30
> remove_store+0x7b/0x90
> dev_attr_store+0x17/0x30
> sysfs_kf_write+0x4b/0x60
> kernfs_fop_write_iter+0x151/0x1e0
> 
> Why:
> VCE/UVD had dependency on SMC block for their suspend but
> SMC block is the first to do HW fini due to some constraints
> 
> How:
> Since the original patch was dealing with suspend issues
> move the SMC block dependency back into suspend hooks as
> was done in V1 of the original patches.
> Keep flushing idle work both in suspend and HW fini sequences
> since it's essential in both cases.
> 
> Fixes: 2178d3c189b9 ("drm/amdgpu: add missing cleanups for more ASICs on UVD/VCE suspend")
> Fixes: ee6679aaa61c ("drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend")
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------
>   drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------
>   drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------
>   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----
>   drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------
>   drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++++++++++++++-------------
>   7 files changed, 105 insertions(+), 90 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
> index 7232241e3bfb..0fef925b6602 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
> @@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->uvd.idle_work);
> +
> +	if (RREG32(mmUVD_STATUS) != 0)
> +		uvd_v3_1_stop(adev);
> +
> +	return 0;
> +}
> +
> +static int uvd_v3_1_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	if (RREG32(mmUVD_STATUS) != 0)
> -		uvd_v3_1_stop(adev);
> -
> -	return 0;
> -}
> -
> -static int uvd_v3_1_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = uvd_v3_1_hw_fini(adev);

"cosmetic change" comment - hw_fini is supposed to be the final tear 
down call. So instead of suspend calling hw_fini, perhaps it makes sense 
to read the other way - hw_fini just suspends the hardware?

Thanks,
Lijo

>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
> index 52d6de969f46..c108b8381795 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
> @@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->uvd.idle_work);
> +
> +	if (RREG32(mmUVD_STATUS) != 0)
> +		uvd_v4_2_stop(adev);
> +
> +	return 0;
> +}
> +
> +static int uvd_v4_2_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	if (RREG32(mmUVD_STATUS) != 0)
> -		uvd_v4_2_stop(adev);
> -
> -	return 0;
> -}
> -
> -static int uvd_v4_2_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = uvd_v4_2_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
> index db6d06758e4d..563493d1f830 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
> @@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->uvd.idle_work);
> +
> +	if (RREG32(mmUVD_STATUS) != 0)
> +		uvd_v5_0_stop(adev);
> +
> +	return 0;
> +}
> +
> +static int uvd_v5_0_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	if (RREG32(mmUVD_STATUS) != 0)
> -		uvd_v5_0_stop(adev);
> -
> -	return 0;
> -}
> -
> -static int uvd_v5_0_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = uvd_v5_0_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> index b6e82d75561f..1fd9ca21a091 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> @@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->uvd.idle_work);
> +
> +	if (!amdgpu_sriov_vf(adev))
> +		uvd_v7_0_stop(adev);
> +	else {
> +		/* full access mode, so don't touch any UVD register */
> +		DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
> +	}
> +
> +	return 0;
> +}
> +
> +static int uvd_v7_0_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	if (!amdgpu_sriov_vf(adev))
> -		uvd_v7_0_stop(adev);
> -	else {
> -		/* full access mode, so don't touch any UVD register */
> -		DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
> -	}
> -
> -	return 0;
> -}
> -
> -static int uvd_v7_0_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = uvd_v7_0_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
> index 84e488f189f5..67eb01fef789 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
> @@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->vce.idle_work);
> +
> +	return 0;
> +}
> +
> +static int vce_v2_0_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	return 0;
> -}
> -
> -static int vce_v2_0_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = vce_v2_0_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
> index 2a18c1e089fd..142e291983b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
> @@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
>   	int r;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->vce.idle_work);
> +
> +	r = vce_v3_0_wait_for_idle(handle);
> +	if (r)
> +		return r;
> +
> +	vce_v3_0_stop(adev);
> +	return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
> +}
> +
> +static int vce_v3_0_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	r = vce_v3_0_wait_for_idle(handle);
> -	if (r)
> -		return r;
> -
> -	vce_v3_0_stop(adev);
> -	return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
> -}
> -
> -static int vce_v3_0_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = vce_v3_0_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
> index 044cf9d74b85..226b79254db8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
> @@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> -	/*
> -	 * Proper cleanups before halting the HW engine:
> -	 *   - cancel the delayed idle work
> -	 *   - enable powergating
> -	 *   - enable clockgating
> -	 *   - disable dpm
> -	 *
> -	 * TODO: to align with the VCN implementation, move the
> -	 * jobs for clockgating/powergating/dpm setting to
> -	 * ->set_powergating_state().
> -	 */
>   	cancel_delayed_work_sync(&adev->vce.idle_work);
>   
> -	if (adev->pm.dpm_enabled) {
> -		amdgpu_dpm_enable_vce(adev, false);
> -	} else {
> -		amdgpu_asic_set_vce_clocks(adev, 0, 0);
> -		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
> -						       AMD_PG_STATE_GATE);
> -		amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
> -						       AMD_CG_STATE_GATE);
> -	}
> -
>   	if (!amdgpu_sriov_vf(adev)) {
>   		/* vce_v4_0_wait_for_idle(handle); */
>   		vce_v4_0_stop(adev);
> @@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
>   		drm_dev_exit(idx);
>   	}
>   
> +	/*
> +	 * Proper cleanups before halting the HW engine:
> +	 *   - cancel the delayed idle work
> +	 *   - enable powergating
> +	 *   - enable clockgating
> +	 *   - disable dpm
> +	 *
> +	 * TODO: to align with the VCN implementation, move the
> +	 * jobs for clockgating/powergating/dpm setting to
> +	 * ->set_powergating_state().
> +	 */
> +	cancel_delayed_work_sync(&adev->vce.idle_work);
> +
> +	if (adev->pm.dpm_enabled) {
> +		amdgpu_dpm_enable_vce(adev, false);
> +	} else {
> +		amdgpu_asic_set_vce_clocks(adev, 0, 0);
> +		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
> +						       AMD_PG_STATE_GATE);
> +		amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
> +						       AMD_CG_STATE_GATE);
> +	}
> +
>   	r = vce_v4_0_hw_fini(adev);
>   	if (r)
>   		return r;
> 


More information about the amd-gfx mailing list