[PATCH] drm/amdgpu: release gpu full access after "amdgpu_device_ip_late_init"

Mon Apr 17 09:43:33 UTC 2023

Reviewed-by: JingWen.Chen2 at amd.com

On 4/14/23 4:41 PM, Chong Li wrote:
> [WHY]
>  Function "amdgpu_irq_update()", called by "amdgpu_device_ip_late_init()", runs in atomic context.
>  We shouldn't access registers through KIQ there, since "amdgpu_kiq_rreg()" may call "msleep()".
>
> [HOW]
>  Move the call to "amdgpu_virt_release_full_gpu()" after "amdgpu_device_ip_late_init()",
>  to ensure that registers are accessed through RLCG instead of KIQ.
>
> Call Trace:
>   <TASK>
>   show_stack+0x52/0x69
>   dump_stack_lvl+0x49/0x6d
>   dump_stack+0x10/0x18
>   __schedule_bug.cold+0x4f/0x6b
>   __schedule+0x473/0x5d0
>   ? __wake_up_klogd.part.0+0x40/0x70
>   ? vprintk_emit+0xbe/0x1f0
>   schedule+0x68/0x110
>   schedule_timeout+0x87/0x160
>   ? timer_migration_handler+0xa0/0xa0
>   msleep+0x2d/0x50
>   amdgpu_kiq_rreg+0x18d/0x1f0 [amdgpu]
>   amdgpu_device_rreg.part.0+0x59/0xd0 [amdgpu]
>   amdgpu_device_rreg+0x3a/0x50 [amdgpu]
>   amdgpu_sriov_rreg+0x3c/0xb0 [amdgpu]
>   gfx_v10_0_set_gfx_eop_interrupt_state.constprop.0+0x16c/0x190 [amdgpu]
>   gfx_v10_0_set_eop_interrupt_state+0xa5/0xb0 [amdgpu]
>   amdgpu_irq_update+0x53/0x80 [amdgpu]
>   amdgpu_irq_get+0x7c/0xb0 [amdgpu]
>   amdgpu_fence_driver_hw_init+0x58/0x90 [amdgpu]
>   amdgpu_device_init.cold+0x16b7/0x2022 [amdgpu]
>
> Signed-off-by: Chong Li <chongli2 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 ++++++++++++----------
>  1 file changed, 17 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 051b9e231cf4..ee21a99ab4d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2538,8 +2538,6 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>  	amdgpu_fru_get_product_info(adev);
>  
>  init_failed:
> -	if (amdgpu_sriov_vf(adev))
> -		amdgpu_virt_release_full_gpu(adev, true);
>  
>  	return r;
>  }
> @@ -3856,18 +3854,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>  
>  	r = amdgpu_device_ip_init(adev);
>  	if (r) {
> -		/* failed in exclusive mode due to timeout */
> -		if (amdgpu_sriov_vf(adev) &&
> -		    !amdgpu_sriov_runtime(adev) &&
> -		    amdgpu_virt_mmio_blocked(adev) &&
> -		    !amdgpu_virt_wait_reset(adev)) {
> -			dev_err(adev->dev, "VF exclusive mode timeout\n");
> -			/* Don't send request since VF is inactive. */
> -			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> -			adev->virt.ops = NULL;
> -			r = -EAGAIN;
> -			goto release_ras_con;
> -		}
>  		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
>  		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
>  		goto release_ras_con;
> @@ -3936,8 +3922,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>  				   msecs_to_jiffies(AMDGPU_RESUME_MS));
>  	}
>  
> -	if (amdgpu_sriov_vf(adev))
> +	if (amdgpu_sriov_vf(adev)) {
> +		amdgpu_virt_release_full_gpu(adev, true);
>  		flush_delayed_work(&adev->delayed_init_work);
> +	}
>  
>  	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
>  	if (r)
> @@ -3977,6 +3965,20 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>  	return 0;
>  
>  release_ras_con:
> +	if (amdgpu_sriov_vf(adev))
> +		amdgpu_virt_release_full_gpu(adev, true);
> +
> +	/* failed in exclusive mode due to timeout */
> +	if (amdgpu_sriov_vf(adev) &&
> +		!amdgpu_sriov_runtime(adev) &&
> +		amdgpu_virt_mmio_blocked(adev) &&
> +		!amdgpu_virt_wait_reset(adev)) {
> +		dev_err(adev->dev, "VF exclusive mode timeout\n");
> +		/* Don't send request since VF is inactive. */
> +		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> +		adev->virt.ops = NULL;
> +		r = -EAGAIN;
> +	}
>  	amdgpu_release_ras_context(adev);
>  
>  failed: