[PATCH 7/7] drm/amdgpu: retry init if it fails due to exclusive mode timeout

Andres Rodriguez andresx7 at gmail.com
Mon Oct 23 15:44:10 UTC 2017



On 2017-10-23 06:03 AM, Pixel Ding wrote:
> From: pding <Pixel.Ding at amd.com>
> 
> Exclusive mode has a real-time limit in practice, e.g. it must be
> completed within 300ms. The timeout is easily observed when running many
> VFs/VMs on a single host with a heavy CPU workload.
> 
> If init fails because of an exclusive mode timeout, retry it.
> 
> Signed-off-by: pding <Pixel.Ding at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 15 +++++++++++++--
>   2 files changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3458d46..1935f5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2306,6 +2306,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   
>   	r = amdgpu_init(adev);
>   	if (r) {
> +		/* failed in exclusive mode due to timeout */
> +		if (amdgpu_sriov_vf(adev) &&
> +		    !amdgpu_sriov_runtime(adev) &&
> +		    amdgpu_virt_mmio_blocked(adev) &&
> +		    !amdgpu_virt_wait_reset(adev)) {
> +			dev_err(adev->dev, "VF exclusive mode timeout\n");
> +			r = -EAGAIN;
> +			goto failed;
> +		}
>   		dev_err(adev->dev, "amdgpu_init failed\n");
>   		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
>   		amdgpu_fini(adev);
> @@ -2393,6 +2402,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	amdgpu_vf_error_trans_all(adev);
>   	if (runtime)
>   		vga_switcheroo_fini_domain_pm_ops(adev->dev);
> +
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index f2eb7ac..fdc240a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>   {
>   	struct amdgpu_device *adev;
> -	int r, acpi_status;
> +	int r, acpi_status, retry = 0;
>   
>   #ifdef CONFIG_DRM_AMDGPU_SI
>   	if (!amdgpu_si_support) {
> @@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>   		}
>   	}
>   #endif
> +retry_init:
>   
>   	adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
>   	if (adev == NULL) {
> @@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>   	 * VRAM allocation
>   	 */
>   	r = amdgpu_device_init(adev, dev, dev->pdev, flags);
> -	if (r) {
> +	if (++retry != 3 && r == -EAGAIN) {

Minor nitpick here. Might want to rewrite the condition so that it 
evaluates to false for most values of retry (currently it evaluates to 
false only for one value of retry).

E.g. if (++retry < 3 ...)

Or

int retry = 3;
...
if (--retry >= 0 ...)

> +		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> +		adev->virt.ops = NULL;
> +		amdgpu_device_fini(adev);
> +		kfree(adev);
> +		dev->dev_private = NULL;
> +		msleep(5000);
> +		dev_err(&dev->pdev->dev, "retry init %d\n", retry);
> +		amdgpu_init_log = 0;
> +		goto retry_init;
> +	} else if (r) {
>   		dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
>   		goto out;
>   	}
> 


More information about the amd-gfx mailing list