[PATCH] drm/amdgpu: revise retry init to fully cleanup driver

Ding, Pixel Pixel.Ding at amd.com
Wed Nov 8 09:46:23 UTC 2017


When an exclusive mode timeout happens, the VF is no longer active, so exclusive requests will be ignored by the host. Unloading KMS or running device fini also requests exclusive mode, and that request will time out again since no response is received.

This only happens on an exclusive mode timeout, so I didn’t put them in the general SRIOV fini function.
— 
Sincerely Yours,
Pixel








On 08/11/2017, 5:42 PM, "Christian König" <ckoenig.leichtzumerken at gmail.com> wrote:

>Am 08.11.2017 um 04:29 schrieb Pixel Ding:
>> Retry at drm_dev_register instead of amdgpu_device_init.
>>
>> Signed-off-by: Pixel Ding <Pixel.Ding at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 11 +++++++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 15 ++-------------
>>   3 files changed, 13 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index bf2b008..4ef2b1b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2390,6 +2390,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>   		    amdgpu_virt_mmio_blocked(adev) &&
>>   		    !amdgpu_virt_wait_reset(adev)) {
>>   			dev_err(adev->dev, "VF exclusive mode timeout\n");
>> +			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
>> +			adev->virt.ops = NULL;
>
>Why is that necessary? Maybe put this into some SRIOV specific fini 
>function?
>
>Apart from that patch looks good to me and is Acked-by: Christian König 
><christian.koenig at amd.com>.
>
>Regards,
>Christian.
>
>>   			r = -EAGAIN;
>>   			goto failed;
>>   		}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 6b11a75..eaccd4b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -565,12 +565,13 @@ static int amdgpu_kick_out_firmware_fb(struct pci_dev *pdev)
>>   	return 0;
>>   }
>>   
>> +
>>   static int amdgpu_pci_probe(struct pci_dev *pdev,
>>   			    const struct pci_device_id *ent)
>>   {
>>   	struct drm_device *dev;
>>   	unsigned long flags = ent->driver_data;
>> -	int ret;
>> +	int ret, retry = 0;
>>   
>>   	if ((flags & AMD_EXP_HW_SUPPORT) && !amdgpu_exp_hw_support) {
>>   		DRM_INFO("This hardware requires experimental hardware support.\n"
>> @@ -603,8 +604,14 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
>>   
>>   	pci_set_drvdata(pdev, dev);
>>   
>> +retry_init:
>>   	ret = drm_dev_register(dev, ent->driver_data);
>> -	if (ret)
>> +	if (ret == -EAGAIN && ++retry <= 3) {
>> +		DRM_INFO("retry init %d\n", retry);
>> +		/* Don't request EX mode too frequently which is attacking */
>> +		msleep(5000);
>> +		goto retry_init;
>> +	} else if (ret)
>>   		goto err_pci;
>>   
>>   	return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 1d56b5b..65360cd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -84,7 +84,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
>>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>   {
>>   	struct amdgpu_device *adev;
>> -	int r, acpi_status, retry = 0;
>> +	int r, acpi_status;
>>   
>>   #ifdef CONFIG_DRM_AMDGPU_SI
>>   	if (!amdgpu_si_support) {
>> @@ -120,7 +120,6 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>   		}
>>   	}
>>   #endif
>> -retry_init:
>>   
>>   	adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
>>   	if (adev == NULL) {
>> @@ -143,17 +142,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>   	 * VRAM allocation
>>   	 */
>>   	r = amdgpu_device_init(adev, dev, dev->pdev, flags);
>> -	if (r == -EAGAIN && ++retry <= 3) {
>> -		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
>> -		adev->virt.ops = NULL;
>> -		amdgpu_device_fini(adev);
>> -		kfree(adev);
>> -		dev->dev_private = NULL;
>> -		/* Don't request EX mode too frequently which is attacking */
>> -		msleep(5000);
>> -		dev_err(&dev->pdev->dev, "retry init %d\n", retry);
>> -		goto retry_init;
>> -	} else if (r) {
>> +	if (r) {
>>   		dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
>>   		goto out;
>>   	}
>
>


More information about the amd-gfx mailing list