[PATCH] drm/amdgpu: trigger flr_work if reading pf2vf data failed

Fri Mar 15 10:47:13 UTC 2024

On 3/14/2024 10:24 PM, Zhigang Luo wrote:
> if reading pf2vf data failed 5 times continuously, it means something is
> wrong. Need to trigger flr_work to recover the issue.
> 
> also use dev_err to print the error message to get which device has
> issue and add warning message if waiting IDH_FLR_NOTIFICATION_CMPL
> timeout.
> 
> Signed-off-by: Zhigang Luo <Zhigang.Luo at amd.com>
> Change-Id: Ia7ce934d0c3068ad3934715c14bbffdfcbafc4c2
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 29 ++++++++++++++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  2 ++
>  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  2 ++
>  5 files changed, 39 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 1e9454e6e4cb..9bb04a56d020 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -143,6 +143,8 @@ const char *amdgpu_asic_name[] = {
>  	"LAST",
>  };
>  
> +static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
> +
>  /**
>   * DOC: pcie_replay_count
>   *
> @@ -4972,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>  retry:
>  	amdgpu_amdkfd_pre_reset(adev);
>  
> +	amdgpu_device_stop_pending_resets(adev);
> +
>  	if (from_hypervisor)
>  		r = amdgpu_virt_request_full_gpu(adev, true);
>  	else
> @@ -5689,11 +5693,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>  			tmp_adev->asic_reset_res = r;
>  		}
>  
> -		/*
> -		 * Drop all pending non scheduler resets. Scheduler resets
> -		 * were already dropped during drm_sched_stop
> -		 */
> -		amdgpu_device_stop_pending_resets(tmp_adev);
> +		if (!amdgpu_sriov_vf(tmp_adev))
> +			/*
> +			* Drop all pending non scheduler resets. Scheduler resets
> +			* were already dropped during drm_sched_stop
> +			*/
> +			amdgpu_device_stop_pending_resets(tmp_adev);
>  	}
>  

For better consistency - you may make this path common for both VF and
bare metal. Call this right before "skip_hw_reset" after a successful
reset. All pending reset jobs submitted may be cancelled at that point
once a succesful reset of the device/hive is completed.

Thanks,
Lijo

>  	/* Actual ASIC resets if needed.*/
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 7a4eae36778a..4e8364a46d4e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -32,6 +32,7 @@
>  
>  #include "amdgpu.h"
>  #include "amdgpu_ras.h"
> +#include "amdgpu_reset.h"
>  #include "vi.h"
>  #include "soc15.h"
>  #include "nv.h"
> @@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
>  		return -EINVAL;
>  
>  	if (pf2vf_info->size > 1024) {
> -		DRM_ERROR("invalid pf2vf message size\n");
> +		dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size);
>  		return -EINVAL;
>  	}
>  
> @@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
>  			adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
>  			adev->virt.fw_reserve.checksum_key, checksum);
>  		if (checksum != checkval) {
> -			DRM_ERROR("invalid pf2vf message\n");
> +			dev_err(adev->dev,
> +				"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
> +				checksum, checkval);
>  			return -EINVAL;
>  		}
>  
> @@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
>  			adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
>  			0, checksum);
>  		if (checksum != checkval) {
> -			DRM_ERROR("invalid pf2vf message\n");
> +			dev_err(adev->dev,
> +				"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
> +				checksum, checkval);
>  			return -EINVAL;
>  		}
>  
> @@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
>  			((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
>  		break;
>  	default:
> -		DRM_ERROR("invalid pf2vf version\n");
> +		dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version);
>  		return -EINVAL;
>  	}
>  
> @@ -584,8 +589,21 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
>  	int ret;
>  
>  	ret = amdgpu_virt_read_pf2vf_data(adev);
> -	if (ret)
> +	if (ret) {
> +		adev->virt.vf2pf_update_retry_cnt++;
> +		if ((adev->virt.vf2pf_update_retry_cnt >= 5) &&
> +		    amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
> +			if (amdgpu_reset_domain_schedule(adev->reset_domain,
> +							  &adev->virt.flr_work))
> +				return;
> +			else
> +				dev_err(adev->dev, "Failed to queue work! at %s", __func__);
> +		}
> +
>  		goto out;
> +	}
> +
> +	adev->virt.vf2pf_update_retry_cnt = 0;
>  	amdgpu_virt_write_vf2pf_data(adev);
>  
>  out:
> @@ -606,6 +624,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
>  	adev->virt.fw_reserve.p_pf2vf = NULL;
>  	adev->virt.fw_reserve.p_vf2pf = NULL;
>  	adev->virt.vf2pf_update_interval_ms = 0;
> +	adev->virt.vf2pf_update_retry_cnt = 0;
>  
>  	if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
>  		DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 3f59b7b5523f..eedf5d44bf00 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -257,6 +257,7 @@ struct amdgpu_virt {
>  	/* vf2pf message */
>  	struct delayed_work vf2pf_work;
>  	uint32_t vf2pf_update_interval_ms;
> +	int vf2pf_update_retry_cnt;
>  
>  	/* multimedia bandwidth config */
>  	bool     is_mm_bw_enabled;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index a2bd2c3b1ef9..0c7275bca8f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>  		timeout -= 10;
>  	} while (timeout > 1);
>  
> +	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
> +
>  flr_done:
>  	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
>  	up_write(&adev->reset_domain->sem);
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 77f5b55decf9..ee9aa2815d5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -309,6 +309,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>  		timeout -= 10;
>  	} while (timeout > 1);
>  
> +	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
> +
>  flr_done:
>  	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
>  	up_write(&adev->reset_domain->sem);