[PATCH] drm/amdgpu: process RAS fatal error MB notification

Lazar, Lijo lijo.lazar at amd.com
Wed Jun 19 11:20:11 UTC 2024



On 6/19/2024 2:44 AM, Vignesh Chander wrote:
> For RAS error scenario, VF guest driver will check mailbox
> and set fed flag to avoid unnecessary HW accesses.
> Additionally, poll for the reset completion message first
> to avoid accidentally spamming multiple reset requests to host.
> 
> v2: add another mailbox check for handling case where kfd detects
> timeout first
> 
> Signed-off-by: Vignesh Chander <Vignesh.Chander at amd.com>
> Change-Id: Ib501c653265883999c62a12a209ce5eb81c80846
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 25 +++++++++++++++++++++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  4 +++-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c    | 22 +++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h    |  4 +++-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c    | 22 +++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h    |  3 ++-
>  6 files changed, 70 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 63f2286858c484..ccb3d041c2b249 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -229,6 +229,22 @@ void amdgpu_virt_free_mm_table(struct amdgpu_device *adev)
>  	adev->virt.mm_table.gpu_addr = 0;
>  }
>  
> +/**
> + * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt
> + * @adev:	amdgpu device.
> + * Check whether host sent RAS error message
> + * Return: true if found, otherwise false
> + */
> +bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_virt *virt = &adev->virt;
> +
> +	if (!virt->ops || !virt->ops->rcvd_ras_intr)
> +		return false;
> +
> +	return virt->ops->rcvd_ras_intr(adev);
> +}
> +
>  
>  unsigned int amd_sriov_msg_checksum(void *obj,
>  				unsigned long obj_size,
> @@ -612,11 +628,14 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
>  	ret = amdgpu_virt_read_pf2vf_data(adev);
>  	if (ret) {
>  		adev->virt.vf2pf_update_retry_cnt++;
> -		if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
> -		    amdgpu_sriov_runtime(adev)) {
> +
> +		if ((amdgpu_virt_rcvd_ras_interrupt(adev) ||
> +			adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
> +			amdgpu_sriov_runtime(adev)) {
> +
>  			amdgpu_ras_set_fed(adev, true);
>  			if (amdgpu_reset_domain_schedule(adev->reset_domain,
> -							  &adev->kfd.reset_work))
> +							&adev->kfd.reset_work))

Instead of this and the waits below, what about checking the status in
gpu_recover() or in device_reset_sriov()? That path gets called for resets
initiated from all sources.

Setting the flag means it will wait for FLR completion.

        /* Actual ASIC resets if needed.*/
        /* Host driver will handle XGMI hive reset for SRIOV */
        if (amdgpu_sriov_vf(adev)) {
+
+               /* RAS error is equivalent to FLR initiated from host,
+                * wait for completion
+                */
+               if (amdgpu_virt_rcvd_ras_interrupt(adev) ||
+                   amdgpu_ras_get_fed_status(adev))
+                       set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
+


Thanks,
Lijo
>  				return;
>  			else
>  				dev_err(adev->dev, "Failed to queue work! at %s", __func__);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index f04cd1586c7220..b42a8854dca0cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -52,7 +52,7 @@
>  /* tonga/fiji use this offset */
>  #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
>  
> -#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5
> +#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
>  
>  enum amdgpu_sriov_vf_mode {
>  	SRIOV_VF_MODE_BARE_METAL = 0,
> @@ -94,6 +94,7 @@ struct amdgpu_virt_ops {
>  			  u32 data1, u32 data2, u32 data3);
>  	void (*ras_poison_handler)(struct amdgpu_device *adev,
>  					enum amdgpu_ras_block block);
> +	bool (*rcvd_ras_intr)(struct amdgpu_device *adev);
>  };
>  
>  /*
> @@ -352,6 +353,7 @@ void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev);
>  int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
>  int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
>  void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
> +bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev);
>  void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev);
>  void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev);
>  void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 65656afc6ed1c2..1bb8393ad6d358 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -196,11 +196,22 @@ static int xgpu_ai_request_reset(struct amdgpu_device *adev)
>  {
>  	int ret, i = 0;
>  
> -	while (i < AI_MAILBOX_POLL_MSG_REP_MAX) {
> +	if (amdgpu_ras_get_fed_status(adev) || xgpu_ai_rcvd_ras_intr(adev)) {
> +		dev_dbg(adev->dev, "ras flag is set, poll for IDH_FLR_NOTIFICATION_CMPL\n");
> +
> +		for (i = 0; i < AI_MAILBOX_POLL_MSG_REP_MAX; i++) {
> +			ret = xgpu_ai_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL);
> +			if (!ret)
> +				break;
> +
> +			dev_dbg(adev->dev, "retries left = %d\n", AI_MAILBOX_POLL_MSG_REP_MAX - i);
> +		}
> +	}
> +
> +	for (i = 0; i < AI_MAILBOX_POLL_MSG_REP_MAX; i++) {
>  		ret = xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
>  		if (!ret)
>  			break;
> -		i++;
>  	}
>  
>  	return ret;
> @@ -408,6 +419,12 @@ static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
>  	xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
>  }
>  
> +static bool xgpu_ai_rcvd_ras_intr(struct amdgpu_device *adev)
> +{
> +	enum idh_event msg = xgpu_ai_mailbox_peek_msg(adev);
> +	return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
> +}
> +
>  const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
>  	.req_full_gpu	= xgpu_ai_request_full_gpu_access,
>  	.rel_full_gpu	= xgpu_ai_release_full_gpu_access,
> @@ -417,4 +434,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
>  	.trans_msg = xgpu_ai_mailbox_trans_msg,
>  	.req_init_data  = xgpu_ai_request_init_data,
>  	.ras_poison_handler = xgpu_ai_ras_poison_handler,
> +	.rcvd_ras_intr = xgpu_ai_rcvd_ras_intr,
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> index c520b2fabfb9a8..ed57cbc150afba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> @@ -51,7 +51,9 @@ enum idh_event {
>  	IDH_FAIL,
>  	IDH_QUERY_ALIVE,
>  	IDH_REQ_GPU_INIT_DATA_READY,
> -
> +	IDH_RAS_POISON_READY,
> +	IDH_PF_SOFT_FLR_NOTIFICATION,
> +	IDH_RAS_ERROR_DETECTED,
>  	IDH_TEXT_MESSAGE = 255,
>  };
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 17e1e8cc243752..f2e5b38a64314c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -225,11 +225,22 @@ static int xgpu_nv_request_reset(struct amdgpu_device *adev)
>  {
>  	int ret, i = 0;
>  
> -	while (i < NV_MAILBOX_POLL_MSG_REP_MAX) {
> +	if (amdgpu_ras_get_fed_status(adev) || xgpu_nv_rcvd_ras_intr(adev)) {
> +		dev_dbg(adev->dev, "ras flag is set, poll for IDH_FLR_NOTIFICATION_CMPL\n");
> +
> +		for (i = 0; i < NV_MAILBOX_POLL_MSG_REP_MAX; i++) {
> +			ret = xgpu_nv_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL);
> +			if (!ret)
> +				break;
> +
> +			dev_dbg(adev->dev, "retries left = %d\n", NV_MAILBOX_POLL_MSG_REP_MAX - i);
> +		}
> +	}
> +
> +	for (i = 0; i < NV_MAILBOX_POLL_MSG_REP_MAX; i++) {
>  		ret = xgpu_nv_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
>  		if (!ret)
>  			break;
> -		i++;
>  	}
>  
>  	return ret;
> @@ -449,6 +460,12 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
>  	}
>  }
>  
> +static bool xgpu_nv_rcvd_ras_intr(struct amdgpu_device *adev)
> +{
> +	enum idh_event msg = xgpu_nv_mailbox_peek_msg(adev);
> +	return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
> +}
> +
>  const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>  	.req_full_gpu	= xgpu_nv_request_full_gpu_access,
>  	.rel_full_gpu	= xgpu_nv_release_full_gpu_access,
> @@ -458,4 +475,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
>  	.wait_reset = xgpu_nv_wait_reset,
>  	.trans_msg = xgpu_nv_mailbox_trans_msg,
>  	.ras_poison_handler = xgpu_nv_ras_poison_handler,
> +	.rcvd_ras_intr = xgpu_nv_rcvd_ras_intr,
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
> index 1e8fd90cab4347..719a4c88615752 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
> @@ -52,7 +52,8 @@ enum idh_event {
>  	IDH_QUERY_ALIVE,
>  	IDH_REQ_GPU_INIT_DATA_READY,
>  	IDH_RAS_POISON_READY,
> -
> +	IDH_PF_SOFT_FLR_NOTIFICATION,
> +	IDH_RAS_ERROR_DETECTED,
>  	IDH_TEXT_MESSAGE = 255,
>  };
>  


More information about the amd-gfx mailing list