[PATCH] drm/amdgpu: process RAS fatal error MB notification
Lazar, Lijo
lijo.lazar at amd.com
Wed Jun 19 11:20:11 UTC 2024
On 6/19/2024 2:44 AM, Vignesh Chander wrote:
> For RAS error scenario, VF guest driver will check mailbox
> and set fed flag to avoid unnecessary HW accesses.
> Additionally, poll for the reset completion message first
> to avoid accidentally spamming multiple reset requests to host.
>
> v2: add another mailbox check for handling case where kfd detects
> timeout first
>
> Signed-off-by: Vignesh Chander <Vignesh.Chander at amd.com>
> Change-Id: Ib501c653265883999c62a12a209ce5eb81c80846
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 25 +++++++++++++++++++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 4 +++-
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 22 +++++++++++++++++++--
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 4 +++-
> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 22 +++++++++++++++++++--
> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 3 ++-
> 6 files changed, 70 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 63f2286858c484..ccb3d041c2b249 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -229,6 +229,22 @@ void amdgpu_virt_free_mm_table(struct amdgpu_device *adev)
> adev->virt.mm_table.gpu_addr = 0;
> }
>
> +/**
> + * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt
> + * @adev: amdgpu device.
> + * Check whether host sent RAS error message
> + * Return: true if found, otherwise false
> + */
> +bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev)
> +{
> + struct amdgpu_virt *virt = &adev->virt;
> +
> + if (!virt->ops || !virt->ops->rcvd_ras_intr)
> + return false;
> +
> + return virt->ops->rcvd_ras_intr(adev);
> +}
> +
>
> unsigned int amd_sriov_msg_checksum(void *obj,
> unsigned long obj_size,
> @@ -612,11 +628,14 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
> ret = amdgpu_virt_read_pf2vf_data(adev);
> if (ret) {
> adev->virt.vf2pf_update_retry_cnt++;
> - if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
> - amdgpu_sriov_runtime(adev)) {
> +
> + if ((amdgpu_virt_rcvd_ras_interrupt(adev) ||
> + adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
> + amdgpu_sriov_runtime(adev)) {
> +
> amdgpu_ras_set_fed(adev, true);
> if (amdgpu_reset_domain_schedule(adev->reset_domain,
> - &adev->kfd.reset_work))
> + &adev->kfd.reset_work))
Instead of this and the waits below, what about checking the status in
gpu_recover() or in device_reset_sriov()? Those get called for a reset
initiated from any source.
Setting the flag means it will wait for FLR completion.
/* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
+
+	/* RAS error is equivalent to FLR initiated from host, wait for
+	 * completion
+	 */
+	if (amdgpu_virt_rcvd_ras_interrupt(adev) || amdgpu_ras_get_fed_status(adev))
+		set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
+
Thanks,
Lijo
> return;
> else
> dev_err(adev->dev, "Failed to queue work! at %s", __func__);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index f04cd1586c7220..b42a8854dca0cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -52,7 +52,7 @@
> /* tonga/fiji use this offset */
> #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
>
> -#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5
> +#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
>
> enum amdgpu_sriov_vf_mode {
> SRIOV_VF_MODE_BARE_METAL = 0,
> @@ -94,6 +94,7 @@ struct amdgpu_virt_ops {
> u32 data1, u32 data2, u32 data3);
> void (*ras_poison_handler)(struct amdgpu_device *adev,
> enum amdgpu_ras_block block);
> + bool (*rcvd_ras_intr)(struct amdgpu_device *adev);
> };
>
> /*
> @@ -352,6 +353,7 @@ void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev);
> int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
> int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
> void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
> +bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev);
> void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev);
> void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev);
> void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 65656afc6ed1c2..1bb8393ad6d358 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -196,11 +196,22 @@ static int xgpu_ai_request_reset(struct amdgpu_device *adev)
> {
> int ret, i = 0;
>
> - while (i < AI_MAILBOX_POLL_MSG_REP_MAX) {
> + if (amdgpu_ras_get_fed_status(adev) || xgpu_ai_rcvd_ras_intr(adev)) {
> + dev_dbg(adev->dev, "ras flag is set, poll for IDH_FLR_NOTIFICATION_CMPL\n");
> +
> + for (i = 0; i < AI_MAILBOX_POLL_MSG_REP_MAX; i++) {
> + ret = xgpu_ai_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL);
> + if (!ret)
> + break;
> +
> + dev_dbg(adev->dev, "retries left = %d\n", AI_MAILBOX_POLL_MSG_REP_MAX - i);
> + }
> + }
> +
> + for (i = 0; i < AI_MAILBOX_POLL_MSG_REP_MAX; i++) {
> ret = xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
> if (!ret)
> break;
> - i++;
> }
>
> return ret;
> @@ -408,6 +419,12 @@ static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev,
> xgpu_ai_send_access_requests(adev, IDH_RAS_POISON);
> }
>
> +static bool xgpu_ai_rcvd_ras_intr(struct amdgpu_device *adev)
> +{
> + enum idh_event msg = xgpu_ai_mailbox_peek_msg(adev);
> + return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
> +}
> +
> const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
> .req_full_gpu = xgpu_ai_request_full_gpu_access,
> .rel_full_gpu = xgpu_ai_release_full_gpu_access,
> @@ -417,4 +434,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
> .trans_msg = xgpu_ai_mailbox_trans_msg,
> .req_init_data = xgpu_ai_request_init_data,
> .ras_poison_handler = xgpu_ai_ras_poison_handler,
> + .rcvd_ras_intr = xgpu_ai_rcvd_ras_intr,
> };
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> index c520b2fabfb9a8..ed57cbc150afba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> @@ -51,7 +51,9 @@ enum idh_event {
> IDH_FAIL,
> IDH_QUERY_ALIVE,
> IDH_REQ_GPU_INIT_DATA_READY,
> -
> + IDH_RAS_POISON_READY,
> + IDH_PF_SOFT_FLR_NOTIFICATION,
> + IDH_RAS_ERROR_DETECTED,
> IDH_TEXT_MESSAGE = 255,
> };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 17e1e8cc243752..f2e5b38a64314c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -225,11 +225,22 @@ static int xgpu_nv_request_reset(struct amdgpu_device *adev)
> {
> int ret, i = 0;
>
> - while (i < NV_MAILBOX_POLL_MSG_REP_MAX) {
> + if (amdgpu_ras_get_fed_status(adev) || xgpu_nv_rcvd_ras_intr(adev)) {
> + dev_dbg(adev->dev, "ras flag is set, poll for IDH_FLR_NOTIFICATION_CMPL\n");
> +
> + for (i = 0; i < NV_MAILBOX_POLL_MSG_REP_MAX; i++) {
> + ret = xgpu_nv_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL);
> + if (!ret)
> + break;
> +
> + dev_dbg(adev->dev, "retries left = %d\n", NV_MAILBOX_POLL_MSG_REP_MAX - i);
> + }
> + }
> +
> + for (i = 0; i < NV_MAILBOX_POLL_MSG_REP_MAX; i++) {
> ret = xgpu_nv_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS);
> if (!ret)
> break;
> - i++;
> }
>
> return ret;
> @@ -449,6 +460,12 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
> }
> }
>
> +static bool xgpu_nv_rcvd_ras_intr(struct amdgpu_device *adev)
> +{
> + enum idh_event msg = xgpu_nv_mailbox_peek_msg(adev);
> + return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
> +}
> +
> const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
> .req_full_gpu = xgpu_nv_request_full_gpu_access,
> .rel_full_gpu = xgpu_nv_release_full_gpu_access,
> @@ -458,4 +475,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
> .wait_reset = xgpu_nv_wait_reset,
> .trans_msg = xgpu_nv_mailbox_trans_msg,
> .ras_poison_handler = xgpu_nv_ras_poison_handler,
> + .rcvd_ras_intr = xgpu_nv_rcvd_ras_intr,
> };
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
> index 1e8fd90cab4347..719a4c88615752 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
> @@ -52,7 +52,8 @@ enum idh_event {
> IDH_QUERY_ALIVE,
> IDH_REQ_GPU_INIT_DATA_READY,
> IDH_RAS_POISON_READY,
> -
> + IDH_PF_SOFT_FLR_NOTIFICATION,
> + IDH_RAS_ERROR_DETECTED,
> IDH_TEXT_MESSAGE = 255,
> };
>
More information about the amd-gfx
mailing list