[PATCH] drm/amdgpu: trigger flr_work if reading pf2vf data failed

Zhang, Hawking Hawking.Zhang at amd.com
Mon Mar 18 03:50:57 UTC 2024


[AMD Official Use Only - General]

Acked-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: Luo, Zhigang <Zhigang.Luo at amd.com>
Sent: Monday, March 18, 2024 11:38
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Saye, Sashank <Sashank.Saye at amd.com>; Chan, Hing Pong <Jeffrey.Chan at amd.com>; Luo, Zhigang <Zhigang.Luo at amd.com>
Subject: [PATCH] drm/amdgpu: trigger flr_work if reading pf2vf data failed

If reading pf2vf data fails 30 times in a row, something is wrong. Trigger flr_work to recover from the issue.

Also use dev_err to print the error messages so that the affected device can be identified, and add a warning message if waiting for IDH_FLR_NOTIFICATION_CMPL times out.

Signed-off-by: Zhigang Luo <Zhigang.Luo at amd.com>
Change-Id: Ia7ce934d0c3068ad3934715c14bbffdfcbafc4c2
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 29 ++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  3 +++
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  2 ++
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  2 ++
 5 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b37113b79483..70261eb9b0bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -143,6 +143,8 @@ const char *amdgpu_asic_name[] = {
        "LAST",
 };

+static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
+
 /**
  * DOC: pcie_replay_count
  *
@@ -4972,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 retry:
        amdgpu_amdkfd_pre_reset(adev);

+       amdgpu_device_stop_pending_resets(adev);
+
        if (from_hypervisor)
                r = amdgpu_virt_request_full_gpu(adev, true);
        else
@@ -5712,11 +5716,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                        tmp_adev->asic_reset_res = r;
                }

-               /*
-                * Drop all pending non scheduler resets. Scheduler resets
-                * were already dropped during drm_sched_stop
-                */
-               amdgpu_device_stop_pending_resets(tmp_adev);
+               if (!amdgpu_sriov_vf(tmp_adev))
+                       /*
+                       * Drop all pending non scheduler resets. Scheduler resets
+                       * were already dropped during drm_sched_stop
+                       */
+                       amdgpu_device_stop_pending_resets(tmp_adev);
        }

        /* Actual ASIC resets if needed.*/
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 7a4eae36778a..aed60aaf1a55 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -32,6 +32,7 @@

 #include "amdgpu.h"
 #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
 #include "vi.h"
 #include "soc15.h"
 #include "nv.h"
@@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
                return -EINVAL;

        if (pf2vf_info->size > 1024) {
-               DRM_ERROR("invalid pf2vf message size\n");
+               dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size);
                return -EINVAL;
        }

@@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
                        adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
                        adev->virt.fw_reserve.checksum_key, checksum);
                if (checksum != checkval) {
-                       DRM_ERROR("invalid pf2vf message\n");
+                       dev_err(adev->dev,
+                               "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
+                               checksum, checkval);
                        return -EINVAL;
                }

@@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
                        adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
                        0, checksum);
                if (checksum != checkval) {
-                       DRM_ERROR("invalid pf2vf message\n");
+                       dev_err(adev->dev,
+                               "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
+                               checksum, checkval);
                        return -EINVAL;
                }

@@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
                        ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
                break;
        default:
-               DRM_ERROR("invalid pf2vf version\n");
+               dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version);
                return -EINVAL;
        }

@@ -584,8 +589,21 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
        int ret;

        ret = amdgpu_virt_read_pf2vf_data(adev);
-       if (ret)
+       if (ret) {
+               adev->virt.vf2pf_update_retry_cnt++;
+               if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
+                   amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
+                       if (amdgpu_reset_domain_schedule(adev->reset_domain,
+                                                         &adev->virt.flr_work))
+                               return;
+                       else
+                               dev_err(adev->dev, "Failed to queue work! at %s", __func__);
+               }
+
                goto out;
+       }
+
+       adev->virt.vf2pf_update_retry_cnt = 0;
        amdgpu_virt_write_vf2pf_data(adev);

 out:
@@ -606,6 +624,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
        adev->virt.fw_reserve.p_pf2vf = NULL;
        adev->virt.fw_reserve.p_vf2pf = NULL;
        adev->virt.vf2pf_update_interval_ms = 0;
+       adev->virt.vf2pf_update_retry_cnt = 0;

        if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
                DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 3f59b7b5523f..a858bc98cad4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -52,6 +52,8 @@
 /* tonga/fiji use this offset */
 #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503

+#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 30
+
 enum amdgpu_sriov_vf_mode {
        SRIOV_VF_MODE_BARE_METAL = 0,
        SRIOV_VF_MODE_ONE_VF,
@@ -257,6 +259,7 @@ struct amdgpu_virt {
        /* vf2pf message */
        struct delayed_work vf2pf_work;
        uint32_t vf2pf_update_interval_ms;
+       int vf2pf_update_retry_cnt;

        /* multimedia bandwidth config */
        bool     is_mm_bw_enabled;
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index a2bd2c3b1ef9..0c7275bca8f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
                timeout -= 10;
        } while (timeout > 1);

+       dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
+
 flr_done:
        atomic_set(&adev->reset_domain->in_gpu_reset, 0);
        up_write(&adev->reset_domain->sem);
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index a1bad772d932..89992c1c9a62 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -309,6 +309,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
                timeout -= 10;
        } while (timeout > 1);

+       dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
+
 flr_done:
        atomic_set(&adev->reset_domain->in_gpu_reset, 0);
        up_write(&adev->reset_domain->sem);
--
2.25.1



More information about the amd-gfx mailing list