[PATCH] drm/amdgpu: trigger flr_work if reading pf2vf data failed
Skvortsov, Victor
Victor.Skvortsov at amd.com
Sun Mar 17 00:59:52 UTC 2024
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Lazar,
> Lijo
> Sent: Friday, March 15, 2024 6:47 AM
> To: Luo, Zhigang <Zhigang.Luo at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Saye, Sashank
> <Sashank.Saye at amd.com>; Chan, Hing Pong <Jeffrey.Chan at amd.com>
> Subject: Re: [PATCH] drm/amdgpu: trigger flr_work if reading pf2vf data failed
>
>
> On 3/14/2024 10:24 PM, Zhigang Luo wrote:
> > If reading pf2vf data fails 5 times in a row, something is wrong and
> > flr_work needs to be triggered to recover from the issue.
> >
> > Also use dev_err to print the error messages so the affected device can
> > be identified, and add a warning message when waiting for
> > IDH_FLR_NOTIFICATION_CMPL times out.
> >
> > Signed-off-by: Zhigang Luo <Zhigang.Luo at amd.com>
> > Change-Id: Ia7ce934d0c3068ad3934715c14bbffdfcbafc4c2
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++----
> > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 29 ++++++++++++++++++----
> > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 +
> > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 ++
> > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 2 ++
> > 5 files changed, 39 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 1e9454e6e4cb..9bb04a56d020 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -143,6 +143,8 @@ const char *amdgpu_asic_name[] = {
> > "LAST",
> > };
> >
> > +static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
> > +
> > /**
> > * DOC: pcie_replay_count
> > *
> > @@ -4972,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
> > retry:
> > amdgpu_amdkfd_pre_reset(adev);
> >
> > + amdgpu_device_stop_pending_resets(adev);
> > +
> > if (from_hypervisor)
> > r = amdgpu_virt_request_full_gpu(adev, true);
> > else
> > @@ -5689,11 +5693,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> > tmp_adev->asic_reset_res = r;
> > }
> >
> > - /*
> > - * Drop all pending non scheduler resets. Scheduler resets
> > - * were already dropped during drm_sched_stop
> > - */
> > - amdgpu_device_stop_pending_resets(tmp_adev);
> > + if (!amdgpu_sriov_vf(tmp_adev))
> > + /*
> > + * Drop all pending non scheduler resets. Scheduler resets
> > + * were already dropped during drm_sched_stop
> > + */
> > + amdgpu_device_stop_pending_resets(tmp_adev);
> > }
> >
>
> For better consistency, you may make this path common for both VF and
> bare metal. Call this right before "skip_hw_reset" after a successful reset. All
> pending reset jobs submitted may be cancelled at that point, once a successful
> reset of the device/hive is completed.
>
> Thanks,
> Lijo
>
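For reference, a rough (untested) sketch of that suggestion, assuming the usual device_list_handle/reset_list iteration in amdgpu_device_gpu_recover():

	/* Reset of the device/hive succeeded: drop all pending non-scheduler
	 * resets for every device, VF and bare metal alike, before jumping
	 * to skip_hw_reset.
	 */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
		amdgpu_device_stop_pending_resets(tmp_adev);
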
> > /* Actual ASIC resets if needed.*/
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > index 7a4eae36778a..4e8364a46d4e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > @@ -32,6 +32,7 @@
> >
> > #include "amdgpu.h"
> > #include "amdgpu_ras.h"
> > +#include "amdgpu_reset.h"
> > #include "vi.h"
> > #include "soc15.h"
> > #include "nv.h"
> > @@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
> > return -EINVAL;
> >
> > if (pf2vf_info->size > 1024) {
> > - DRM_ERROR("invalid pf2vf message size\n");
> > + dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n",
> > + pf2vf_info->size);
> > return -EINVAL;
> > }
> >
> > @@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
> > adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
> > adev->virt.fw_reserve.checksum_key, checksum);
> > if (checksum != checkval) {
> > - DRM_ERROR("invalid pf2vf message\n");
> > + dev_err(adev->dev,
> > + "invalid pf2vf message: header checksum=0x%x calculated
> checksum=0x%x\n",
> > + checksum, checkval);
> > return -EINVAL;
> > }
> >
> > @@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
> > adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
> > 0, checksum);
> > if (checksum != checkval) {
> > - DRM_ERROR("invalid pf2vf message\n");
> > + dev_err(adev->dev,
> > + "invalid pf2vf message: header checksum=0x%x calculated
> checksum=0x%x\n",
> > + checksum, checkval);
> > return -EINVAL;
> > }
> >
> > @@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
> > ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
> > break;
> > default:
> > - DRM_ERROR("invalid pf2vf version\n");
> > + dev_err(adev->dev, "invalid pf2vf version: 0x%x\n",
> > + pf2vf_info->version);
> > return -EINVAL;
> > }
> >
> > @@ -584,8 +589,21 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
> > int ret;
> >
> > ret = amdgpu_virt_read_pf2vf_data(adev);
> > - if (ret)
> > + if (ret) {
> > + adev->virt.vf2pf_update_retry_cnt++;
Can we also add a debug print with the retry_cnt value each time it's incremented?
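Something like this (an untested sketch; dev_dbg keeps it behind dynamic debug, dev_warn would make it always visible) right after the increment:

	dev_dbg(adev->dev, "pf2vf data read failed, retry_cnt=%d\n",
		adev->virt.vf2pf_update_retry_cnt);
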
Thanks,
Victor
> > + if ((adev->virt.vf2pf_update_retry_cnt >= 5) &&
> > + amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
> > + if (amdgpu_reset_domain_schedule(adev->reset_domain,
> > + &adev->virt.flr_work))
> > + return;
> > + else
> > + dev_err(adev->dev, "Failed to queue work! at %s",
> __func__);
> > + }
> > +
> > goto out;
> > + }
> > +
> > + adev->virt.vf2pf_update_retry_cnt = 0;
> > amdgpu_virt_write_vf2pf_data(adev);
> >
> > out:
> > @@ -606,6 +624,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
> > adev->virt.fw_reserve.p_pf2vf = NULL;
> > adev->virt.fw_reserve.p_vf2pf = NULL;
> > adev->virt.vf2pf_update_interval_ms = 0;
> > + adev->virt.vf2pf_update_retry_cnt = 0;
> >
> > if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
> > DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > index 3f59b7b5523f..eedf5d44bf00 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > @@ -257,6 +257,7 @@ struct amdgpu_virt {
> > /* vf2pf message */
> > struct delayed_work vf2pf_work;
> > uint32_t vf2pf_update_interval_ms;
> > + int vf2pf_update_retry_cnt;
> >
> > /* multimedia bandwidth config */
> > bool is_mm_bw_enabled;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> > index a2bd2c3b1ef9..0c7275bca8f7 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> > @@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
> > timeout -= 10;
> > } while (timeout > 1);
> >
> > + dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
> > +
> > flr_done:
> > atomic_set(&adev->reset_domain->in_gpu_reset, 0);
> > up_write(&adev->reset_domain->sem);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> > index 77f5b55decf9..ee9aa2815d5a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> > @@ -309,6 +309,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
> > timeout -= 10;
> > } while (timeout > 1);
> >
> > + dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
> > +
> > flr_done:
> > atomic_set(&adev->reset_domain->in_gpu_reset, 0);
> > up_write(&adev->reset_domain->sem);