[PATCH v3] drm/amdgpu: Save and restore switch state

Wang, Yang(Kevin) KevinYang.Wang at amd.com
Fri Aug 8 00:20:47 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

> +     do {
> +             usleep_range(10000, 10500);
> +             r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
> +     } while ((status != PCI_VENDOR_ID_ATI) &&
> +              (status != PCI_VENDOR_ID_AMD));

You'd better add another exit to avoid this becoming a dead loop. E.g: check response 'r' or add a try counter ?

Best Regards,
Kevin

-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Lazar, Lijo
Sent: Thursday, August 7, 2025 3:33 PM
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Sun, Ce(Overlord) <Ce.Sun at amd.com>
Subject: Re: [PATCH v3] drm/amdgpu: Save and restore switch state

<Ping>

On 8/1/2025 6:10 PM, Lijo Lazar wrote:
> During a DPC error kernel waits for the link to be active before
> notifying downstream devices. On certain platforms with Broadcom
> switch in synthetiic mode, switch responds with values even though the
> link is not fully ready. The config space restoration done by pcie
> port driver for SWUS/DS of dGPU is thus not effective as the switch is
> still doing internal enumeration.
>
> As a workaround, save state of SWUS/DS device in driver. Add
> additional check to see if link is active and restore the values
> during DPC error callbacks.
>
> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
> ---
> v2: Use usleep_range as sleep is short. Remove dev_info logs.
> v3: remove redundant increment of 'i' in loop (Ce Sun).
>
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 72
> +++++++++++++++++++++-
>  2 files changed, 73 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 3550c2fac184..96d772aadb04 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -904,6 +904,9 @@ struct amdgpu_pcie_reset_ctx {
>       bool in_link_reset;
>       bool occurs_dpc;
>       bool audio_suspended;
> +     struct pci_dev *swus;
> +     struct pci_saved_state *swus_pcistate;
> +     struct pci_saved_state *swds_pcistate;
>  };
>
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index cfd72faec16e..e58f42531974 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -178,6 +178,8 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
>               BIT(AMD_IP_BLOCK_TYPE_PSP)
>  };
>
> +static void amdgpu_device_load_switch_state(struct amdgpu_device
> +*adev);
> +
>  static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
>                                            enum amd_ip_block_type block)  { @@ -5006,7 +5008,8 @@ void
> amdgpu_device_fini_sw(struct amdgpu_device *adev)
>       adev->reset_domain = NULL;
>
>       kfree(adev->pci_state);
> -
> +     kfree(adev->pcie_reset_ctx.swds_pcistate);
> +     kfree(adev->pcie_reset_ctx.swus_pcistate);
>  }
>
>  /**
> @@ -6963,16 +6966,27 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>       struct amdgpu_device *tmp_adev;
>       struct amdgpu_hive_info *hive;
>       struct list_head device_list;
> +     struct pci_dev *link_dev;
>       int r = 0, i;
>       u32 memsize;
> +     u16 status;
>
>       dev_info(adev->dev, "PCI error: slot reset callback!!\n");
>
>       memset(&reset_context, 0, sizeof(reset_context));
>
> +     if (adev->pcie_reset_ctx.swus)
> +             link_dev = adev->pcie_reset_ctx.swus;
> +     else
> +             link_dev = adev->pdev;
>       /* wait for asic to come out of reset */
> -     msleep(700);
> +     do {
> +             usleep_range(10000, 10500);
> +             r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
> +     } while ((status != PCI_VENDOR_ID_ATI) &&
> +              (status != PCI_VENDOR_ID_AMD));
>
> +     amdgpu_device_load_switch_state(adev);
>       /* Restore PCI confspace */
>       amdgpu_device_load_pci_state(pdev);
>
> @@ -7074,6 +7088,58 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
>       }
>  }
>
> +static void amdgpu_device_cache_switch_state(struct amdgpu_device
> +*adev) {
> +     struct pci_dev *parent = pci_upstream_bridge(adev->pdev);
> +     int r;
> +
> +     if (parent->vendor != PCI_VENDOR_ID_ATI)
> +             return;
> +
> +     /* If already saved, return */
> +     if (adev->pcie_reset_ctx.swus)
> +             return;
> +     /* Upstream bridge is ATI, assume it's SWUS/DS architecture */
> +     r = pci_save_state(parent);
> +     if (r)
> +             return;
> +     adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(parent);
> +
> +     parent = pci_upstream_bridge(parent);
> +     r = pci_save_state(parent);
> +     if (r)
> +             return;
> +     adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(parent);
> +
> +     adev->pcie_reset_ctx.swus = parent;
> +}
> +
> +static void amdgpu_device_load_switch_state(struct amdgpu_device
> +*adev) {
> +     struct pci_dev *pdev;
> +     int r;
> +
> +     if (!adev->pcie_reset_ctx.swds_pcistate ||
> +         !adev->pcie_reset_ctx.swus_pcistate)
> +             return;
> +
> +     pdev = adev->pcie_reset_ctx.swus;
> +     r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate);
> +     if (!r) {
> +             pci_restore_state(pdev);
> +     } else {
> +             dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r);
> +             return;
> +     }
> +
> +     pdev = pci_upstream_bridge(adev->pdev);
> +     r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate);
> +     if (!r)
> +             pci_restore_state(pdev);
> +     else
> +             dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); }
> +
>  bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)  {
>       struct drm_device *dev = pci_get_drvdata(pdev); @@ -7098,6 +7164,8
> @@ bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
>               return false;
>       }
>
> +     amdgpu_device_cache_switch_state(adev);
> +
>       return true;
>  }
>



More information about the amd-gfx mailing list