[PATCH 1/7] drm/amdgpu: Implement DPC recovery
Andrey Grodzovsky
Andrey.Grodzovsky at amd.com
Thu Aug 27 13:38:10 UTC 2020
Indeed, I noticed this later and did the refactoring in patch 6 (merging patches 1 and 6 was very messy
because of the changes in between, so I just kept it as a separate patch).
Andrey
On 8/26/20 9:23 PM, Li, Dennis wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Andrey,
> I found that the sequence in amdgpu_pci_slot_reset is mostly similar to the one in amdgpu_do_asic_reset. Could you help us refactor them to reuse more code?
>
> Best Regards
> Dennis Li
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Andrey Grodzovsky
> Sent: Wednesday, August 26, 2020 10:46 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Das, Nirmoy <Nirmoy.Das at amd.com>
> Subject: [PATCH 1/7] drm/amdgpu: Implement DPC recovery
>
> Add DPC handlers with basic recovery functionality.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 181 ++++++++++++++++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 +-
> 3 files changed, 196 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 49ea9fa..3399242 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -49,6 +49,8 @@
> #include <linux/rbtree.h>
> #include <linux/hashtable.h>
> #include <linux/dma-fence.h>
> +#include <linux/pci.h>
> +#include <linux/aer.h>
>
> #include <drm/ttm/ttm_bo_api.h>
> #include <drm/ttm/ttm_bo_driver.h>
> @@ -1263,6 +1265,13 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return void amdgpu_register_gpu_instance(struct amdgpu_device *adev); void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev);
>
> +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev,
> + pci_channel_state_t state);
> +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev);
> +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev); void
> +amdgpu_pci_resume(struct pci_dev *pdev);
> +
> +
> #include "amdgpu_object.h"
>
> /* used by df_v3_6.c and amdgpu_pmu.c */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5a948ed..84f8d14 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -350,7 +350,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
> *
> * Returns the 8 bit value from the offset specified.
> */
> -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
> +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
> +
> if (offset < adev->rmmio_size)
> return (readb(adev->rmmio + offset));
> BUG();
> @@ -371,7 +373,9 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
> *
> * Writes the value specified to the offset specified.
> */
> -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
> +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset,
> +uint8_t value) {
> +
> if (offset < adev->rmmio_size)
> writeb(value, adev->rmmio + offset);
> else
> @@ -380,6 +384,7 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>
> void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) {
> +
> trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
>
> if ((reg * 4) < adev->rmmio_size)
> @@ -407,6 +412,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
> uint32_t acc_flags)
> {
> +
> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
> return amdgpu_kiq_wreg(adev, reg, v);
>
> @@ -461,6 +467,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
> */
> void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) {
> +
> if ((reg * 4) < adev->rio_mem_size)
> iowrite32(v, adev->rio_mem + (reg * 4));
> else {
> @@ -480,6 +487,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
> */
> u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) {
> +
> if (index < adev->doorbell.num_doorbells) {
> return readl(adev->doorbell.ptr + index);
> } else {
> @@ -500,6 +508,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
> */
> void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) {
> +
> if (index < adev->doorbell.num_doorbells) {
> writel(v, adev->doorbell.ptr + index);
> } else {
> @@ -518,6 +527,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
> */
> u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) {
> +
> if (index < adev->doorbell.num_doorbells) {
> return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
> } else {
> @@ -538,6 +548,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
> */
> void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) {
> +
> if (index < adev->doorbell.num_doorbells) {
> atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
> } else {
> @@ -2989,6 +3000,7 @@ static const struct attribute *amdgpu_dev_attributes[] = {
> NULL
> };
>
> +
> /**
> * amdgpu_device_init - initialize the driver
> *
> @@ -3207,6 +3219,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> }
> }
>
> + pci_enable_pcie_error_reporting(adev->ddev.pdev);
> +
> +
> /* Post card if necessary */
> if (amdgpu_device_need_post(adev)) {
> if (!adev->bios) {
> @@ -3359,6 +3374,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> if (r)
> dev_err(adev->dev, "amdgpu_pmu_init failed\n");
>
> + if (pci_save_state(pdev))
> + DRM_ERROR("Failed to save PCI state!!\n");
> +
> return 0;
>
> failed:
> @@ -4701,3 +4719,162 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>
> return 0;
> }
> +
> +/**
> + * amdgpu_pci_error_detected - Called when a PCI error is detected.
> + * @pdev: PCI device struct
> + * @state: PCI channel state
> + *
> + * Description: Called when a PCI error is detected.
> + *
> + * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
> + */
> +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev,
> +pci_channel_state_t state) {
> + struct drm_device *dev = pci_get_drvdata(pdev);
> + struct amdgpu_device *adev = drm_to_adev(dev);
> +
> + DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
> +
> + switch (state) {
> + case pci_channel_io_normal:
> + return PCI_ERS_RESULT_CAN_RECOVER;
> + case pci_channel_io_frozen: {
> + /* Fatal error, prepare for slot reset */
> +
> + amdgpu_device_lock_adev(adev);
> + return PCI_ERS_RESULT_NEED_RESET;
> + }
> + case pci_channel_io_perm_failure:
> + /* Permanent error, prepare for device removal */
> + return PCI_ERS_RESULT_DISCONNECT;
> + }
> + return PCI_ERS_RESULT_NEED_RESET;
> +}
> +
> +/**
> + * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
> + * @pdev: pointer to PCI device
> + */
> +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) {
> +
> + DRM_INFO("PCI error: mmio enabled callback!!\n");
> +
> + /* TODO - dump whatever for debugging purposes */
> +
> + /* This is called only if amdgpu_pci_error_detected returns
> + * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
> + * works, no need to reset slot.
> + */
> +
> + return PCI_ERS_RESULT_RECOVERED;
> +}
> +
> +/**
> + * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
> + * @pdev: PCI device struct
> + *
> + * Description: This routine is called by the pci error recovery
> + * code after the PCI slot has been reset, just before we
> + * should resume normal operations.
> + */
> +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) {
> + struct drm_device *dev = pci_get_drvdata(pdev);
> + struct amdgpu_device *adev = drm_to_adev(dev);
> + int r;
> + bool vram_lost;
> +
> + DRM_INFO("PCI error: slot reset callback!!\n");
> +
> + pci_restore_state(pdev);
> +
> + r = amdgpu_device_ip_suspend(adev);
> + if (r)
> + goto out;
> +
> +
> + /* post card */
> + r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
> + if (r)
> + goto out;
> +
> + r = amdgpu_device_ip_resume_phase1(adev);
> + if (r)
> + goto out;
> +
> + vram_lost = amdgpu_device_check_vram_lost(adev);
> + if (vram_lost) {
> + DRM_INFO("VRAM is lost due to GPU reset!\n");
> + amdgpu_inc_vram_lost(adev);
> + }
> +
> + r = amdgpu_gtt_mgr_recover(
> + &adev->mman.bdev.man[TTM_PL_TT]);
> + if (r)
> + goto out;
> +
> + r = amdgpu_device_fw_loading(adev);
> + if (r)
> + return r;
> +
> + r = amdgpu_device_ip_resume_phase2(adev);
> + if (r)
> + goto out;
> +
> + if (vram_lost)
> + amdgpu_device_fill_reset_magic(adev);
> +
> + /*
> + * Add this ASIC as tracked as reset was already
> + * complete successfully.
> + */
> + amdgpu_register_gpu_instance(adev);
> +
> + r = amdgpu_device_ip_late_init(adev);
> + if (r)
> + goto out;
> +
> + amdgpu_fbdev_set_suspend(adev, 0);
> +
> + /* must succeed. */
> + amdgpu_ras_resume(adev);
> +
> +
> + amdgpu_irq_gpu_reset_resume_helper(adev);
> + r = amdgpu_ib_ring_tests(adev);
> + if (r)
> + goto out;
> +
> + r = amdgpu_device_recover_vram(adev);
> +
> +out:
> +
> + if (!r)
> + DRM_INFO("PCIe error recovery succeeded\n");
> + else {
> + DRM_ERROR("PCIe error recovery failed, err:%d", r);
> + amdgpu_device_unlock_adev(adev);
> + }
> +
> + return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; }
> +
> +/**
> + * amdgpu_pci_resume() - resume normal ops after PCI reset
> + * @pdev: pointer to PCI device
> + *
> + * Called when the error recovery driver tells us that it's
> + * OK to resume normal operation. Unlocks the device that was
> + * locked when the PCI error was detected.
> + */
> +void amdgpu_pci_resume(struct pci_dev *pdev) {
> + struct drm_device *dev = pci_get_drvdata(pdev);
> + struct amdgpu_device *adev = drm_to_adev(dev);
> +
> + amdgpu_device_unlock_adev(adev);
> +
> + DRM_INFO("PCI error: resume callback!!\n"); }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index d984c6a..4bbcc70 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -31,7 +31,6 @@
> #include <drm/drm_pciids.h>
> #include <linux/console.h>
> #include <linux/module.h>
> -#include <linux/pci.h>
> #include <linux/pm_runtime.h>
> #include <linux/vga_switcheroo.h>
> #include <drm/drm_probe_helper.h>
> @@ -1534,6 +1533,13 @@ static struct drm_driver kms_driver = {
> .patchlevel = KMS_DRIVER_PATCHLEVEL,
> };
>
> +static struct pci_error_handlers amdgpu_pci_err_handler = {
> + .error_detected = amdgpu_pci_error_detected,
> + .mmio_enabled = amdgpu_pci_mmio_enabled,
> + .slot_reset = amdgpu_pci_slot_reset,
> + .resume = amdgpu_pci_resume,
> +};
> +
> static struct pci_driver amdgpu_kms_pci_driver = {
> .name = DRIVER_NAME,
> .id_table = pciidlist,
> @@ -1541,6 +1547,7 @@ static struct pci_driver amdgpu_kms_pci_driver = {
> .remove = amdgpu_pci_remove,
> .shutdown = amdgpu_pci_shutdown,
> .driver.pm = &amdgpu_pm_ops,
> + .err_handler = &amdgpu_pci_err_handler,
> };
>
> static int __init amdgpu_init(void)
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CDennis.Li%40amd.com%7Cfbaac227eff74bcab03108d849cedd81%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637340500116686286&sdata=lYjIk50kyGIHG7UaAzx%2FswghLD8gc2DdCXUmTNj4EkU%3D&reserved=0
More information about the amd-gfx mailing list