[PATCH 1/7] drm/amdgpu: Implement DPC recovery

Mon Aug 31 14:30:17 UTC 2020

On 8/28/20 10:07 PM, Luben Tuikov wrote:
> On 2020-08-26 10:46, Andrey Grodzovsky wrote:
>> Add DPC handlers with basic recovery functionality.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   9 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 181 ++++++++++++++++++++++++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   9 +-
>>   3 files changed, 196 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 49ea9fa..3399242 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -49,6 +49,8 @@
>>   #include <linux/rbtree.h>
>>   #include <linux/hashtable.h>
>>   #include <linux/dma-fence.h>
>> +#include <linux/pci.h>
>> +#include <linux/aer.h>
>>   
>>   #include <drm/ttm/ttm_bo_api.h>
>>   #include <drm/ttm/ttm_bo_driver.h>
>> @@ -1263,6 +1265,13 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return
>>   void amdgpu_register_gpu_instance(struct amdgpu_device *adev);
>>   void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev);
>>   
>> +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev,
>> +					   pci_channel_state_t state);
>> +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev);
>> +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev);
>> +void amdgpu_pci_resume(struct pci_dev *pdev);
>> +
>> +
>>   #include "amdgpu_object.h"
>>   
>>   /* used by df_v3_6.c and amdgpu_pmu.c */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 5a948ed..84f8d14 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -350,7 +350,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>>    *
>>    * Returns the 8 bit value from the offset specified.
>>    */
>> -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
>> +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>> +{
>> +
>>   	if (offset < adev->rmmio_size)
>>   		return (readb(adev->rmmio + offset));
>>   	BUG();
>> @@ -371,7 +373,9 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
>>    *
>>    * Writes the value specified to the offset specified.
>>    */
>> -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
>> +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>> +{
>> +
>>   	if (offset < adev->rmmio_size)
>>   		writeb(value, adev->rmmio + offset);
>>   	else
>> @@ -380,6 +384,7 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>>   
>>   void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
>>   {
>> +
>>   	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
>>   
>>   	if ((reg * 4) < adev->rmmio_size)
>> @@ -407,6 +412,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>>   void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>>   		    uint32_t acc_flags)
>>   {
>> +
>>   	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>>   		return amdgpu_kiq_wreg(adev, reg, v);
>>   
>> @@ -461,6 +467,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>>    */
>>   void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>>   {
>> +
>>   	if ((reg * 4) < adev->rio_mem_size)
>>   		iowrite32(v, adev->rio_mem + (reg * 4));
>>   	else {
>> @@ -480,6 +487,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>>    */
>>   u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>>   {
>> +
>>   	if (index < adev->doorbell.num_doorbells) {
>>   		return readl(adev->doorbell.ptr + index);
>>   	} else {
>> @@ -500,6 +508,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>>    */
>>   void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>>   {
>> +
>>   	if (index < adev->doorbell.num_doorbells) {
>>   		writel(v, adev->doorbell.ptr + index);
>>   	} else {
>> @@ -518,6 +527,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>>    */
>>   u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>>   {
>> +
>>   	if (index < adev->doorbell.num_doorbells) {
>>   		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
>>   	} else {
>> @@ -538,6 +548,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>>    */
>>   void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
>>   {
>> +
>>   	if (index < adev->doorbell.num_doorbells) {
>>   		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
>>   	} else {
>> @@ -2989,6 +3000,7 @@ static const struct attribute *amdgpu_dev_attributes[] = {
>>   	NULL
>>   };
>>   
>> +
>>   /**
>>    * amdgpu_device_init - initialize the driver
>>    *
>> @@ -3207,6 +3219,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>   		}
>>   	}
>>   
>> +	pci_enable_pcie_error_reporting(adev->ddev.pdev);
>> +
>> +
>>   	/* Post card if necessary */
>>   	if (amdgpu_device_need_post(adev)) {
>>   		if (!adev->bios) {
>> @@ -3359,6 +3374,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>   	if (r)
>>   		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
>>   
>> +	if (pci_save_state(pdev))
>> +		DRM_ERROR("Failed to save PCI state!!\n");
>> +
>>   	return 0;
>>   
>>   failed:
>> @@ -4701,3 +4719,162 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>>   
>>   	return 0;
>>   }
>> +
>> +/**
>> + * amdgpu_pci_error_detected - Called when a PCI error is detected.
>> + * @pdev: PCI device struct
>> + * @state: PCI channel state
>> + *
>> + * Description: Called when a PCI error is detected.
>> + *
>> + * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
>> + */
>> +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
>> +{
>> +	struct drm_device *dev = pci_get_drvdata(pdev);
>> +	struct amdgpu_device *adev = drm_to_adev(dev);
>> +
>> +	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
>> +
>> +	switch (state) {
>> +	case pci_channel_io_normal:
>> +		return PCI_ERS_RESULT_CAN_RECOVER;
>> +	case pci_channel_io_frozen: {
>> +		/* Fatal error, prepare for slot reset */
>> +
>> +		amdgpu_device_lock_adev(adev);
>> +		return PCI_ERS_RESULT_NEED_RESET;
>> +	}
>> +	case pci_channel_io_perm_failure:
>> +		/* Permanent error, prepare for device removal */
>> +		return PCI_ERS_RESULT_DISCONNECT;
>> +	}
>> +	return PCI_ERS_RESULT_NEED_RESET;
>> +}
> Perhaps an empty line before the "return".
>
>> +
>> +/**
>> + * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
>> + * @pdev: pointer to PCI device
>> + */
>> +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
>> +{
>> +
>> +	DRM_INFO("PCI error: mmio enabled callback!!\n");
>> +
>> +	/* TODO - dump whatever for debugging purposes */
>> +
>> +	/* This called only if amdgpu_pci_error_detected returns
>> +	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
>> +	 * works, no need to reset slot.
>> +	 */
>> +
>> +	return PCI_ERS_RESULT_RECOVERED;
>> +}
>> +
>> +/**
>> + * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
>> + * @pdev: PCI device struct
>> + *
>> + * Description: This routine is called by the pci error recovery
>> + * code after the PCI slot has been reset, just before we
>> + * should resume normal operations.
>> + */
>> +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>> +{
>> +	struct drm_device *dev = pci_get_drvdata(pdev);
>> +	struct amdgpu_device *adev = drm_to_adev(dev);
>> +	int r;
>> +	bool vram_lost;
>> +
>> +	DRM_INFO("PCI error: slot reset callback!!\n");
>> +
>> +	pci_restore_state(pdev);
>> +
>> +	r = amdgpu_device_ip_suspend(adev);
>> +	if (r)
>> +		goto out;
>> +
>> +
>> +	/* post card */
>> +	r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
>> +	if (r)
>> +		goto out;
>> +
>> +	r = amdgpu_device_ip_resume_phase1(adev);
>> +	if (r)
>> +		goto out;
>> +
>> +	vram_lost = amdgpu_device_check_vram_lost(adev);
>> +	if (vram_lost) {
>> +		DRM_INFO("VRAM is lost due to GPU reset!\n");
>> +		amdgpu_inc_vram_lost(adev);
>> +	}
>> +
>> +	r = amdgpu_gtt_mgr_recover(
>> +		&adev->mman.bdev.man[TTM_PL_TT]);
>> +	if (r)
>> +		goto out;
>> +
>> +	r = amdgpu_device_fw_loading(adev);
>> +	if (r)
>> +		return r;
>> +
>> +	r = amdgpu_device_ip_resume_phase2(adev);
>> +	if (r)
>> +		goto out;
>> +
>> +	if (vram_lost)
>> +		amdgpu_device_fill_reset_magic(adev);
>> +
>> +	/*
>> +	 * Add this ASIC as tracked as reset was already
>> +	 * complete successfully.
>> +	 */
>> +	amdgpu_register_gpu_instance(adev);
>> +
>> +	r = amdgpu_device_ip_late_init(adev);
>> +	if (r)
>> +		goto out;
>> +
>> +	amdgpu_fbdev_set_suspend(adev, 0);
>> +
>> +	/* must succeed. */
>> +	amdgpu_ras_resume(adev);
>> +
>> +
>> +	amdgpu_irq_gpu_reset_resume_helper(adev);
>> +	r = amdgpu_ib_ring_tests(adev);
>> +	if (r)
>> +		goto out;
>> +
>> +	r = amdgpu_device_recover_vram(adev);
>> +
>> +out:
>> +
>> +	if (!r)
>> +		DRM_INFO("PCIe error recovery succeeded\n");
>> +	else {
>> +		DRM_ERROR("PCIe error recovery failed, err:%d", r);
>> +		amdgpu_device_unlock_adev(adev);
>> +	}
> Add braces around the "if ()" if you're going to add them
> around the "else".
>
> It's a good idea to run patches through checkpatch.pl--as
> a general guidance--sometimes it finds good things.
>
> Regards,
> Luben

I did run it, I might have missed this of course.

Andrey

>
>> +
>> +	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
>> +}
>> +
>> +/**
>> + * amdgpu_pci_resume() - resume normal ops after PCI reset
>> + * @pdev: pointer to PCI device
>> + *
>> + * Called when the error recovery driver tells us that its
>> + * OK to resume normal operation. Use completion to allow
>> + * halted scsi ops to resume.
>> + */
>> +void amdgpu_pci_resume(struct pci_dev *pdev)
>> +{
>> +	struct drm_device *dev = pci_get_drvdata(pdev);
>> +	struct amdgpu_device *adev = drm_to_adev(dev);
>> +
>> +	amdgpu_device_unlock_adev(adev);
>> +
>> +	DRM_INFO("PCI error: resume callback!!\n");
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index d984c6a..4bbcc70 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -31,7 +31,6 @@
>>   #include <drm/drm_pciids.h>
>>   #include <linux/console.h>
>>   #include <linux/module.h>
>> -#include <linux/pci.h>
>>   #include <linux/pm_runtime.h>
>>   #include <linux/vga_switcheroo.h>
>>   #include <drm/drm_probe_helper.h>
>> @@ -1534,6 +1533,13 @@ static struct drm_driver kms_driver = {
>>   	.patchlevel = KMS_DRIVER_PATCHLEVEL,
>>   };
>>   
>> +static struct pci_error_handlers amdgpu_pci_err_handler = {
>> +	.error_detected	= amdgpu_pci_error_detected,
>> +	.mmio_enabled	= amdgpu_pci_mmio_enabled,
>> +	.slot_reset	= amdgpu_pci_slot_reset,
>> +	.resume		= amdgpu_pci_resume,
>> +};
>> +
>>   static struct pci_driver amdgpu_kms_pci_driver = {
>>   	.name = DRIVER_NAME,
>>   	.id_table = pciidlist,
>> @@ -1541,6 +1547,7 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>   	.remove = amdgpu_pci_remove,
>>   	.shutdown = amdgpu_pci_shutdown,
>>   	.driver.pm = &amdgpu_pm_ops,
>> +	.err_handler = &amdgpu_pci_err_handler,
>>   };
>>   
>>   static int __init amdgpu_init(void)
>>