[RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler

Nirmoy Das nirmoy.das at amd.com
Tue Aug 11 13:30:53 UTC 2020


Add a PCI error handler that ignores non-fatal errors and tries to
stop amdgpu's software stack on fatal errors.

Signed-off-by: Nirmoy Das <nirmoy.das at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c1219af2e7d6..2b9ede3000ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -35,6 +35,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/vga_switcheroo.h>
 #include <drm/drm_probe_helper.h>
+#include <drm/drm_atomic_helper.h>
 #include <linux/mmu_notifier.h>
 
 #include "amdgpu.h"
@@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
 	.patchlevel = KMS_DRIVER_PATCHLEVEL,
 };
 
+/*
+ * PCI error_detected callback. Non-fatal errors (channel still in
+ * pci_channel_io_normal) are ignored and reported as recoverable. Any other
+ * channel state is treated as fatal: the amdgpu sw stack is stopped and
+ * PCI_ERS_RESULT_DISCONNECT is returned.
+ */
+static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
+						pci_channel_state_t state)
+{
+	struct drm_device *dev = pci_get_drvdata(pdev);
+	struct amdgpu_device *adev = dev->dev_private;
+	int i;
+
+	if (state == pci_channel_io_normal)
+		return PCI_ERS_RESULT_CAN_RECOVER;
+
+	/* Disable power management */
+	adev->runpm = 0;
+	/* Suspend all IO operations */
+	amdgpu_fbdev_set_suspend(adev, 1);
+	cancel_delayed_work_sync(&adev->delayed_init_work);
+	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+		struct amdgpu_ring *ring = adev->rings[i];
+
+		if (ring && ring->sched.thread)
+			amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
+	}
+
+	if (adev->mode_info.mode_config_initialized) {
+		if (!amdgpu_device_has_dc_support(adev))
+			drm_helper_force_disable_all(adev->ddev);
+		else
+			drm_atomic_helper_shutdown(adev->ddev);
+	}
+
+	amdgpu_fence_driver_fini(adev);
+	amdgpu_fbdev_fini(adev);
+	/*
+	 * Try to close drm device to stop applications
+	 * from opening dri files for further IO operations.
+	 * TODO: This will throw warning as ttm is not cleaned properly
+	 */
+	drm_dev_fini(dev);
+
+	return PCI_ERS_RESULT_DISCONNECT;
+}
+
+static const struct pci_error_handlers amdgpu_err_handler = {
+	.error_detected = amdgpu_pci_err_detected,
+};
+
+
 static struct pci_driver amdgpu_kms_pci_driver = {
 	.name = DRIVER_NAME,
 	.id_table = pciidlist,
@@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
 	.remove = amdgpu_pci_remove,
 	.shutdown = amdgpu_pci_shutdown,
 	.driver.pm = &amdgpu_pm_ops,
+	.err_handler = &amdgpu_err_handler,
 };
 
-
-
 static int __init amdgpu_init(void)
 {
 	int r;
-- 
2.27.0



More information about the amd-gfx mailing list