[RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
Nirmoy Das
nirmoy.das at amd.com
Tue Aug 11 13:30:53 UTC 2020
This patch will ignore non-fatal errors and try to
stop amdgpu's sw stack on fatal errors.
Signed-off-by: Nirmoy Das <nirmoy.das at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
1 file changed, 54 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c1219af2e7d6..2b9ede3000ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -35,6 +35,7 @@
#include <linux/pm_runtime.h>
#include <linux/vga_switcheroo.h>
#include <drm/drm_probe_helper.h>
+#include <drm/drm_atomic_helper.h>
#include <linux/mmu_notifier.h>
#include "amdgpu.h"
@@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
.patchlevel = KMS_DRIVER_PATCHLEVEL,
};
+static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
+{
+ struct drm_device *dev = pci_get_drvdata(pdev);
+ struct amdgpu_device *adev = dev->dev_private;
+ int i;
+ int ret = PCI_ERS_RESULT_DISCONNECT;
+
+ switch (state) {
+ case pci_channel_io_normal:
+ ret = PCI_ERS_RESULT_CAN_RECOVER;
+ break;
+ default:
+ /* Disable power management */
+ adev->runpm = 0;
+ /* Suspend all IO operations */
+ amdgpu_fbdev_set_suspend(adev, 1);
+ cancel_delayed_work_sync(&adev->delayed_init_work);
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = adev->rings[i];
+
+ if (!ring || !ring->sched.thread)
+ continue;
+
+ amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
+ }
+
+ if (adev->mode_info.mode_config_initialized) {
+ if (!amdgpu_device_has_dc_support(adev))
+ drm_helper_force_disable_all(adev->ddev);
+ else
+ drm_atomic_helper_shutdown(adev->ddev);
+ }
+
+ amdgpu_fence_driver_fini(adev);
+ amdgpu_fbdev_fini(adev);
+ /* Try to close drm device to stop applications
+ * from opening dri files for further IO operations.
+ * TODO: This will throw warning as ttm is not
+ * cleaned perperly */
+ drm_dev_fini(dev);
+ break;
+ }
+
+ return ret;
+}
+
+static const struct pci_error_handlers amdgpu_err_handler = {
+ .error_detected = amdgpu_pci_err_detected,
+};
+
+
static struct pci_driver amdgpu_kms_pci_driver = {
.name = DRIVER_NAME,
.id_table = pciidlist,
@@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
.remove = amdgpu_pci_remove,
.shutdown = amdgpu_pci_shutdown,
.driver.pm = &amdgpu_pm_ops,
+ .err_handler = &amdgpu_err_handler,
};
-
-
static int __init amdgpu_init(void)
{
int r;
--
2.27.0
More information about the amd-gfx
mailing list