[PATCH 09/22] drm/amdgpu: fix kmd reload bug on bare-metal

Monk Liu Monk.Liu at amd.com
Mon Feb 26 05:18:07 UTC 2018


issue:
on bare-metal, when doing a kmd reload test, there is a chance
that the kernel hits a fatal error after the driver is unloaded/reloaded

fix:
the cause is that the "idle work" items are not really stopped, and
if kmd is unloaded too quickly there is a chance that
"idle work" runs after the driver structures are already released, which
introduces this issue.

Change-Id: Idb0f7db771e7ca60dba925d1d0f48b1de08dc89e
Signed-off-by: Monk Liu <Monk.Liu at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c    | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c    | 4 +++-
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 54145ec..69fb5e50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1419,7 +1419,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 		}
 	}
 
-	mod_delayed_work(system_wq, &adev->late_init_work,
+	if (!amdgpu_sriov_vf(adev))
+		mod_delayed_work(system_wq, &adev->late_init_work,
 			msecs_to_jiffies(AMDGPU_RESUME_MS));
 
 	amdgpu_device_fill_reset_magic(adev);
@@ -2087,7 +2088,11 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 		adev->firmware.gpu_info_fw = NULL;
 	}
 	adev->accel_working = false;
-	cancel_delayed_work_sync(&adev->late_init_work);
+
+	if (!amdgpu_sriov_vf(adev))
+		while (cancel_delayed_work_sync(&adev->late_init_work))
+			schedule(); /* to make sure late_init_work really stopped */
+
 	/* free i2c buses */
 	if (!amdgpu_device_has_dc_support(adev))
 		amdgpu_i2c_fini(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index caba610..337db57 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -299,7 +299,8 @@ int amdgpu_uvd_suspend(struct amdgpu_device *adev)
 		return 0;
 
 	if (!amdgpu_sriov_vf(adev))
-		cancel_delayed_work_sync(&adev->uvd.idle_work);
+		while (cancel_delayed_work_sync(&adev->uvd.idle_work))
+			schedule(); /* to make sure idle work really stopped */
 
 	for (i = 0; i < adev->uvd.max_handles; ++i)
 		if (atomic_read(&adev->uvd.handles[i]))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index a829350..2874fda 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -243,7 +243,9 @@ int amdgpu_vce_suspend(struct amdgpu_device *adev)
 		return 0;
 
 	if (!amdgpu_sriov_vf(adev))
-		cancel_delayed_work_sync(&adev->vce.idle_work);
+		while (cancel_delayed_work_sync(&adev->vce.idle_work))
+			schedule(); /* to make sure the idle_work really stopped */
+
 	/* TODO: suspending running encoding sessions isn't supported */
 	return -EINVAL;
 }
-- 
2.7.4



More information about the amd-gfx mailing list