[PATCH v5 19/27] drm/amdgpu: Finalize device fences on device remove.
Andrey Grodzovsky
andrey.grodzovsky at amd.com
Wed Apr 28 15:11:59 UTC 2021
Make sure all fences dependent on the HW being present are force signaled
when handling device removal. This helps later to scope all HW
accessing code such as IOCTLs in drm_dev_enter/exit and use
drm_dev_unplug as a synchronization point past which we know the HW
will not be accessed anymore outside of the pci remove driver callback.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98 ++++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 12 +--
4 files changed, 103 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0db0ba4fba89..df6c5ed676b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1374,6 +1374,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev);
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev);
bool amdgpu_device_load_pci_state(struct pci_dev *pdev);
+void amdgpu_finilize_device_fences(struct drm_device *dev);
+
#include "amdgpu_object.h"
static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 33e8e9e1d1fe..55afc11c17e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3692,15 +3692,12 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_virt_fini_data_exchange(adev);
}
- /* disable all interrupts */
- amdgpu_irq_disable_all(adev);
if (adev->mode_info.mode_config_initialized){
if (!amdgpu_device_has_dc_support(adev))
drm_helper_force_disable_all(adev_to_drm(adev));
else
drm_atomic_helper_shutdown(adev_to_drm(adev));
}
- amdgpu_fence_driver_fini_hw(adev);
if (adev->pm_sysfs_en)
amdgpu_pm_sysfs_fini(adev);
@@ -4567,14 +4564,19 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
return true;
}
-static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+static void amdgpu_device_unlock_adev_imp(struct amdgpu_device *adev, bool skip_in_gpu_reset)
{
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
- atomic_set(&adev->in_gpu_reset, 0);
+ !skip_in_gpu_reset ? atomic_set(&adev->in_gpu_reset, 0) : 0;
up_write(&adev->reset_sem);
}
+static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+{
+ amdgpu_device_unlock_adev_imp(adev, false);
+}
+
/*
* to lockup a list of amdgpu devices in a hive safely, if not a hive
* with multiple nodes, it will be similar as amdgpu_device_lock_adev.
@@ -5321,3 +5323,89 @@ bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
}
+static void amdgpu_finilize_schedulded_fences(struct amdgpu_ctx_mgr *mgr)
+{
+ struct amdgpu_ctx *ctx;
+ struct idr *idp;
+ uint32_t id, i, j;
+
+ idp = &mgr->ctx_handles;
+
+ idr_for_each_entry(idp, ctx, id) {
+ for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+ for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+ struct drm_sched_entity *entity;
+
+ if (!ctx->entities[i][j])
+ continue;
+
+ entity = &ctx->entities[i][j]->entity;
+ drm_sched_entity_kill_jobs(entity);
+ }
+ }
+ }
+}
+
+/**
+ * amdgpu_finilize_device_fences() - Finalize all device fences
+ * @dev: pointer to the DRM device
+ *
+ * Will disable and finalize ISRs and will signal all fences
+ * that might hang if the HW is gone
+ */
+void amdgpu_finilize_device_fences(struct drm_device *dev)
+{
+ struct amdgpu_device *adev = drm_to_adev(dev);
+ struct drm_file *file;
+
+ /*
+ * Block TDRs from further execution by setting adev->in_gpu_reset
+ * instead of holding the full reset lock, in order to not deadlock
+ * further ahead against any thread locking the reset lock when we
+ * wait for its completion
+ */
+ while (!amdgpu_device_lock_adev(adev, NULL))
+ amdgpu_cancel_all_tdr(adev);
+ amdgpu_device_unlock_adev_imp(adev, true);
+
+
+ /* disable all HW interrupts */
+ amdgpu_irq_disable_all(adev);
+
+ /* stop and flush all in flight HW interrupts handlers */
+ disable_irq(pci_irq_vector(adev->pdev, 0));
+
+ /*
+ * Stop SW GPU schedulers and force completion on all HW fences. Since
+ * in the previous step all ISRs were disabled and completed, the
+ * HW fence array is idle (no insertions or extractions) and so it's
+ * safe to iterate over it below.
+ * After this step all HW fences in the system are signaled. As a
+ * result, all the scheduler 'finished' fences are signaled as well.
+ */
+ amdgpu_fence_driver_fini_hw(adev);
+
+ /*
+ * Reject any further jobs to any scheduler entity queue. After this
+ * step there are no new insertions and, because the schedulers are
+ * stopped, no new extractions either.
+ */
+ down_read(&adev->sched_fence_completion_sem);
+ adev->stop_job_submissions = true;
+ up_read(&adev->sched_fence_completion_sem);
+
+ /*
+ * Complete all scheduler 'scheduled' fences currently pending.
+ * It's OK if new contexts and sched entities are concurrently
+ * still created, as they will fail in pushing jobs to the SW queues
+ * and their scheduled fences will be signaled with an error
+ */
+ mutex_lock(&adev->ddev.filelist_mutex);
+ list_for_each_entry(file, &adev->ddev.filelist, lhead) {
+ struct amdgpu_fpriv *fpriv = file->driver_priv;
+ amdgpu_finilize_schedulded_fences(&fpriv->ctx_mgr);
+ }
+ mutex_unlock(&adev->ddev.filelist_mutex);
+}
+
+
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index f799c40d7e72..8a19b8dd02ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1249,6 +1249,12 @@ amdgpu_pci_remove(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
+ /*
+ * Force completion of all device related fences that might hang us when
+ * synchronizing SRCU in the following step.
+ */
+ amdgpu_finilize_device_fences(dev);
+
drm_dev_unplug(dev);
amdgpu_driver_unload_kms(dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 2670201e78d3..af592b28cd35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -526,7 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
*/
void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
{
- int i, r;
+ int i;
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -535,18 +535,10 @@ void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
continue;
/* Stop any new job submissions from sched before flushing the ring */
- /* TODO Handle amdgpu_job_submit_direct and amdgpu_amdkfd_submit_ib */
if (!ring->no_scheduler)
drm_sched_fini(&ring->sched);
- /* You can't wait for HW to signal if it's gone */
- if (!drm_dev_is_unplugged(&adev->ddev))
- r = amdgpu_fence_wait_empty(ring);
- else
- r = -ENODEV;
- /* no need to trigger GPU reset as we are unloading */
- if (r)
- amdgpu_fence_driver_force_completion(ring);
+ amdgpu_fence_driver_force_completion(ring);
if (ring->fence_drv.irq_src)
amdgpu_irq_put(adev, ring->fence_drv.irq_src,
--
2.25.1
More information about the amd-gfx
mailing list