[PATCH 01/23] drm/amdgpu: add flag to indicate nps mode switch

Tao Zhou tao.zhou1 at amd.com
Fri Nov 8 11:14:01 UTC 2024


There are two types of GPU reset: NPS mode switch and normal
GPU reset. Add a flag to distinguish them.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  9 +++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 13 ++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  2 +-
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9365b43c0055..ba9b0d322b33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1681,6 +1681,7 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
 }
 
 int amdgpu_in_reset(struct amdgpu_device *adev);
+int amdgpu_in_nps_switch(struct amdgpu_device *adev);
 
 extern const struct attribute_group amdgpu_vram_mgr_attr_group;
 extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 286f0fdfcb50..d69fcbb28b0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5862,7 +5862,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* We need to lock reset domain only once both for XGMI and single device */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 				    reset_list);
-	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+	amdgpu_device_lock_reset_domain(tmp_adev);
 
 	/* block all schedulers and reset given job's ring */
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
@@ -6343,7 +6343,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 		 * Locking adev->reset_domain->sem will prevent any external access
 		 * to GPU during PCI error recovery
 		 */
-		amdgpu_device_lock_reset_domain(adev->reset_domain);
+		amdgpu_device_lock_reset_domain(adev);
 		amdgpu_device_set_mp1_state(adev);
 
 		/*
@@ -6579,6 +6579,11 @@ int amdgpu_in_reset(struct amdgpu_device *adev)
 	return atomic_read(&adev->reset_domain->in_gpu_reset);
 }
 
+int amdgpu_in_nps_switch(struct amdgpu_device *adev)
+{
+	return atomic_read(&adev->reset_domain->in_nps_switch);
+}
+
 /**
  * amdgpu_device_halt() - bring hardware to some kind of halt state
  *
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index f4c08fa83756..1becf8fbbc71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -301,15 +301,25 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
 
 	INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work);
 	atomic_set(&reset_domain->in_gpu_reset, 0);
+	atomic_set(&reset_domain->in_nps_switch, 0);
 	atomic_set(&reset_domain->reset_res, 0);
 	init_rwsem(&reset_domain->sem);
 
 	return reset_domain;
 }
 
-void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
+void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev)
 {
+	struct amdgpu_reset_domain *reset_domain = adev->reset_domain;
+
 	atomic_set(&reset_domain->in_gpu_reset, 1);
+	/* The life time of in_nps_switch is longer than
+	 * amdgpu_device_nps_switch_needed
+	 */
+	if (adev->nbio.funcs && adev->nbio.funcs->is_nps_switch_requested &&
+	    adev->nbio.funcs->is_nps_switch_requested(adev))
+		atomic_set(&reset_domain->in_nps_switch, 1);
+
 	down_write(&reset_domain->sem);
 }
 
@@ -317,6 +327,7 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
 void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
 {
 	atomic_set(&reset_domain->in_gpu_reset, 0);
+	atomic_set(&reset_domain->in_nps_switch, 0);
 	up_write(&reset_domain->sem);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 977b2dd2205a..c74a1f88f0ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -97,6 +97,7 @@ struct amdgpu_reset_domain {
 	enum amdgpu_reset_domain_type type;
 	struct rw_semaphore sem;
 	atomic_t in_gpu_reset;
+	atomic_t in_nps_switch;
 	atomic_t reset_res;
 	struct work_struct clear;
 	bool drain;
@@ -158,7 +159,7 @@ static inline bool amdgpu_reset_pending(struct amdgpu_reset_domain *domain)
 	return rwsem_is_contended(&domain->sem);
 }
 
-void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev);
 
 void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index daa69dfb4dca..8387e075c385 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1540,7 +1540,7 @@ static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)
 
 	tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
 				    reset_list);
-	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+	amdgpu_device_lock_reset_domain(tmp_adev);
 
 	reset_context.method = AMD_RESET_METHOD_ON_INIT;
 	reset_context.reset_req_dev = tmp_adev;
-- 
2.34.1



More information about the amd-gfx mailing list