[PATCH 01/23] drm/amdgpu: add flag to indicate nps mode switch
Tao Zhou
tao.zhou1 at amd.com
Fri Nov 8 11:14:01 UTC 2024
There are two types of GPU reset: NPS mode switch and normal
GPU reset. Add a flag to distinguish them.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 13 ++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 3 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
5 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9365b43c0055..ba9b0d322b33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1681,6 +1681,7 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
}
int amdgpu_in_reset(struct amdgpu_device *adev);
+int amdgpu_in_nps_switch(struct amdgpu_device *adev);
extern const struct attribute_group amdgpu_vram_mgr_attr_group;
extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 286f0fdfcb50..d69fcbb28b0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5862,7 +5862,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* We need to lock reset domain only once both for XGMI and single device */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
- amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+ amdgpu_device_lock_reset_domain(tmp_adev);
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
@@ -6343,7 +6343,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
* Locking adev->reset_domain->sem will prevent any external access
* to GPU during PCI error recovery
*/
- amdgpu_device_lock_reset_domain(adev->reset_domain);
+ amdgpu_device_lock_reset_domain(adev);
amdgpu_device_set_mp1_state(adev);
/*
@@ -6579,6 +6579,11 @@ int amdgpu_in_reset(struct amdgpu_device *adev)
return atomic_read(&adev->reset_domain->in_gpu_reset);
}
+int amdgpu_in_nps_switch(struct amdgpu_device *adev)
+{
+ return atomic_read(&adev->reset_domain->in_nps_switch);
+}
+
/**
* amdgpu_device_halt() - bring hardware to some kind of halt state
*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index f4c08fa83756..1becf8fbbc71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -301,15 +301,25 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work);
atomic_set(&reset_domain->in_gpu_reset, 0);
+ atomic_set(&reset_domain->in_nps_switch, 0);
atomic_set(&reset_domain->reset_res, 0);
init_rwsem(&reset_domain->sem);
return reset_domain;
}
-void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
+void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev)
{
+ struct amdgpu_reset_domain *reset_domain = adev->reset_domain;
+
atomic_set(&reset_domain->in_gpu_reset, 1);
+ /* The life time of in_nps_switch is longer than
+ * amdgpu_device_nps_switch_needed
+ */
+ if (adev->nbio.funcs && adev->nbio.funcs->is_nps_switch_requested &&
+ adev->nbio.funcs->is_nps_switch_requested(adev))
+ atomic_set(&reset_domain->in_nps_switch, 1);
+
down_write(&reset_domain->sem);
}
@@ -317,6 +327,7 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
{
atomic_set(&reset_domain->in_gpu_reset, 0);
+ atomic_set(&reset_domain->in_nps_switch, 0);
up_write(&reset_domain->sem);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 977b2dd2205a..c74a1f88f0ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -97,6 +97,7 @@ struct amdgpu_reset_domain {
enum amdgpu_reset_domain_type type;
struct rw_semaphore sem;
atomic_t in_gpu_reset;
+ atomic_t in_nps_switch;
atomic_t reset_res;
struct work_struct clear;
bool drain;
@@ -158,7 +159,7 @@ static inline bool amdgpu_reset_pending(struct amdgpu_reset_domain *domain)
return rwsem_is_contended(&domain->sem);
}
-void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev);
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index daa69dfb4dca..8387e075c385 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1540,7 +1540,7 @@ static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)
tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
reset_list);
- amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+ amdgpu_device_lock_reset_domain(tmp_adev);
reset_context.method = AMD_RESET_METHOD_ON_INIT;
reset_context.reset_req_dev = tmp_adev;
--
2.34.1
More information about the amd-gfx
mailing list