[PATCH 06/10] drm/amdgpu: Refactor XGMI reset on init handling
Lijo Lazar
lijo.lazar at amd.com
Mon Sep 2 07:34:13 UTC 2024
Use XGMI hive information to rely on resetting XGMI devices on
initialization rather than using mgpu structure. mgpu structure may have
other devices as well.
Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +--
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 --
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 72 ++++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 14 +++--
drivers/gpu/drm/amd/amdgpu/soc15.c | 5 ++
6 files changed, 90 insertions(+), 19 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 468c4f590183..9f33de7ab656 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -155,7 +155,8 @@ struct amdgpu_init_level amdgpu_init_minimal = {
.level = AMDGPU_INIT_LEVEL_MINIMAL,
.hwini_ip_block_mask =
BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
- BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH)
+ BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
+ BIT(AMD_IP_BLOCK_TYPE_PSP)
};
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
@@ -2832,6 +2833,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
+ bool init_badpage;
int i, r;
r = amdgpu_ras_init(adev);
@@ -2945,7 +2947,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
* Note: theoretically, this should be called before all vram allocations
* to protect retired page from abusing
*/
- r = amdgpu_ras_recovery_init(adev, true);
+ init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL);
+ r = amdgpu_ras_recovery_init(adev, init_badpage);
if (r)
goto init_failed;
@@ -4501,8 +4504,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL)
- queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
- msecs_to_jiffies(AMDGPU_RESUME_MS));
+ amdgpu_xgmi_reset_on_init(adev);
amdgpu_device_check_iommu_direct_map(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 65c891b6b999..2c29f4c34e64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3216,12 +3216,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
- /* Todo: During test the SMU might fail to read the eeprom through I2C
- * when the GPU is pending on XGMI reset during probe time
- * (Mostly after second bus reset), skip it now
- */
- if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL)
- return 0;
if (init_bp_info) {
ret = amdgpu_ras_init_badpage_info(adev);
if (ret)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index a7a892512cb9..6a473a4262f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -860,8 +860,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.supported)
return 0;
- if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) &&
- amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+ if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
ret = psp_xgmi_initialize(&adev->psp, false, true);
if (ret) {
dev_err(adev->dev,
@@ -907,8 +906,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
task_barrier_add_task(&hive->tb);
- if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) &&
- amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+ if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
/* update node list for other device in the hive */
if (tmp_adev != adev) {
@@ -985,7 +983,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
}
}
- if (!ret && (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL))
+ if (!ret)
ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
exit_unlock:
@@ -1500,3 +1498,67 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
return 0;
}
+
+static void amdgpu_xgmi_roi_handler(struct work_struct *work)
+{
+ struct amdgpu_hive_info *hive =
+ container_of(work, struct amdgpu_hive_info, roi_work);
+ struct amdgpu_reset_context reset_context;
+ struct amdgpu_device *tmp_adev;
+ struct list_head device_list;
+ int r;
+
+ mutex_lock(&hive->hive_lock);
+
+ INIT_LIST_HEAD(&device_list);
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ list_add_tail(&tmp_adev->reset_list, &device_list);
+
+ tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
+ reset_list);
+ amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+
+ reset_context.method = AMD_RESET_METHOD_ON_INIT;
+ reset_context.reset_req_dev = tmp_adev;
+ reset_context.hive = hive;
+ reset_context.reset_device_list = &device_list;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
+ amdgpu_reset_xgmi_rst_on_init(&reset_context);
+ mutex_unlock(&hive->hive_lock);
+ amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+ r = amdgpu_ras_init_badpage_info(tmp_adev);
+ if (r && r != -EHWPOISON)
+ dev_err(tmp_adev->dev,
+ "error during bad page data initializtion");
+ }
+}
+
+static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive)
+{
+ INIT_WORK(&hive->roi_work, amdgpu_xgmi_roi_handler);
+ amdgpu_reset_domain_schedule(hive->reset_domain, &hive->roi_work);
+}
+
+int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev)
+{
+ struct amdgpu_hive_info *hive;
+ int r, num_devs;
+
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (!hive)
+ return -EINVAL;
+
+ mutex_lock(&hive->hive_lock);
+ num_devs = atomic_read(&hive->number_devices);
+ if (num_devs == adev->gmc.xgmi.num_physical_nodes)
+ amdgpu_xgmi_schedule_reset_on_init(hive);
+
+ mutex_unlock(&hive->hive_lock);
+ amdgpu_put_xgmi_hive(hive);
+
+ return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index a3bfc16de6d4..902c2f928653 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -45,6 +45,7 @@ struct amdgpu_hive_info {
struct amdgpu_reset_domain *reset_domain;
atomic_t ras_recovery;
struct ras_event_manager event_mgr;
+ struct work_struct roi_work;
};
struct amdgpu_pcs_ras_field {
@@ -75,5 +76,6 @@ static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
}
int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index c76ac0dfe572..bc30bc3b7851 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -2413,11 +2413,17 @@ static int gmc_v9_0_hw_fini(void *handle)
if (adev->mmhub.funcs->update_power_gating)
adev->mmhub.funcs->update_power_gating(adev, false);
- amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+ /*
+ * For minimal init, late_init is not called, hence VM fault/RAS irqs
+ * are not enabled.
+ */
+ if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) {
+ amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
- if (adev->gmc.ecc_irq.funcs &&
- amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
- amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+ if (adev->gmc.ecc_irq.funcs &&
+ amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+ amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+ }
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 8d16dacdc172..7901b3fbc127 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1295,7 +1295,12 @@ static int soc15_common_hw_fini(void *handle)
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_put_irq(adev);
+ /*
+ * For minimal init, late_init is not called, hence RAS irqs are not
+ * enabled.
+ */
if ((!amdgpu_sriov_vf(adev)) &&
+ (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) &&
adev->nbio.ras_if &&
amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
if (adev->nbio.ras &&
--
2.25.1
More information about the amd-gfx
mailing list