[PATCH 1/2] drm/amdgpu: initialize ras structures for xgmi block

Chen, Guchun Guchun.Chen at amd.com
Tue Sep 10 03:22:59 UTC 2019


That's fine.

Regards,
Guchun

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang at amd.com> 
Sent: Tuesday, September 10, 2019 11:20 AM
To: Chen, Guchun <Guchun.Chen at amd.com>; amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>
Subject: RE: [PATCH 1/2] drm/amdgpu: initialize ras structures for xgmi block

I'd like to keep the conditional check in amdgpu_xgmi_ras_late_init internal so that it can be used anywhere without any conditional check from external. Please check v2.

Regards,
Hawking

-----Original Message-----
From: Chen, Guchun <Guchun.Chen at amd.com> 
Sent: 2019年9月9日 9:22
To: Zhang, Hawking <Hawking.Zhang at amd.com>; amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>
Subject: RE: [PATCH 1/2] drm/amdgpu: initialize ras structures for xgmi block



-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang at amd.com> 
Sent: Monday, September 9, 2019 6:07 AM
To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
Subject: [PATCH 1/2] drm/amdgpu: initialize ras structures for xgmi block

init ras common interface and fs node for xgmi block

Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h  |  1 +  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 ++++++++++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  7 +++++++
 4 files changed, 45 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 331ce50..f09bd30 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -120,6 +120,7 @@ struct amdgpu_xgmi {
 	/* gpu list in the same hive */
 	struct list_head head;
 	bool supported;
+	struct ras_common_if *ras_if;
 };
 
 struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 65aae75..7f6f2e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -25,6 +25,7 @@
 #include "amdgpu.h"
 #include "amdgpu_xgmi.h"
 #include "amdgpu_smu.h"
+#include "amdgpu_ras.h"
 #include "df/df_3_6_offset.h"
 
 static DEFINE_MUTEX(xgmi_mutex);
@@ -437,3 +438,38 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 		mutex_unlock(&hive->hive_lock);
 	}
 }
+
+int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) {
+	int r;
+	struct ras_ih_if ih_info = {
+		.cb = NULL,
+	};
+	struct ras_fs_if fs_info = {
+		.sysfs_name = "xgmi_wafl_err_count",
+		.debugfs_name = "xgmi_wafl_err_inject",
+	};
+
+	if (!adev->gmc.xgmi.supported ||
+	    adev->gmc.xgmi.num_physical_nodes == 0)
+		return 0;
[Guchun]Looks this check num_physical_nodes==0 is redundant, as there is already one check outside of this function.
Is it better to move these two conditions outside of this function, to allow function entrance once xgmi.supported is true and num_physical_nodes > 0?
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		r = amdgpu_xgmi_ras_late_init(adev);
		if (r)
			return r;
	}

+	if (!adev->gmc.xgmi.ras_if) {
+		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+		if (!adev->gmc.xgmi.ras_if)
+			return -ENOMEM;
+		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
+		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+		adev->gmc.xgmi.ras_if->sub_block_index = 0;
+		strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
+	}
+	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
+	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
+				 &fs_info, &ih_info);
+	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
+		kfree(adev->gmc.xgmi.ras_if);
+		adev->gmc.xgmi.ras_if = NULL;
+	}
+
+	return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index fbcee31..9023789 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -42,6 +42,7 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);  int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);  int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
 		struct amdgpu_device *peer_adev);
+int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev);
 
 static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
 		struct amdgpu_device *bo_adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index beb6c84..05a9a8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -51,6 +51,7 @@
 #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
 
 #include "amdgpu_ras.h"
+#include "amdgpu_xgmi.h"
 
 /* add these here since we already include dce12 headers and these are for DCN */
 #define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION                                                          0x055d
@@ -802,6 +803,12 @@ static int gmc_v9_0_ecc_late_init(void *handle)
 		if (r)
 			return r;
 	}
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		r = amdgpu_xgmi_ras_late_init(adev);
+		if (r)
+			return r;
+	}
 	return 0;
 }
 
--
2.7.4



More information about the amd-gfx mailing list