[PATCH] drm/amdgpu: Register bad page handler for Aldebaran

Wed May 12 01:30:58 UTC 2021

On Aldebaran, GPU driver will handle bad page retirement
even though UMC is host managed. As a result, register a
bad page retirement handler on the mce notifier chain to
retire bad pages on Aldebaran.

Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
Reviewed-by: John Clements <John.Clements at amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 154 ++++++++++++++++++++++++
 1 file changed, 154 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b1c57a5b6e89..02263f509b36 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,7 +34,9 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
+#include <asm/mce.h>
 
+static bool notifier_registered;
 static const char *RAS_FS_NAME = "ras";
 
 const char *ras_error_string[] = {
@@ -73,6 +75,11 @@ const char *ras_block_string[] = {
 /* typical ECC bad page rate(1 bad page per 100MB VRAM) */
 #define RAS_BAD_PAGE_RATE		(100 * 1024 * 1024ULL)
 
+#define GET_MCA_IPID_GPUID(m)		(((m) >> 44) & 0xF)
+#define GET_UMC_INST_NIBBLE(m)		(((m) >> 20) & 0xF)
+#define GET_CHAN_INDEX_NIBBLE(m)	(((m) >> 12) & 0xF)
+#define GPU_ID_OFFSET			8
+
 enum amdgpu_ras_retire_page_reservation {
 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 	AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -85,6 +92,7 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
+static void amdgpu_register_bad_pages_mca_notifier(void);
 
 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
 {
@@ -1978,6 +1986,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 			goto free;
 	}
 
+	if ((adev->asic_type == CHIP_ALDEBARAN) &&
+	    (adev->gmc.xgmi.connected_to_cpu))
+		amdgpu_register_bad_pages_mca_notifier();
+
 	return 0;
 
 free:
@@ -2427,3 +2439,145 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
 		kfree(con);
 	}
 }
+
+static struct amdgpu_device *find_adev(uint32_t node_id)
+{
+	struct amdgpu_gpu_instance *gpu_instance;
+	int i;
+	struct amdgpu_device *adev = NULL;
+
+	mutex_lock(&mgpu_info.mutex);
+
+	for (i = 0; i < mgpu_info.num_gpu; i++) {
+		gpu_instance = &(mgpu_info.gpu_ins[i]);
+		adev = gpu_instance->adev;
+
+		if (adev->gmc.xgmi.connected_to_cpu &&
+		    adev->gmc.xgmi.physical_node_id == node_id)
+			break;
+		adev = NULL;
+	}
+
+	mutex_unlock(&mgpu_info.mutex);
+
+	return adev;
+}
+
+static void find_umc_inst_chan_index(struct mce *m, uint32_t *umc_inst,
+				     uint32_t *chan_index)
+{
+	uint32_t val1 = 0;
+	uint32_t val2 = 0;
+	uint32_t rem = 0;
+
+	/*
+	 * Bit 20-23 provides the UMC instance nibble.
+	 * Bit 12-15 provides the channel index nibble.
+	 */
+	val1 = GET_UMC_INST_NIBBLE(m->ipid);
+	val2 = GET_CHAN_INDEX_NIBBLE(m->ipid);
+
+	*umc_inst = val1/2;
+	rem = val1%2;
+
+	*chan_index = (4*rem) + val2;
+}
+
+static int amdgpu_bad_page_notifier(struct notifier_block *nb,
+				    unsigned long val, void *data)
+{
+	struct mce *m = (struct mce *)data;
+	struct amdgpu_device *adev = NULL;
+	uint32_t gpu_id = 0;
+	uint32_t umc_inst = 0;
+	uint32_t chan_index = 0;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct eeprom_table_record err_rec;
+	uint64_t retired_page;
+
+	/*
+	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
+	 * and error occurred in DramECC (Extended error code = 0) then only
+	 * process the error, else bail out.
+	 */
+	if (!m || !(is_smca_umc_v2(m->bank) && (XEC(m->status, 0x1f) == 0x0)))
+		return NOTIFY_DONE;
+
+	gpu_id = GET_MCA_IPID_GPUID(m->ipid);
+
+	/*
+	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
+	 */
+	gpu_id -= GPU_ID_OFFSET;
+
+	adev = find_adev(gpu_id);
+	if (!adev) {
+		dev_warn(adev->dev, "%s: Unable to find adev for gpu_id: %d\n",
+				     __func__, gpu_id);
+		return NOTIFY_DONE;
+	}
+
+	/*
+	 * If it is correctable error, then print a message and return.
+	 */
+	if (mce_is_correctable(m)) {
+		dev_info(adev->dev, "%s: UMC Correctable error detected.",
+				    __func__);
+		return NOTIFY_OK;
+	}
+
+	/*
+	 * If it is uncorrectable error, then find out UMC instance and
+	 * channel index.
+	 */
+	find_umc_inst_chan_index(m, &umc_inst, &chan_index);
+
+	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d,"
+			    " chan_idx: %d", umc_inst, chan_index);
+
+	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
+			ADDR_OF_256B_BLOCK(chan_index) |
+			OFFSET_IN_256B_BLOCK(m->addr);
+
+	err_rec.address = m->addr;
+	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+	err_rec.ts = (uint64_t)ktime_get_real_seconds();
+	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+	err_rec.cu = 0;
+	err_rec.mem_channel = chan_index;
+	err_rec.mcumc_id = umc_inst;
+
+	err_data.err_addr = &err_rec;
+	err_data.err_addr_cnt = 1;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block amdgpu_bad_page_nb = {
+	.notifier_call  = amdgpu_bad_page_notifier,
+	.priority       = MCE_PRIO_ACCEL,
+};
+
+static void amdgpu_register_bad_pages_mca_notifier(void)
+{
+	/*
+	 * Register the x86 notifier with MCE subsystem.
+	 * Please note a notifier can be registered only once
+	 * with the MCE subsystem.
+	 */
+	if (notifier_registered == false) {
+		mce_register_decode_chain(&amdgpu_bad_page_nb);
+		notifier_registered = true;
+	}
+}
-- 
2.17.1