[PATCH 1/3] drm/amdgpu: add address conversion for UMC v12
Tao Zhou
tao.zhou1 at amd.com
Wed Sep 6 10:10:14 UTC 2023
Convert MCA error address to physical address and find out all pages in
one physical row.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 5 ++
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 97 ++++++++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 64 ++++++++++++++++
3 files changed, 162 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 43321f57f557..417a6726c71b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -32,6 +32,11 @@
* is the index of 8KB block
*/
#define ADDR_OF_8KB_BLOCK(addr) (((addr) & ~0xffULL) << 5)
+/*
+ * (addr / 256) * 32768, the higher 26 bits in ErrorAddr
+ * is the index of 8KB block
+ */
+#define ADDR_OF_32KB_BLOCK(addr) (((addr) & ~0xffULL) << 7)
/* channel index is the index of 256B block */
#define ADDR_OF_256B_BLOCK(channel_index) ((channel_index) << 8)
/* offset in 256B block */
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 292159814340..2a135fd8ec15 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -27,6 +27,14 @@
#include "umc/umc_12_0_0_offset.h"
#include "umc/umc_12_0_0_sh_mask.h"
+/* mapping of MCA error address to normalized address */
+static const uint32_t umc_v12_0_ma2na_mapping[] = {
+ 0, 5, 6, 8, 9, 14, 12, 13,
+ 10, 11, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28,
+ 24, 7, 29, 30,
+};
+
static inline uint32_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
uint32_t node_inst,
uint32_t umc_inst,
@@ -133,12 +141,93 @@ static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
umc_v12_0_reset_error_count(adev);
}
-static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
- struct ras_err_data *err_data, uint64_t err_addr,
- uint32_t ch_inst, uint32_t umc_inst,
- uint32_t node_inst, uint64_t mc_umc_status)
+static bool umc_v12_0_bit_wise_xor(uint32_t val)
{
+ bool result = 0;
+ int i;
+ for (i = 0; i < 32; i++)
+ result = result ^ ((val >> i) & 0x1);
+
+ return result;
+}
+
+static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
+ struct ras_err_data *err_data, uint64_t err_addr,
+ uint32_t ch_inst, uint32_t umc_inst,
+ uint32_t node_inst, uint64_t mc_umc_status)
+{
+ uint32_t channel_index, i;
+ uint64_t soc_pa, na, retired_page, column;
+ uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row;
+ uint32_t bank0, bank1, bank2, bank3, bank;
+
+ bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
+ bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL;
+ bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL;
+ bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL;
+ col = (err_addr >> 1) & 0x1fULL;
+ row = (err_addr >> 10) & 0x3fffULL;
+
+ /* apply bank hash algorithm */
+ bank0 =
+ bank_hash0 ^ (UMC_V12_0_XOR_EN0 &
+ (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^
+ (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0))));
+ bank1 =
+ bank_hash1 ^ (UMC_V12_0_XOR_EN1 &
+ (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^
+ (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1))));
+ bank2 =
+ bank_hash2 ^ (UMC_V12_0_XOR_EN2 &
+ (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^
+ (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2))));
+ bank3 =
+ bank_hash3 ^ (UMC_V12_0_XOR_EN3 &
+ (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^
+ (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3))));
+
+ bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3);
+ err_addr &= ~0x3c0ULL;
+ err_addr |= (bank << UMC_V12_0_MCA_B0_BIT);
+
+ na = 0x0;
+ /* convert mca error address to normalized address */
+ for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++)
+ na |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i];
+
+ channel_index =
+ adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
+ adev->umc.channel_inst_num +
+ umc_inst * adev->umc.channel_inst_num +
+ ch_inst];
+ /* translate umc channel address to soc pa, 3 parts are included */
+ soc_pa = ADDR_OF_32KB_BLOCK(na) |
+ ADDR_OF_256B_BLOCK(channel_index) |
+ OFFSET_IN_256B_BLOCK(na);
+
+ /* the umc channel bits are not original values, they are hashed */
+ UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa);
+
+ /* clear [C3 C2] in soc physical address */
+ soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
+ /* clear [C4] in soc physical address */
+ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+
+ /* loop for all possibilities of [C4 C3 C2] */
+ for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+ retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
+ retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+ dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
+ amdgpu_umc_fill_error_record(err_data, err_addr,
+ retired_page, channel_index, umc_inst);
+
+ /* shift R13 bit */
+ retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
+ dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
+ amdgpu_umc_fill_error_record(err_data, err_addr,
+ retired_page, channel_index, umc_inst);
+ }
}
static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index 3f822b6e0c99..c20b4b4cbfda 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -48,6 +48,70 @@
#define UMC_V12_0_TOTAL_CHANNEL_NUM(adev) \
(UMC_V12_0_CHANNEL_INSTANCE_NUM * (adev)->gmc.num_umc)
+/* one piece of normalized address is mapped to 8 pieces of physical address */
+#define UMC_V12_0_NA_MAP_PA_NUM 8
+/* bank bits in MCA error address */
+#define UMC_V12_0_MCA_B0_BIT 6
+#define UMC_V12_0_MCA_B1_BIT 7
+#define UMC_V12_0_MCA_B2_BIT 8
+#define UMC_V12_0_MCA_B3_BIT 9
+/* column bits in SOC physical address */
+#define UMC_V12_0_PA_C2_BIT 15
+#define UMC_V12_0_PA_C4_BIT 21
+/* row bits in SOC physical address */
+#define UMC_V12_0_PA_R13_BIT 35
+/* channel index bits in SOC physical address */
+#define UMC_V12_0_PA_CH4_BIT 12
+#define UMC_V12_0_PA_CH5_BIT 13
+#define UMC_V12_0_PA_CH6_BIT 14
+
+/* bank hash settings */
+#define UMC_V12_0_XOR_EN0 1
+#define UMC_V12_0_XOR_EN1 1
+#define UMC_V12_0_XOR_EN2 1
+#define UMC_V12_0_XOR_EN3 1
+#define UMC_V12_0_COL_XOR0 0x0
+#define UMC_V12_0_COL_XOR1 0x0
+#define UMC_V12_0_COL_XOR2 0x800
+#define UMC_V12_0_COL_XOR3 0x1000
+#define UMC_V12_0_ROW_XOR0 0x11111
+#define UMC_V12_0_ROW_XOR1 0x22222
+#define UMC_V12_0_ROW_XOR2 0x4444
+#define UMC_V12_0_ROW_XOR3 0x8888
+
+/* channel hash settings */
+#define UMC_V12_0_HASH_4K 0
+#define UMC_V12_0_HASH_64K 1
+#define UMC_V12_0_HASH_2M 1
+#define UMC_V12_0_HASH_1G 1
+#define UMC_V12_0_HASH_1T 1
+
+/* XOR some bits of PA into CH4~CH6 bits (bits 12~14 of PA),
+ * hash bit is only effective when related setting is enabled
+ */
+#define UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) ((((channel_idx) >> 5) & 0x1) ^ \
+ (((pa) >> 20) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
+ (((pa) >> 27) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
+ (((pa) >> 34) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
+ (((pa) >> 41) & 0x1ULL & UMC_V12_0_HASH_1T))
+#define UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) ((((channel_idx) >> 6) & 0x1) ^ \
+ (((pa) >> 21) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
+ (((pa) >> 28) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
+ (((pa) >> 35) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
+ (((pa) >> 42) & 0x1ULL & UMC_V12_0_HASH_1T))
+#define UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) ((((channel_idx) >> 4) & 0x1) ^ \
+ (((pa) >> 19) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
+ (((pa) >> 26) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
+ (((pa) >> 33) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
+ (((pa) >> 40) & 0x1ULL & UMC_V12_0_HASH_1T) ^ \
+ (((pa) >> 47) & 0x1ULL & UMC_V12_0_HASH_4K))
+#define UMC_V12_0_SET_CHANNEL_HASH(channel_idx, pa) do { \
+ (pa) &= ~(0x7ULL << UMC_V12_0_PA_CH4_BIT); \
+ (pa) |= (UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) << UMC_V12_0_PA_CH4_BIT); \
+ (pa) |= (UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) << UMC_V12_0_PA_CH5_BIT); \
+ (pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \
+ } while (0)
+
#define GET_CROSS_NODE_ADDR(reg) \
((((reg) >> 32) & 0x3) ? ((reg) | (1ULL << 34)) : (reg))
--
2.35.1
More information about the amd-gfx
mailing list