[PATCH 14/21] drm/amdgpu: save UMC global channel index to eeprom
Tao Zhou
tao.zhou1 at amd.com
Tue Nov 19 06:35:57 UTC 2024
Save the global channel index returned by RAS TA to eeprom.
We can get memory physical address by MCA address and channel index.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 7 ++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 2 +-
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 13 ++++++++-----
4 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 871b2d6278e0..252e360c6416 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -481,6 +481,8 @@ struct ras_ecc_err {
uint64_t ipid;
uint64_t addr;
uint64_t pa_pfn;
+ /* save global channel index across all UMC instances */
+ uint32_t channel_idx;
struct ras_err_pages err_pages;
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 11f7299f773f..17659fa4450a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -498,10 +498,9 @@ int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
uint64_t err_addr, uint32_t ch, uint32_t umc,
uint32_t node, uint32_t socket,
- uint64_t *addr, bool dump_addr)
+ struct ta_ras_query_address_output *addr_out, bool dump_addr)
{
struct ta_ras_query_address_input addr_in;
- struct ta_ras_query_address_output addr_out;
int ret;
memset(&addr_in, 0, sizeof(addr_in));
@@ -513,14 +512,12 @@ int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
ret = adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in,
- &addr_out, dump_addr);
+ addr_out, dump_addr);
if (ret)
return ret;
} else {
return 0;
}
- *addr = addr_out.pa.pa;
-
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index ce1e4fb385b5..2f71194d5da8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -146,5 +146,5 @@ int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev,
uint64_t err_addr, uint32_t ch, uint32_t umc,
uint32_t node, uint32_t socket,
- uint64_t *addr, bool dump_addr);
+ struct ta_ras_query_address_output *addr_out, bool dump_addr);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 17ef9a6743f5..cce93b4ffb58 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -180,7 +180,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
bool dump_addr)
{
uint32_t col, col_lower, row, row_lower, bank;
- uint32_t channel_index, umc_inst = 0;
+ uint32_t channel_index = 0, umc_inst = 0;
uint32_t i, loop_bits[UMC_V12_0_RETIRE_LOOP_BITS];
uint64_t soc_pa, column, err_addr;
struct ta_ras_query_address_output addr_out_tmp;
@@ -193,7 +193,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
else
paddr_out = addr_out;
- err_addr = bank = channel_index = 0;
+ err_addr = bank = 0;
if (addr_in) {
err_addr = addr_in->ma.err_addr;
addr_in->addr_type = TA_RAS_MCA_TO_PA;
@@ -206,7 +206,6 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
}
bank = paddr_out->pa.bank;
- channel_index = paddr_out->pa.channel_idx;
/* no need to care about umc inst if addr_in is NULL */
umc_inst = addr_in->ma.umc_inst;
}
@@ -228,6 +227,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev,
}
soc_pa = paddr_out->pa.pa;
+ channel_index = paddr_out->pa.channel_idx;
/* clear loop bits in soc physical address */
for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++)
soc_pa &= ~BIT_ULL(loop_bits[i]);
@@ -466,6 +466,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
uint64_t err_addr, pa_addr = 0;
struct ras_ecc_err *ecc_err;
+ struct ta_ras_query_address_output addr_out;
int count, ret, i;
hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
@@ -495,7 +496,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
ret = amdgpu_umc_mca_to_addr(adev,
err_addr, MCA_IPID_2_UMC_CH(ipid),
MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid),
- MCA_IPID_2_SOCKET_ID(ipid), &pa_addr, true);
+ MCA_IPID_2_SOCKET_ID(ipid), &addr_out, true);
if (ret)
return ret;
@@ -503,10 +504,12 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
if (!ecc_err)
return -ENOMEM;
+ pa_addr = addr_out.pa.pa;
ecc_err->status = status;
ecc_err->ipid = ipid;
ecc_err->addr = addr;
ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT;
+ ecc_err->channel_idx = addr_out.pa.channel_idx;
/* If converted pa_pfn is 0, use pa C4 pfn. */
if (!ecc_err->pa_pfn)
@@ -577,7 +580,7 @@ static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
ret = amdgpu_umc_fill_error_record(err_data,
ecc_err->addr,
page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
- MCA_IPID_2_UMC_CH(ecc_err->ipid),
+ ecc_err->channel_idx,
MCA_IPID_2_UMC_INST(ecc_err->ipid));
if (ret)
break;
--
2.34.1
More information about the amd-gfx
mailing list