[PATCH 3/4] drm/amdgpu: update algorithm of umc address conversion
Lazar, Lijo
lijo.lazar at amd.com
Wed Jan 26 03:26:20 UTC 2022
On 1/25/2022 4:16 PM, Tao Zhou wrote:
> On ALDEBARAN, we need to traverse all column bits higher than
> BIT11 (C4C3C2) in a row; the shift of the R14 bit should also be
> taken into account. Retire all pages we find.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 41 +++++++++++++++++++++------
> drivers/gpu/drm/amd/amdgpu/umc_v6_7.h | 4 +++
> 2 files changed, 37 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> index 300dee9ec6b4..1ecba7b5df1c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> @@ -119,7 +119,7 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
> uint32_t ch_inst,
> uint32_t umc_inst)
> {
> - uint64_t mc_umc_status, err_addr, retired_page;
> + uint64_t mc_umc_status, err_addr, soc_pa, retired_page, column;
> uint32_t channel_index;
> uint32_t eccinfo_table_idx;
> struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> @@ -145,15 +145,27 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
> err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
>
> /* translate umc channel address to soc pa, 3 parts are included */
> - retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
> + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> ADDR_OF_256B_BLOCK(channel_index) |
> OFFSET_IN_256B_BLOCK(err_addr);
> + /* clear [C4 C3 C2] in soc physical address */
> + soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
>
> /* we only save ue error information currently, ce is skipped */
> if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
> - == 1)
> - amdgpu_umc_fill_error_record(err_data, err_addr,
> + == 1) {
> + /* loop for all possibilities of [C4 C3 C2] */
> + for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
> + retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
> + amdgpu_umc_fill_error_record(err_data, err_addr,
> retired_page, channel_index, umc_inst);
> +
> + /* shift R14 bit */
> + retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
> + amdgpu_umc_fill_error_record(err_data, err_addr,
> + retired_page, channel_index, umc_inst);
> + }
> + }
> }
> }
>
> @@ -332,8 +344,9 @@ static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
> uint32_t umc_inst)
> {
> uint32_t mc_umc_status_addr;
> - uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
> uint32_t channel_index;
> + uint64_t mc_umc_status, mc_umc_addrt0;
> + uint64_t err_addr, soc_pa, retired_page, column;
>
> mc_umc_status_addr =
> SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
> @@ -363,15 +376,27 @@ static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
> err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
>
> /* translate umc channel address to soc pa, 3 parts are included */
> - retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
> + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
> ADDR_OF_256B_BLOCK(channel_index) |
> OFFSET_IN_256B_BLOCK(err_addr);
> + /* clear [C4 C3 C2] in soc physical address */
> + soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);
>
> /* we only save ue error information currently, ce is skipped */
> if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
> - == 1)
> - amdgpu_umc_fill_error_record(err_data, err_addr,
> + == 1) {
> + /* loop for all possibilities of [C4 C3 C2] */
> + for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
> + retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
> + amdgpu_umc_fill_error_record(err_data, err_addr,
> + retired_page, channel_index, umc_inst);
> +
> + /* shift R14 bit */
> + retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
> + amdgpu_umc_fill_error_record(err_data, err_addr,
> retired_page, channel_index, umc_inst);
> + }
> + }
> }
>
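Better to maintain the page decode logic in a single function, since the
same [C4 C3 C2] / R14 conversion loop is now duplicated in both
umc_v6_7_ecc_info_query_error_address() and umc_v6_7_query_error_address().
For example, a shared helper such as:
umc_v6_7_save_bad_page_info(err_addr, channel)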
Thanks,
Lijo
> /* clear umc status */
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> index 9adebcf98582..b67677867b45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
> @@ -45,6 +45,10 @@
> #define UMC_V6_7_NA_MAP_PA_NUM 8
> /* R14 bit shift should be considered, double the number */
> #define UMC_V6_7_BAD_PAGE_NUM_PER_CHANNEL (UMC_V6_7_NA_MAP_PA_NUM * 2)
> +/* The C2 bit in SOC physical address */
> +#define UMC_V6_7_PA_C2_BIT 17
> +/* The R14 bit in SOC physical address */
> +#define UMC_V6_7_PA_R14_BIT 34
> /* UMC regiser per channel offset */
> #define UMC_V6_7_PER_CHANNEL_OFFSET 0x400
> extern struct amdgpu_umc_ras umc_v6_7_ras;
>
More information about the amd-gfx
mailing list