[PATCH 1/3] drm/amdgpu: optimize umc v12 address conversion function

Chai, Thomas YiPeng.Chai at amd.com
Thu Jul 18 01:41:54 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

OK


-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang at amd.com>
Sent: Wednesday, July 17, 2024 9:00 PM
To: Chai, Thomas <YiPeng.Chai at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>
Subject: RE: [PATCH 1/3] drm/amdgpu: optimize umc v12 address conversion function

[AMD Official Use Only - AMD Internal Distribution Only]

+static int umc_v12_0_expand_addr_to_bad_pages(struct amdgpu_device *adev,
+                       uint64_t pa_addr, uint64_t *pfns, int len)

I would call this function as lookup_bad_pages_in_a_row, or something like that.

Anyway, the series is

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: Chai, Thomas <YiPeng.Chai at amd.com>
Sent: Wednesday, July 17, 2024 16:16
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Li, Candice <Candice.Li at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
Subject: [PATCH 1/3] drm/amdgpu: optimize umc v12 address conversion function

Split into 3 parts:
1. Convert soc physical address via ras ta.
2. Expand bad pages from soc physical address.
3. Dump bad address info.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 116 ++++++++++++++++---------
 1 file changed, 77 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 9dbb13adb661..eca5ac6a0532 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -225,26 +225,16 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
        }
 }

-static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
-                               struct ta_ras_query_address_input *addr_in,
-                               uint64_t *pfns, int len)
+static void umc_v12_0_dump_addr_info(struct amdgpu_device *adev,
+                               struct ta_ras_query_address_output *addr_out,
+                               uint64_t err_addr)
 {
        uint32_t col, row, row_xor, bank, channel_index;
-       uint64_t soc_pa, retired_page, column, err_addr;
-       struct ta_ras_query_address_output addr_out;
-       uint32_t pos = 0;
-
-       err_addr = addr_in->ma.err_addr;
-       addr_in->addr_type = TA_RAS_MCA_TO_PA;
-       if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
-               dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
-                       err_addr);
-               return 0;
-       }
+       uint64_t soc_pa, retired_page, column;

-       soc_pa = addr_out.pa.pa;
-       bank = addr_out.pa.bank;
-       channel_index = addr_out.pa.channel_idx;
+       soc_pa = addr_out->pa.pa;
+       bank = addr_out->pa.bank;
+       channel_index = addr_out->pa.channel_idx;

        col = (err_addr >> 1) & 0x1fULL;
        row = (err_addr >> 10) & 0x3fffULL; @@ -258,11 +248,6 @@ static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
        for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
                retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
                retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
-
-               if (pos >= len)
-                       return 0;
-               pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
-
                /* include column bit 0 and 1 */
                col &= 0x3;
                col |= (column << 2);
@@ -270,6 +255,35 @@ static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
                        "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
                        retired_page, row, col, bank, channel_index);

+               /* shift R13 bit */
+               retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
+               dev_info(adev->dev,
+                       "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
+                       retired_page, row_xor, col, bank, channel_index);
+       }
+}
+
+static int umc_v12_0_expand_addr_to_bad_pages(struct amdgpu_device *adev,
+                       uint64_t pa_addr, uint64_t *pfns, int len) {
+       uint64_t soc_pa, retired_page, column;
+       uint32_t pos = 0;
+
+       soc_pa = pa_addr;
+       /* clear [C3 C2] in soc physical address */
+       soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
+       /* clear [C4] in soc physical address */
+       soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+
+       /* loop for all possibilities of [C4 C3 C2] */
+       for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+               retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
+               retired_page |= (((column & 0x4) >> 2) <<
+ UMC_V12_0_PA_C4_BIT);
+
+               if (pos >= len)
+                       return 0;
+               pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
                /* shift R13 bit */
                retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);

@@ -277,14 +291,40 @@ static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
                        return 0;
                pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;

-               dev_info(adev->dev,
-                       "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
-                       retired_page, row_xor, col, bank, channel_index);
        }

        return pos;
 }

+static int umc_v12_0_convert_mca_to_addr(struct amdgpu_device *adev,
+                       uint64_t err_addr, uint32_t ch, uint32_t umc,
+                       uint32_t node, uint32_t socket,
+                       uint64_t *addr, bool dump_addr) {
+       struct ta_ras_query_address_input addr_in;
+       struct ta_ras_query_address_output addr_out;
+
+       memset(&addr_in, 0, sizeof(addr_in));
+       addr_in.ma.err_addr = err_addr;
+       addr_in.ma.ch_inst = ch;
+       addr_in.ma.umc_inst = umc;
+       addr_in.ma.node_inst = node;
+       addr_in.ma.socket_id = socket;
+       addr_in.addr_type = TA_RAS_MCA_TO_PA;
+       if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out)) {
+               dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
+                       err_addr);
+               return -EINVAL;
+       }
+
+       if (dump_addr)
+               umc_v12_0_dump_addr_info(adev, &addr_out, err_addr);
+
+       *addr = addr_out.pa.pa;
+
+       return 0;
+}
+
 static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
                                        uint32_t node_inst, uint32_t umc_inst,
                                        uint32_t ch_inst, void *data) @@ -483,12 +523,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,  {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        uint16_t hwid, mcatype;
-       struct ta_ras_query_address_input addr_in;
        uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
-       uint64_t err_addr, hash_val = 0;
+       uint64_t err_addr, hash_val = 0, pa_addr = 0;
        struct ras_ecc_err *ecc_err;
-       int count;
-       int ret;
+       int count, ret;

        hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
        mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); @@ -514,17 +552,17 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
                MCA_IPID_2_UMC_CH(ipid),
                err_addr);

-       memset(page_pfn, 0, sizeof(page_pfn));
-
-       memset(&addr_in, 0, sizeof(addr_in));
-       addr_in.ma.err_addr = err_addr;
-       addr_in.ma.ch_inst = MCA_IPID_2_UMC_CH(ipid);
-       addr_in.ma.umc_inst = MCA_IPID_2_UMC_INST(ipid);
-       addr_in.ma.node_inst = MCA_IPID_2_DIE_ID(ipid);
-       addr_in.ma.socket_id = MCA_IPID_2_SOCKET_ID(ipid);
+       ret = umc_v12_0_convert_mca_to_addr(adev,
+                       err_addr, MCA_IPID_2_UMC_CH(ipid),
+                       MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid),
+                       MCA_IPID_2_SOCKET_ID(ipid), &pa_addr, true);
+       if (ret)
+               return ret;

-       count = umc_v12_0_convert_err_addr(adev,
-                               &addr_in, page_pfn, ARRAY_SIZE(page_pfn));
+       memset(page_pfn, 0, sizeof(page_pfn));
+       count = umc_v12_0_expand_addr_to_bad_pages(adev,
+                               pa_addr,
+                               page_pfn, ARRAY_SIZE(page_pfn));
        if (count <= 0) {
                dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
                return 0;
--
2.34.1




More information about the amd-gfx mailing list