[PATCH 3/3] drm/amdgpu: Add umc v12_0 ras functions

Zhou1, Tao Tao.Zhou1 at amd.com
Tue Sep 5 03:20:46 UTC 2023


[AMD Official Use Only - General]

The series is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
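
A side note for anyone following the address math in get_umc_v12_0_reg_offset()
in the patch below: here is a minimal user-space sketch of the same computation.
The constants are copied from umc_v12_0.h in this patch; the instance numbers in
main() are made up purely for illustration, and channel_offs is assumed to be
UMC_V12_0_PER_CHANNEL_OFFSET as set in gmc_v9_0_set_umc_funcs().

#include <stdint.h>
#include <stdio.h>

/* Constants copied from umc_v12_0.h */
#define UMC_V12_0_NODE_DIST            0x40000000ULL
#define UMC_V12_0_INST_DIST            0x40000ULL
#define UMC_V12_0_PER_CHANNEL_OFFSET   0x400ULL
#define UMC_V12_0_CROSS_NODE_OFFSET    0x100000000ULL
#define UMC_V12_0_CHANNEL_INSTANCE_NUM 8

/* Mirrors get_umc_v12_0_reg_offset(): flatten (umc_inst, ch_inst) into a
 * linear index, regroup it as 4 channels per UMC instance, then apply the
 * per-channel, per-instance and per-node strides plus the cross-node offset
 * for any node other than node 0. */
static uint64_t umc_v12_0_reg_offset(uint32_t node_inst, uint32_t umc_inst,
                                     uint32_t ch_inst)
{
        uint32_t index = umc_inst * UMC_V12_0_CHANNEL_INSTANCE_NUM + ch_inst;
        uint64_t cross_node_offset = (node_inst == 0) ? 0 : UMC_V12_0_CROSS_NODE_OFFSET;

        umc_inst = index / 4;
        ch_inst = index % 4;

        return UMC_V12_0_PER_CHANNEL_OFFSET * ch_inst + UMC_V12_0_INST_DIST * umc_inst +
               UMC_V12_0_NODE_DIST * node_inst + cross_node_offset;
}

int main(void)
{
        /* Example values only: node 1, umc 2, channel 3 -> index 19 -> group 4, ch 3 */
        printf("0x%llx\n", (unsigned long long)umc_v12_0_reg_offset(1, 2, 3));
        return 0;       /* prints 0x140100c00 */
}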

> -----Original Message-----
> From: Li, Candice <Candice.Li at amd.com>
> Sent: Monday, September 4, 2023 3:20 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Li, Candice <Candice.Li at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: Add umc v12_0 ras functions
>
> Add umc v12_0 ras error querying.
>
> Signed-off-by: Candice Li <candice.li at amd.com>
> Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile    |   2 +-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  16 +-
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 256 +++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.h |  56 ++++++
>  4 files changed, 327 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
> index ce0188b329cdeb..adf5470aa81020 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -121,7 +121,7 @@ amdgpu-y += \
>
>  # add UMC block
>  amdgpu-y += \
> -     umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o
> +     umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o umc_v12_0.o
>
>  # add IH block
>  amdgpu-y += \
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 8447fcada8bb92..41e1759b5f1eaa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -56,6 +56,7 @@
>  #include "umc_v6_1.h"
>  #include "umc_v6_0.h"
>  #include "umc_v6_7.h"
> +#include "umc_v12_0.h"
>  #include "hdp_v4_0.h"
>  #include "mca_v3_0.h"
>
> @@ -737,7 +738,8 @@ static void gmc_v9_0_set_irq_funcs(struct amdgpu_device *adev)
>       adev->gmc.vm_fault.funcs = &gmc_v9_0_irq_funcs;
>
>       if (!amdgpu_sriov_vf(adev) &&
> -         !adev->gmc.xgmi.connected_to_cpu) {
> +         !adev->gmc.xgmi.connected_to_cpu &&
> +         !adev->gmc.is_app_apu) {
>               adev->gmc.ecc_irq.num_types = 1;
>               adev->gmc.ecc_irq.funcs = &gmc_v9_0_ecc_funcs;
>       }
> @@ -1487,6 +1489,15 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
>               else
>                       adev->umc.channel_idx_tbl = &umc_v6_7_channel_idx_tbl_second[0][0];
>               break;
> +     case IP_VERSION(12, 0, 0):
> +             adev->umc.max_ras_err_cnt_per_query = UMC_V12_0_TOTAL_CHANNEL_NUM(adev);
> +             adev->umc.channel_inst_num = UMC_V12_0_CHANNEL_INSTANCE_NUM;
> +             adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM;
> +             adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM;
> +             adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;
> +             adev->umc.active_mask = adev->aid_mask;
> +             if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
> +                     adev->umc.ras = &umc_v12_0_ras;
>       default:
>               break;
>       }
> @@ -2131,7 +2142,8 @@ static int gmc_v9_0_sw_init(void *handle)
>               return r;
>
>       if (!amdgpu_sriov_vf(adev) &&
> -         !adev->gmc.xgmi.connected_to_cpu) {
> +         !adev->gmc.xgmi.connected_to_cpu &&
> +         !adev->gmc.is_app_apu) {
>               /* interrupt sent to DF. */
>               r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_DF, 0,
>                                     &adev->gmc.ecc_irq);
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> new file mode 100644
> index 00000000000000..b3d6db14b351f1
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -0,0 +1,256 @@
> +/*
> + * Copyright 2023 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +#include "umc_v12_0.h"
> +#include "amdgpu_ras.h"
> +#include "amdgpu_umc.h"
> +#include "amdgpu.h"
> +#include "umc/umc_12_0_0_offset.h"
> +#include "umc/umc_12_0_0_sh_mask.h"
> +
> +static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
> +                                         uint32_t node_inst,
> +                                         uint32_t umc_inst,
> +                                         uint32_t ch_inst)
> +{
> +     uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;
> +     uint64_t cross_node_offset = (node_inst == 0) ? 0 : UMC_V12_0_CROSS_NODE_OFFSET;
> +
> +     umc_inst = index / 4;
> +     ch_inst = index % 4;
> +
> +     return adev->umc.channel_offs * ch_inst + UMC_V12_0_INST_DIST * umc_inst +
> +             UMC_V12_0_NODE_DIST * node_inst + cross_node_offset;
> +}
> +
> +static int umc_v12_0_reset_error_count_per_channel(struct amdgpu_device *adev,
> +                                     uint32_t node_inst, uint32_t umc_inst,
> +                                     uint32_t ch_inst, void *data)
> +{
> +     uint64_t odecc_err_cnt_addr;
> +     uint64_t umc_reg_offset =
> +             get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
> +
> +     odecc_err_cnt_addr =
> +             SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);
> +
> +     /* clear error count */
> +     WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4,
> +                     UMC_V12_0_CE_CNT_INIT);
> +
> +     return 0;
> +}
> +
> +static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
> +{
> +     amdgpu_umc_loop_channels(adev,
> +             umc_v12_0_reset_error_count_per_channel, NULL);
> +}
> +
> +static void umc_v12_0_query_correctable_error_count(struct amdgpu_device *adev,
> +                                                uint64_t umc_reg_offset,
> +                                                unsigned long *error_count)
> +{
> +     uint64_t mc_umc_status;
> +     uint64_t mc_umc_status_addr;
> +
> +     mc_umc_status_addr =
> +             SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
> +
> +     /* Rely on MCUMC_STATUS for correctable error counter
> +      * MCUMC_STATUS is a 64 bit register
> +      */
> +     mc_umc_status =
> +             RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
> +
> +     if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
> +         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
> +             *error_count += 1;
> +}
> +
> +static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev,
> +                                                   uint64_t umc_reg_offset,
> +                                                   unsigned long *error_count)
> +{
> +     uint64_t mc_umc_status;
> +     uint64_t mc_umc_status_addr;
> +
> +     mc_umc_status_addr =
> +             SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
> +
> +     /* Check the MCUMC_STATUS. */
> +     mc_umc_status =
> +             RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
> +
> +     if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
> +         (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
> +         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
> +         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
> +         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
> +         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
> +             *error_count += 1;
> +}
> +
> +static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
> +                                     uint32_t node_inst, uint32_t umc_inst,
> +                                     uint32_t ch_inst, void *data)
> +{
> +     struct ras_err_data *err_data = (struct ras_err_data *)data;
> +     uint64_t umc_reg_offset =
> +             get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
> +
> +     umc_v12_0_query_correctable_error_count(adev,
> +                                     umc_reg_offset,
> +                                     &(err_data->ce_count));
> +     umc_v12_0_query_uncorrectable_error_count(adev,
> +                                     umc_reg_offset,
> +                                     &(err_data->ue_count));
> +
> +     return 0;
> +}
> +
> +static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
> +                                        void *ras_error_status)
> +{
> +     amdgpu_umc_loop_channels(adev,
> +             umc_v12_0_query_error_count, ras_error_status);
> +
> +     umc_v12_0_reset_error_count(adev);
> +}
> +
> +static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
> +                                         struct ras_err_data *err_data, uint64_t err_addr,
> +                                         uint32_t ch_inst, uint32_t umc_inst,
> +                                         uint32_t node_inst, uint64_t mc_umc_status)
> +{
> +
> +}
> +
> +static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
> +                                     uint32_t node_inst, uint32_t umc_inst,
> +                                     uint32_t ch_inst, void *data)
> +{
> +     uint64_t mc_umc_status_addr;
> +     uint64_t mc_umc_status, err_addr;
> +     uint64_t mc_umc_addrt0;
> +     struct ras_err_data *err_data = (struct ras_err_data *)data;
> +     uint64_t umc_reg_offset =
> +             get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
> +
> +     mc_umc_status_addr =
> +             SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
> +
> +     mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
> +
> +     if (mc_umc_status == 0)
> +             return 0;
> +
> +     if (!err_data->err_addr) {
> +             /* clear umc status */
> +             WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
> +
> +             return 0;
> +     }
> +
> +     /* calculate error address if ue error is detected */
> +     if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
> +         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 &&
> +         REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {
> +
> +             mc_umc_addrt0 =
> +                     SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
> +
> +             err_addr = RREG64_PCIE_EXT((mc_umc_addrt0 + umc_reg_offset) * 4);
> +
> +             err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
> +
> +             umc_v12_0_convert_error_address(adev, err_data, err_addr,
> +                                     ch_inst, umc_inst, node_inst, mc_umc_status);
> +     }
> +
> +     /* clear umc status */
> +     WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
> +
> +     return 0;
> +}
> +
> +static void umc_v12_0_query_ras_error_address(struct amdgpu_device *adev,
> +                                          void *ras_error_status)
> +{
> +     amdgpu_umc_loop_channels(adev,
> +             umc_v12_0_query_error_address, ras_error_status);
> +}
> +
> +static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
> +                                     uint32_t node_inst, uint32_t umc_inst,
> +                                     uint32_t ch_inst, void *data)
> +{
> +     uint32_t odecc_cnt_sel;
> +     uint64_t odecc_cnt_sel_addr, odecc_err_cnt_addr;
> +     uint64_t umc_reg_offset =
> +             get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
> +
> +     odecc_cnt_sel_addr =
> +             SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccCntSel);
> +     odecc_err_cnt_addr =
> +             SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);
> +
> +     odecc_cnt_sel = RREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4);
> +
> +     /* set ce error interrupt type to APIC based interrupt */
> +     odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel,
> +                                     OdEccErrInt, 0x1);
> +     WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4, odecc_cnt_sel);
> +
> +     /* set error count to initial value */
> +     WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V12_0_CE_CNT_INIT);
> +
> +     return 0;
> +}
> +
> +static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev)
> +{
> +     amdgpu_umc_loop_channels(adev,
> +             umc_v12_0_err_cnt_init_per_channel, NULL);
> +}
> +
> +static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev)
> +{
> +     /*
> +      * Force return true, because regUMCCH0_EccCtrl
> +      * is not accessible from host side
> +      */
> +     return true;
> +}
> +
> +const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {
> +     .query_ras_error_count = umc_v12_0_query_ras_error_count,
> +     .query_ras_error_address = umc_v12_0_query_ras_error_address,
> +};
> +
> +struct amdgpu_umc_ras umc_v12_0_ras = {
> +     .ras_block = {
> +             .hw_ops = &umc_v12_0_ras_hw_ops,
> +     },
> +     .err_cnt_init = umc_v12_0_err_cnt_init,
> +     .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
> +};
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
> new file mode 100644
> index 00000000000000..2e63cc30766bc3
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
> @@ -0,0 +1,56 @@
> +/*
> + * Copyright 2023 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +#ifndef __UMC_V12_0_H__
> +#define __UMC_V12_0_H__
> +
> +#include "soc15_common.h"
> +#include "amdgpu.h"
> +
> +#define UMC_V12_0_NODE_DIST          0x40000000
> +#define UMC_V12_0_INST_DIST          0x40000
> +
> +/* UMC register per channel offset */
> +#define UMC_V12_0_PER_CHANNEL_OFFSET 0x400
> +
> +/* UMC cross node offset */
> +#define UMC_V12_0_CROSS_NODE_OFFSET          0x100000000
> +
> +/* OdEccErrCnt max value */
> +#define UMC_V12_0_CE_CNT_MAX         0xffff
> +/* umc ce interrupt threshold */
> +#define UMC_V12_0_CE_INT_THRESHOLD   0xffff
> +/* umc ce count initial value */
> +#define UMC_V12_0_CE_CNT_INIT        (UMC_V12_0_CE_CNT_MAX - UMC_V12_0_CE_INT_THRESHOLD)
> +
> +/* number of umc channel instance with memory map register access */
> +#define UMC_V12_0_CHANNEL_INSTANCE_NUM               8
> +/* number of umc instance with memory map register access */
> +#define UMC_V12_0_UMC_INSTANCE_NUM           4
> +
> +/* Total channel instances for all available umc nodes */
> +#define UMC_V12_0_TOTAL_CHANNEL_NUM(adev) \
> +     (UMC_V12_0_CHANNEL_INSTANCE_NUM * (adev)->gmc.num_umc)
> +
> +extern struct amdgpu_umc_ras umc_v12_0_ras;
> +
> +#endif
> --
> 2.25.1
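
For completeness, a small stand-alone illustration of the per-channel callback
pattern the new code relies on. loop_channels() below is a simplified stand-in
written for this note, not the real amdgpu_umc_loop_channels(); the instance
counts and the callback body are placeholders that only mirror the callback
signature used by umc_v12_0_query_error_count() and friends.

#include <stdint.h>
#include <stdio.h>

/* Same callback shape as the per-channel helpers in umc_v12_0.c:
 * (device, node instance, umc instance, channel instance, opaque data). */
typedef int (*umc_func_t)(void *dev, uint32_t node_inst, uint32_t umc_inst,
                          uint32_t ch_inst, void *data);

struct fake_err_data {
        unsigned long ce_count;
        unsigned long ue_count;
};

/* Simplified stand-in for the channel-loop helper: walk every
 * node/umc/channel combination and hand each one to the callback. */
static void loop_channels(void *dev, uint32_t node_num, uint32_t umc_num,
                          uint32_t ch_num, umc_func_t func, void *data)
{
        uint32_t node, umc, ch;

        for (node = 0; node < node_num; node++)
                for (umc = 0; umc < umc_num; umc++)
                        for (ch = 0; ch < ch_num; ch++)
                                func(dev, node, umc, ch, data);
}

/* Placeholder per-channel callback: pretend every channel reports one CE. */
static int count_one_ce(void *dev, uint32_t node_inst, uint32_t umc_inst,
                        uint32_t ch_inst, void *data)
{
        struct fake_err_data *err_data = data;

        err_data->ce_count++;
        return 0;
}

int main(void)
{
        struct fake_err_data err_data = { 0, 0 };

        /* 1 node x 4 UMC instances x 8 channels, matching umc_v12_0.h */
        loop_channels(NULL, 1, 4, 8, count_one_ce, &err_data);
        printf("ce_count = %lu\n", err_data.ce_count);  /* prints 32 */
        return 0;
}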


