[PATCH 3/3] drm/amdgpu: Add umc v12_0 ras functions
Zhou1, Tao
Tao.Zhou1 at amd.com
Tue Sep 5 03:20:46 UTC 2023
[AMD Official Use Only - General]
The series is:
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: Li, Candice <Candice.Li at amd.com>
> Sent: Monday, September 4, 2023 3:20 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Li, Candice <Candice.Li at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: Add umc v12_0 ras functions
>
> Add umc v12_0 ras error querying.
>
> Signed-off-by: Candice Li <candice.li at amd.com>
> Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/Makefile | 2 +-
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 16 +-
> drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 256
> +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/umc_v12_0.h |
> 56 ++++++
> 4 files changed, 327 insertions(+), 3 deletions(-) create mode 100644
> drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> create mode 100644 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index ce0188b329cdeb..adf5470aa81020 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -121,7 +121,7 @@ amdgpu-y += \
>
> # add UMC block
> amdgpu-y += \
> - umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o
> + umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o
> umc_v12_0.o
>
> # add IH block
> amdgpu-y += \
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 8447fcada8bb92..41e1759b5f1eaa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -56,6 +56,7 @@
> #include "umc_v6_1.h"
> #include "umc_v6_0.h"
> #include "umc_v6_7.h"
> +#include "umc_v12_0.h"
> #include "hdp_v4_0.h"
> #include "mca_v3_0.h"
>
> @@ -737,7 +738,8 @@ static void gmc_v9_0_set_irq_funcs(struct
> amdgpu_device *adev)
> adev->gmc.vm_fault.funcs = &gmc_v9_0_irq_funcs;
>
> if (!amdgpu_sriov_vf(adev) &&
> - !adev->gmc.xgmi.connected_to_cpu) {
> + !adev->gmc.xgmi.connected_to_cpu &&
> + !adev->gmc.is_app_apu) {
> adev->gmc.ecc_irq.num_types = 1;
> adev->gmc.ecc_irq.funcs = &gmc_v9_0_ecc_funcs;
> }
> @@ -1487,6 +1489,15 @@ static void gmc_v9_0_set_umc_funcs(struct
> amdgpu_device *adev)
> else
> adev->umc.channel_idx_tbl =
> &umc_v6_7_channel_idx_tbl_second[0][0];
> break;
> + case IP_VERSION(12, 0, 0):
> + adev->umc.max_ras_err_cnt_per_query =
> UMC_V12_0_TOTAL_CHANNEL_NUM(adev);
> + adev->umc.channel_inst_num =
> UMC_V12_0_CHANNEL_INSTANCE_NUM;
> + adev->umc.umc_inst_num =
> UMC_V12_0_UMC_INSTANCE_NUM;
> + adev->umc.node_inst_num /=
> UMC_V12_0_UMC_INSTANCE_NUM;
> + adev->umc.channel_offs =
> UMC_V12_0_PER_CHANNEL_OFFSET;
> + adev->umc.active_mask = adev->aid_mask;
> + if (!adev->gmc.xgmi.connected_to_cpu && !adev-
> >gmc.is_app_apu)
> + adev->umc.ras = &umc_v12_0_ras;
> default:
> break;
> }
> @@ -2131,7 +2142,8 @@ static int gmc_v9_0_sw_init(void *handle)
> return r;
>
> if (!amdgpu_sriov_vf(adev) &&
> - !adev->gmc.xgmi.connected_to_cpu) {
> + !adev->gmc.xgmi.connected_to_cpu &&
> + !adev->gmc.is_app_apu) {
> /* interrupt sent to DF. */
> r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_DF, 0,
> &adev->gmc.ecc_irq);
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> new file mode 100644
> index 00000000000000..b3d6db14b351f1
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -0,0 +1,256 @@
> +/*
> + * Copyright 2023 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person
> +obtaining a
> + * copy of this software and associated documentation files (the
> +"Software"),
> + * to deal in the Software without restriction, including without
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be
> +included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
> EVENT
> +SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> +DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> +OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> USE
> +OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +#include "umc_v12_0.h"
> +#include "amdgpu_ras.h"
> +#include "amdgpu_umc.h"
> +#include "amdgpu.h"
> +#include "umc/umc_12_0_0_offset.h"
> +#include "umc/umc_12_0_0_sh_mask.h"
> +
> +static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
> + uint32_t node_inst,
> + uint32_t umc_inst,
> + uint32_t ch_inst)
> +{
> + uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst;
> + uint64_t cross_node_offset = (node_inst == 0) ? 0 :
> +UMC_V12_0_CROSS_NODE_OFFSET;
> +
> + umc_inst = index / 4;
> + ch_inst = index % 4;
> +
> + return adev->umc.channel_offs * ch_inst + UMC_V12_0_INST_DIST *
> umc_inst +
> + UMC_V12_0_NODE_DIST * node_inst + cross_node_offset; }
> +
> +static int umc_v12_0_reset_error_count_per_channel(struct amdgpu_device
> *adev,
> + uint32_t node_inst, uint32_t umc_inst,
> + uint32_t ch_inst, void *data)
> +{
> + uint64_t odecc_err_cnt_addr;
> + uint64_t umc_reg_offset =
> + get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
> +
> + odecc_err_cnt_addr =
> + SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);
> +
> + /* clear error count */
> + WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4,
> + UMC_V12_0_CE_CNT_INIT);
> +
> + return 0;
> +}
> +
> +static void umc_v12_0_reset_error_count(struct amdgpu_device *adev) {
> + amdgpu_umc_loop_channels(adev,
> + umc_v12_0_reset_error_count_per_channel, NULL); }
> +
> +static void umc_v12_0_query_correctable_error_count(struct amdgpu_device
> *adev,
> + uint64_t umc_reg_offset,
> + unsigned long *error_count)
> +{
> + uint64_t mc_umc_status;
> + uint64_t mc_umc_status_addr;
> +
> + mc_umc_status_addr =
> + SOC15_REG_OFFSET(UMC, 0,
> regMCA_UMC_UMC0_MCUMC_STATUST0);
> +
> + /* Rely on MCUMC_STATUS for correctable error counter
> + * MCUMC_STATUS is a 64 bit register
> + */
> + mc_umc_status =
> + RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) *
> 4);
> +
> + if (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
> + *error_count += 1;
> +}
> +
> +static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device
> *adev,
> + uint64_t umc_reg_offset,
> + unsigned long *error_count)
> {
> + uint64_t mc_umc_status;
> + uint64_t mc_umc_status_addr;
> +
> + mc_umc_status_addr =
> + SOC15_REG_OFFSET(UMC, 0,
> regMCA_UMC_UMC0_MCUMC_STATUST0);
> +
> + /* Check the MCUMC_STATUS. */
> + mc_umc_status =
> + RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) *
> 4);
> +
> + if ((REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
> + (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
> + *error_count += 1;
> +}
> +
> +static int umc_v12_0_query_error_count(struct amdgpu_device *adev,
> + uint32_t node_inst, uint32_t umc_inst,
> + uint32_t ch_inst, void *data)
> +{
> + struct ras_err_data *err_data = (struct ras_err_data *)data;
> + uint64_t umc_reg_offset =
> + get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
> +
> + umc_v12_0_query_correctable_error_count(adev,
> + umc_reg_offset,
> + &(err_data->ce_count));
> + umc_v12_0_query_uncorrectable_error_count(adev,
> + umc_reg_offset,
> + &(err_data->ue_count));
> +
> + return 0;
> +}
> +
> +static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
> + void *ras_error_status)
> +{
> + amdgpu_umc_loop_channels(adev,
> + umc_v12_0_query_error_count, ras_error_status);
> +
> + umc_v12_0_reset_error_count(adev);
> +}
> +
/*
 * Translate a raw MCA error address into retirable page addresses recorded
 * in @err_data.
 *
 * Fix: the original definition carried an extra trailing parameter
 * (mc_umc_status) that the sole caller, umc_v12_0_query_error_address(),
 * does not pass — a compile-time mismatch.  The parameter was also unused,
 * so it is dropped to match the call site.
 *
 * Address translation for UMC v12.0 is not implemented yet; this stub keeps
 * the hook in place for the query-address path.
 */
static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
					struct ras_err_data *err_data, uint64_t err_addr,
					uint32_t ch_inst, uint32_t umc_inst,
					uint32_t node_inst)
{

}
> +
> +static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
> + uint32_t node_inst, uint32_t umc_inst,
> + uint32_t ch_inst, void *data)
> +{
> + uint64_t mc_umc_status_addr;
> + uint64_t mc_umc_status, err_addr;
> + uint64_t mc_umc_addrt0;
> + struct ras_err_data *err_data = (struct ras_err_data *)data;
> + uint64_t umc_reg_offset =
> + get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
> +
> + mc_umc_status_addr =
> + SOC15_REG_OFFSET(UMC, 0,
> regMCA_UMC_UMC0_MCUMC_STATUST0);
> +
> + mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr +
> umc_reg_offset)
> +* 4);
> +
> + if (mc_umc_status == 0)
> + return 0;
> +
> + if (!err_data->err_addr) {
> + /* clear umc status */
> + WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) *
> 4, 0x0ULL);
> +
> + return 0;
> + }
> +
> + /* calculate error address if ue error is detected */
> + if (REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 &&
> + REG_GET_FIELD(mc_umc_status,
> MCA_UMC_UMC0_MCUMC_STATUST0, UECC) ==
> +1) {
> +
> + mc_umc_addrt0 =
> + SOC15_REG_OFFSET(UMC, 0,
> regMCA_UMC_UMC0_MCUMC_ADDRT0);
> +
> + err_addr = RREG64_PCIE_EXT((mc_umc_addrt0 +
> umc_reg_offset) * 4);
> +
> + err_addr = REG_GET_FIELD(err_addr,
> MCA_UMC_UMC0_MCUMC_ADDRT0,
> +ErrorAddr);
> +
> + umc_v12_0_convert_error_address(adev, err_data, err_addr,
> + ch_inst, umc_inst, node_inst);
> + }
> +
> + /* clear umc status */
> + WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4,
> 0x0ULL);
> +
> + return 0;
> +}
> +
> +static void umc_v12_0_query_ras_error_address(struct amdgpu_device *adev,
> + void *ras_error_status)
> +{
> + amdgpu_umc_loop_channels(adev,
> + umc_v12_0_query_error_address, ras_error_status); }
> +
> +static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
> + uint32_t node_inst, uint32_t umc_inst,
> + uint32_t ch_inst, void *data)
> +{
> + uint32_t odecc_cnt_sel;
> + uint64_t odecc_cnt_sel_addr, odecc_err_cnt_addr;
> + uint64_t umc_reg_offset =
> + get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst);
> +
> + odecc_cnt_sel_addr =
> + SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccCntSel);
> + odecc_err_cnt_addr =
> + SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt);
> +
> + odecc_cnt_sel = RREG32_PCIE_EXT((odecc_cnt_sel_addr +
> umc_reg_offset)
> +* 4);
> +
> + /* set ce error interrupt type to APIC based interrupt */
> + odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel,
> + OdEccErrInt, 0x1);
> + WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4,
> +odecc_cnt_sel);
> +
> + /* set error count to initial value */
> + WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4,
> +UMC_V12_0_CE_CNT_INIT);
> +
> + return 0;
> +}
> +
> +static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev) {
> + amdgpu_umc_loop_channels(adev,
> + umc_v12_0_err_cnt_init_per_channel, NULL); }
> +
/*
 * Report whether UMC poison mode is enabled.
 *
 * regUMCCH0_EccCtrl is not accessible from the host side on this IP, so the
 * answer is forced to true unconditionally.
 */
static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev)
{
	return true;
}
> +
> +const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {
> + .query_ras_error_count = umc_v12_0_query_ras_error_count,
> + .query_ras_error_address = umc_v12_0_query_ras_error_address,
> +};
> +
> +struct amdgpu_umc_ras umc_v12_0_ras = {
> + .ras_block = {
> + .hw_ops = &umc_v12_0_ras_hw_ops,
> + },
> + .err_cnt_init = umc_v12_0_err_cnt_init,
> + .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode, };
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
> new file mode 100644
> index 00000000000000..2e63cc30766bc3
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
> @@ -0,0 +1,56 @@
> +/*
> + * Copyright 2023 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person
> +obtaining a
> + * copy of this software and associated documentation files (the
> +"Software"),
> + * to deal in the Software without restriction, including without
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be
> +included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
> EVENT
> +SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> +DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> +OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> USE
> +OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +#ifndef __UMC_V12_0_H__
> +#define __UMC_V12_0_H__
> +
> +#include "soc15_common.h"
> +#include "amdgpu.h"
> +
> +#define UMC_V12_0_NODE_DIST 0x40000000
> +#define UMC_V12_0_INST_DIST 0x40000
> +
> +/* UMC register per channel offset */
> +#define UMC_V12_0_PER_CHANNEL_OFFSET 0x400
> +
> +/* UMC cross node offset */
> +#define UMC_V12_0_CROSS_NODE_OFFSET 0x100000000
> +
> +/* OdEccErrCnt max value */
> +#define UMC_V12_0_CE_CNT_MAX 0xffff
> +/* umc ce interrupt threshold */
> +#define UMC_V12_0_CE_INT_THRESHOLD 0xffff
> +/* umc ce count initial value */
> +#define UMC_V12_0_CE_CNT_INIT (UMC_V12_0_CE_CNT_MAX -
> UMC_V12_0_CE_INT_THRESHOLD)
> +
> +/* number of umc channel instance with memory map register access */
> +#define UMC_V12_0_CHANNEL_INSTANCE_NUM 8
> +/* number of umc instance with memory map register access */
> +#define UMC_V12_0_UMC_INSTANCE_NUM 4
> +
> +/* Total channel instances for all available umc nodes */ #define
> +UMC_V12_0_TOTAL_CHANNEL_NUM(adev) \
> + (UMC_V12_0_CHANNEL_INSTANCE_NUM * (adev)->gmc.num_umc)
> +
> +extern struct amdgpu_umc_ras umc_v12_0_ras;
> +
> +#endif
> --
> 2.25.1
More information about the amd-gfx
mailing list