[PATCH 1/4] drm/amdgpu: export umc error address translation interface

Zhou1, Tao Tao.Zhou1 at amd.com
Mon Sep 26 03:46:11 UTC 2022


[AMD Official Use Only - General]



> -----Original Message-----
> From: Yang, Stanley <Stanley.Yang at amd.com>
> Sent: Monday, September 26, 2022 11:15 AM
> To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org; Zhang,
> Hawking <Hawking.Zhang at amd.com>
> Subject: RE: [PATCH 1/4] drm/amdgpu: export umc error address translation
> interface
> 
> [AMD Official Use Only - General]
> 
> Hi Tao,
> 
> > -----Original Message-----
> > From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> > Sent: Friday, September 23, 2022 5:21 PM
> > To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> > <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>
> > Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> > Subject: [PATCH 1/4] drm/amdgpu: export umc error address translation
> > interface
> >
> > Make it global so we can convert a specific MCA address.
> >
> > Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  6 ++++++
> >  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   | 11 +++++------
> >  2 files changed, 11 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > index 3629d8f292ef..31fbefaaf676 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > @@ -22,6 +22,8 @@
> >  #define __AMDGPU_UMC_H__
> >  #include "amdgpu_ras.h"
> >
> > +#define UMC_INVALID_ADDR 0x1ULL
> > +
> >  /*
> >   * (addr / 256) * 4096, the higher 26 bits in ErrorAddr
> >   * is the index of 4KB block
> > @@ -51,6 +53,10 @@ struct amdgpu_umc_ras {
> >  	struct amdgpu_ras_block_object ras_block;
> >  	void (*err_cnt_init)(struct amdgpu_device *adev);
> >  	bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
> > +	void (*query_error_address_per_channel)(struct amdgpu_device
> > *adev,
> > +						 struct ras_err_data
> > *err_data,
> > +						 uint32_t umc_reg_offset,
> > uint32_t ch_inst,
> > +						 uint32_t umc_inst, uint64_t
> > mca_addr);
> >  	void (*ecc_info_query_ras_error_count)(struct amdgpu_device *adev,
> >  				      void *ras_error_status);
> >  	void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> > b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> > index bf7524f16b66..0f1b215653f3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
> > @@ -452,9 +452,8 @@ static void umc_v6_7_query_ras_error_count(struct
> > amdgpu_device *adev,
> >
> >  static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
> >  					 struct ras_err_data *err_data,
> > -					 uint32_t umc_reg_offset,
> > -					 uint32_t ch_inst,
> > -					 uint32_t umc_inst)
> > +					 uint32_t umc_reg_offset, uint32_t
> > ch_inst,
> > +					 uint32_t umc_inst, uint64_t
> > mca_addr)
> >  {
> >  	uint32_t mc_umc_status_addr;
> >  	uint32_t channel_index;
> > @@ -540,9 +539,8 @@ static void
> > umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
> >  							 ch_inst);
> >  		umc_v6_7_query_error_address(adev,
> >  					     err_data,
> > -					     umc_reg_offset,
> > -					     ch_inst,
> > -					     umc_inst);
> > +					     umc_reg_offset, ch_inst,
> > +					     umc_inst, UMC_INVALID_ADDR);
> >  	}
> >  }
> >
> > @@ -583,4 +581,5 @@ struct amdgpu_umc_ras umc_v6_7_ras = {
> >  	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
> >  	.ecc_info_query_ras_error_count =
> > umc_v6_7_ecc_info_query_ras_error_count,
> >  	.ecc_info_query_ras_error_address =
> > umc_v6_7_ecc_info_query_ras_error_address,
> > +	.query_error_address_per_channel =
> > umc_v6_7_query_error_address,
> 
> Stanley: According to patch#3, it's better to rename
> query_error_address_per_channel to
> convert/query_error_address_at_specific_channel, because the channel_instance
> and umc_instance are obtained from the mce structure; using per_channel may
> cause misunderstanding.

[Tao]: Thanks for your suggestion, I'll update the name.

> 
> >  };
> > --
> > 2.35.1


More information about the amd-gfx mailing list