[PATCH Review 4/4] query umc error info from ecc_table

Wed Nov 17 11:14:41 UTC 2021

On 11/17/2021 3:41 PM, Stanley.Yang wrote:
> if smu support ECCTABLE, driver can message smu to get ecc_table
> then query umc error info from ECCTABLE
> apply pmfw version check to ensure backward compatibility
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 42 ++++++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  7 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       | 71 +++++++++++++------
>   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  1 +
>   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 12 ++++
>   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  4 ++
>   6 files changed, 107 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 90f0db3b4f65..6b0f2ba1e420 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -888,6 +888,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
>   	}
>   }
>   
> +static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
> +{
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	/*
> +	 * choosing right query method according to
> +	 * whether smu support query error information
> +	 */
> +	if ((ras->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
> +			!smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc))) {
> +

This version check should be in aldebaran_ppt implementation. In general 
the callback will check the FW version that supports ECC table for the 
corresponding ASIC. It may return ENOTSUPP or similar if the FW version 
doesn't support ECC table and that may be checked here. Keeping 
smu_version in ras context is not needed.

> +		if (adev->umc.ras_funcs &&
> +			adev->umc.ras_funcs->message_smu_query_ras_error_count)
> +			adev->umc.ras_funcs->message_smu_query_ras_error_count(adev, err_data);
> +
> +		if (adev->umc.ras_funcs &&
> +			adev->umc.ras_funcs->message_smu_query_ras_error_address)
> +			adev->umc.ras_funcs->message_smu_query_ras_error_address(adev, err_data);
> +	} else {
> +		if (adev->umc.ras_funcs &&
> +			adev->umc.ras_funcs->query_ras_error_count)
> +			adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
> +
> +		/* umc query_ras_error_address is also responsible for clearing
> +		 * error status
> +		 */
> +		if (adev->umc.ras_funcs &&
> +		    adev->umc.ras_funcs->query_ras_error_address)
> +			adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
> +	}
> +}
> +
>   /* query/inject/cure begin */
>   int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
>   				  struct ras_query_if *info)
> @@ -901,15 +933,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
>   
>   	switch (info->head.block) {
>   	case AMDGPU_RAS_BLOCK__UMC:
> -		if (adev->umc.ras_funcs &&
> -		    adev->umc.ras_funcs->query_ras_error_count)
> -			adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
> -		/* umc query_ras_error_address is also responsible for clearing
> -		 * error status
> -		 */
> -		if (adev->umc.ras_funcs &&
> -		    adev->umc.ras_funcs->query_ras_error_address)
> -			adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
> +		amdgpu_ras_get_ecc_info(adev, &err_data);
>   		break;
>   	case AMDGPU_RAS_BLOCK__SDMA:
>   		if (adev->sdma.funcs->query_ras_error_count) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index bcbf3264d92f..3f0de0cc8403 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -322,6 +322,12 @@ struct ras_common_if {
>   
>   #define MAX_UMC_CHANNEL_NUM 32
>   
> +/*
> + * SMU support ECCTABLE since version 68.42.0,
> + * use this to decide query umc error info method
> + */
> +#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
> +
>   struct ecc_info_per_ch {
>   	uint16_t ce_count_lo_chip;
>   	uint16_t ce_count_hi_chip;
> @@ -375,6 +381,7 @@ struct amdgpu_ras {
>   
>   	/* record umc error info queried from smu */
>   	struct umc_ecc_info umc_ecc;
> +	uint32_t smu_version;
>   };
>   
>   struct ras_fs_data {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 0c7c56a91b25..2c3e97c9410b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -97,28 +97,57 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   
>   	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> -	if (adev->umc.ras_funcs &&
> -	    adev->umc.ras_funcs->query_ras_error_count)
> -	    adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
>   
> -	if (adev->umc.ras_funcs &&
> -	    adev->umc.ras_funcs->query_ras_error_address &&
> -	    adev->umc.max_ras_err_cnt_per_query) {
> -		err_data->err_addr =
> -			kcalloc(adev->umc.max_ras_err_cnt_per_query,
> -				sizeof(struct eeprom_table_record), GFP_KERNEL);
> -
> -		/* still call query_ras_error_address to clear error status
> -		 * even NOMEM error is encountered
> -		 */
> -		if(!err_data->err_addr)
> -			dev_warn(adev->dev, "Failed to alloc memory for "
> -					"umc error address record!\n");
> -
> -		/* umc query_ras_error_address is also responsible for clearing
> -		 * error status
> -		 */
> -		adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
> +	if ((con->smu_version >= SUPPORT_ECCTABLE_SMU_VERSION) &&
> +			!smu_get_ecc_info(&adev->smu, (void *)&(con->umc_ecc))) {
> +
Same comment as above.

> +		if (adev->umc.ras_funcs &&
> +		    adev->umc.ras_funcs->message_smu_query_ras_error_count)
> +		    adev->umc.ras_funcs->message_smu_query_ras_error_count(adev, ras_error_status);
> +
> +		if (adev->umc.ras_funcs &&
> +		    adev->umc.ras_funcs->message_smu_query_ras_error_address &&
> +		    adev->umc.max_ras_err_cnt_per_query) {
> +			err_data->err_addr =
> +				kcalloc(adev->umc.max_ras_err_cnt_per_query,
> +					sizeof(struct eeprom_table_record), GFP_KERNEL);
> +
> +			/* still call query_ras_error_address to clear error status
> +			 * even NOMEM error is encountered
> +			 */
> +			if(!err_data->err_addr)
> +				dev_warn(adev->dev, "Failed to alloc memory for "
> +						"umc error address record!\n");
> +
> +			/* umc query_ras_error_address is also responsible for clearing
> +			 * error status
> +			 */
> +			adev->umc.ras_funcs->message_smu_query_ras_error_address(adev, ras_error_status);
> +		}
> +	} else {
> +		if (adev->umc.ras_funcs &&
> +		    adev->umc.ras_funcs->query_ras_error_count)
> +		    adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
> +
> +		if (adev->umc.ras_funcs &&
> +		    adev->umc.ras_funcs->query_ras_error_address &&
> +		    adev->umc.max_ras_err_cnt_per_query) {
> +			err_data->err_addr =
> +				kcalloc(adev->umc.max_ras_err_cnt_per_query,
> +					sizeof(struct eeprom_table_record), GFP_KERNEL);
> +
> +			/* still call query_ras_error_address to clear error status
> +			 * even NOMEM error is encountered
> +			 */
> +			if(!err_data->err_addr)
> +				dev_warn(adev->dev, "Failed to alloc memory for "
> +						"umc error address record!\n");
> +
> +			/* umc query_ras_error_address is also responsible for clearing
> +			 * error status
> +			 */
> +			adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
> +		}
>   	}
>   
>   	/* only uncorrectable error needs gpu reset */
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> index ea65de0160c3..7a06021a58f0 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> @@ -1404,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable);
>   
>   int smu_wait_for_event(struct amdgpu_device *adev, enum smu_event_type event,
>   		       uint64_t event_arg);
> +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
>   
>   #endif
>   #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 01168b8955bf..6340c079f35e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -3072,6 +3072,18 @@ int smu_set_light_sbr(struct smu_context *smu, bool enable)
>   	return ret;
>   }
>   
> +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc)
> +{
> +	int ret = -1;
> +
> +	if (smu->ppt_funcs &&
> +		smu->ppt_funcs->get_ecc_info)
> +		ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
> +

Shouldn't return -1 if ppt func is not present. If ppt func is not 
present, that means this method is not supported for the SOC; return 
ENOTSUPP.

> +	return ret;
> +
> +}
> +

Probably the above function should be clubbed with patch 3 - smu support 
for getting ras ecc info.

>   static int smu_get_prv_buffer_details(void *handle, void **addr, size_t *size)
>   {
>   	struct smu_context *smu = handle;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index 55421ea622fb..55ef10ca684a 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -200,11 +200,15 @@ int smu_v13_0_check_fw_version(struct smu_context *smu)
>   	uint16_t smu_major;
>   	uint8_t smu_minor, smu_debug;
>   	int ret = 0;
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(smu->adev);
>   
>   	ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
>   	if (ret)
>   		return ret;
>   
> +	/* record smu interface version, help umc query error method */
> +	ras->smu_version = smu_version;
> +

This is not needed. ASIC specific functions can check the FW version for 
ECC table support.

Thanks,
Lijo

>   	smu_major = (smu_version >> 16) & 0xffff;
>   	smu_minor = (smu_version >> 8) & 0xff;
>   	smu_debug = (smu_version >> 0) & 0xff;
>