[PATCH v3] drm/amdgpu: Move mca debug mode decision to ras

Lazar, Lijo lijo.lazar at amd.com
Thu Nov 16 11:27:50 UTC 2023


<ping>

On 11/10/2023 1:25 PM, Lijo Lazar wrote:
> Refactor the code so that the ras block, rather than the swsmu block,
> decides the default mca debug mode.
> 
> By default mca debug mode is set to false.
> 
> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
> ---
> v3: Default mca debug mode is set to false
> 
> v2: Set mca debug mode early, before ras block late init, because the ras
> query is initiated during late init of ras blocks (KevinYang)
> 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c            |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c            | 14 +++++++++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h            |  2 +-
>   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 12 ------------
>   4 files changed, 13 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> index cf33eb219e25..54f2f346579e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> @@ -377,7 +377,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val)
>   	struct amdgpu_device *adev = (struct amdgpu_device *)data;
>   	int ret;
>   
> -	ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false);
> +	ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
>   	if (ret)
>   		return ret;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 84e5987b14e0..6747fbe4feab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3132,6 +3132,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
>   	if (amdgpu_sriov_vf(adev))
>   		return 0;
>   
> +	amdgpu_ras_set_mca_debug_mode(adev, false);
> +
>   	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
>   		if (!node->ras_obj) {
>   			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
> @@ -3405,12 +3407,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> -void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
> +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
>   {
>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	int ret;
>   
> -	if (con)
> -		con->is_mca_debug_mode = enable;
> +	if (con) {
> +		ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
> +		if (!ret)
> +			con->is_mca_debug_mode = enable;
> +	}
> +
> +	return ret;
>   }
>   
>   bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 19161916ac46..6a941eb8fb8f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -773,7 +773,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);
>   
>   int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);
>   
> -void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
> +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
>   bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
>   bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
>   				     unsigned int *mode);
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> index 6cbfb25a05de..f09f56efbdc3 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> @@ -1516,7 +1516,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct smu_context *smu, bool enable)
>   	if (smu->smc_fw_version < 0x554800)
>   		return 0;
>   
> -	amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
>   	return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
>   					       enable ? 0 : ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
>   					       NULL);
> @@ -2338,16 +2337,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct smu_context *smu,
>   	return ret;
>   }
>   
> -static int smu_v13_0_6_post_init(struct smu_context *smu)
> -{
> -	struct amdgpu_device *adev = smu->adev;
> -
> -	if (!amdgpu_sriov_vf(adev) && adev->ras_enabled)
> -		return smu_v13_0_6_mca_set_debug_mode(smu, false);
> -
> -	return 0;
> -}
> -
>   static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
>   {
>   	struct smu_context *smu = adev->powerplay.pp_handle;
> @@ -2904,7 +2893,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
>   	.i2c_init = smu_v13_0_6_i2c_control_init,
>   	.i2c_fini = smu_v13_0_6_i2c_control_fini,
>   	.send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
> -	.post_init = smu_v13_0_6_post_init,
>   };
>   
>   void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)


More information about the amd-gfx mailing list