[PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of each IP block

Thu Nov 25 11:41:13 UTC 2021

On 11/25/2021 4:26 PM, yipechai wrote:
> Define an unified ras function pointers for each ip block to adapt.
> 
> Signed-off-by: yipechai <YiPeng.Chai at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 36 ++++++++++++-------------
>   2 files changed, 37 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 90f0db3b4f65..dc6c8130e2d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2739,3 +2739,23 @@ static void amdgpu_register_bad_pages_mca_notifier(void)
>           }
>   }
>   #endif
> +
> +/* check if ras is supported on block, say, sdma, gfx */
> +int amdgpu_ras_is_supported(struct amdgpu_device *adev,
> +		unsigned int block)
> +{
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	if (block >= AMDGPU_RAS_BLOCK_COUNT)
> +		return 0;
> +	return ras && (adev->ras_enabled & (1 << block));
> +}
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> +	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> +		schedule_work(&ras->recovery_work);
> +	return 0;
> +}

These changes look unrelated. Maybe as another patch to move from .h 
file to .c file.

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index cdd0010a5389..4b7da40dd837 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -469,6 +469,19 @@ struct ras_debug_if {
>   	};
>   	int op;
>   };
> +
> +struct amdgpu_ras_block_ops {
> +	int (*ras_late_init)(struct amdgpu_device *adev);
> +	void (*ras_fini)(struct amdgpu_device *adev);
> +	int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
> +	void  (*query_ras_error_count)(struct amdgpu_device *adev,void *ras_error_status);
> +	void (*query_ras_error_status)(struct amdgpu_device *adev);
> +	bool  (*query_ras_poison_mode)(struct amdgpu_device *adev);
> +	void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
> +	void (*reset_ras_error_count)(struct amdgpu_device *adev);
> +	void (*reset_ras_error_status)(struct amdgpu_device *adev);
> +};
> +

Generic comment - Since all the operations are consolidated under _ops, 
it makes sense to rename the <ip>_ras_funcs to <ip>_ras.

Ex: amdgpu_gfx_ras_funcs => amdgpu_gfx_ras, amdgpu_xgmi_ras_funcs => 
amdgpu_xgmi_ras and so forth.

In future, these ras blocks may have data members to keep IP specific 
ras data.

Thanks,
Lijo

>   /* work flow
>    * vbios
>    * 1: ras feature enable (enabled by default)
> @@ -486,16 +499,6 @@ struct ras_debug_if {
>   #define amdgpu_ras_get_context(adev)		((adev)->psp.ras_context.ras)
>   #define amdgpu_ras_set_context(adev, ras_con)	((adev)->psp.ras_context.ras = (ras_con))
>   
> -/* check if ras is supported on block, say, sdma, gfx */
> -static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
> -		unsigned int block)
> -{
> -	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> -	if (block >= AMDGPU_RAS_BLOCK_COUNT)
> -		return 0;
> -	return ras && (adev->ras_enabled & (1 << block));
> -}
>   
>   int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
>   
> @@ -512,15 +515,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>   
>   int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
>   
> -static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
> -{
> -	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> -	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> -		schedule_work(&ras->recovery_work);
> -	return 0;
> -}
> -
>   static inline enum ta_ras_block
>   amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
>   	switch (block) {
> @@ -652,4 +646,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block);
>   
>   bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
>   
> +int amdgpu_ras_is_supported(struct amdgpu_device *adev,	unsigned int block);
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
> +
>   #endif
>