[PATCH V3 01/12] drm/amdgpu: Unify ras block interface for each ras block

Zhou1, Tao Tao.Zhou1 at amd.com
Tue Jan 4 08:24:43 UTC 2022


[AMD Official Use Only]

The series is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

Please make sure the basic RAS tests pass before submitting the series.

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai at amd.com>
> Sent: Wednesday, December 29, 2021 2:32 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Clements,
> John <John.Clements at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
> Subject: [PATCH V3 01/12] drm/amdgpu: Unify ras block interface for each ras
> block
> 
> 1. Define unified ops interface for each block.
> 2. Add a ras_block_match function pointer to the ops interface; each ras block
> can provide a special match function to identify itself.
> 3. Add a new function, amdgpu_ras_block_match_default. If a ras block doesn't
> define .ras_block_match, amdgpu_ras_block_match_default is executed by default
> to identify this ras block.
> 4. Define unified basic ras block data for each ras block.
> 5. Create a dedicated amdgpu device ras block linked list to manage all of the
> ras blocks.
> 6. Add a new function, amdgpu_ras_register_ras_block, for each ras block to
> register itself with the ras controlling block.
> 
> Signed-off-by: yipechai <YiPeng.Chai at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 46 ++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 28 +++++++++++++
>  4 files changed, 78 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index db1505455761..eddf230856e2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1151,6 +1151,8 @@ struct amdgpu_device {
>  	bool				barrier_has_auto_waitcnt;
> 
>  	struct amdgpu_reset_control     *reset_cntl;
> +
> +	struct list_head		ras_list;
>  };
> 
>  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 73ec46140d68..0980396ee709 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3578,6 +3578,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> 
>  	INIT_LIST_HEAD(&adev->reset_list);
> 
> +	INIT_LIST_HEAD(&adev->ras_list);
> +
>  	INIT_DELAYED_WORK(&adev->delayed_init_work,
>  			  amdgpu_device_delayed_init_work_handler);
>  	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 90f0db3b4f65..9dd698354e04 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -862,6 +862,40 @@ static int amdgpu_ras_enable_all_features(struct
> amdgpu_device *adev,  }
>  /* feature ctl end */
> 
> +int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object*
> +block_obj, enum amdgpu_ras_block block) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if (block_obj->block == block)
> +		return 0;
> +
> +	return -EINVAL;
> +}
> +
> +static struct amdgpu_ras_block_object* amdgpu_ras_get_ras_block(struct
> amdgpu_device *adev,
> +					enum amdgpu_ras_block block,
> uint32_t sub_block_index) {
> +	struct amdgpu_ras_block_object *obj, *tmp;
> +
> +	if (block >= AMDGPU_RAS_BLOCK__LAST)
> +		return NULL;
> +
> +	if (!amdgpu_ras_is_supported(adev, block))
> +		return NULL;
> +
> +	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> +		if (obj->ras_block_match) {
> +			if (obj->ras_block_match(obj, block, sub_block_index)
> == 0)
> +				return obj;
> +		} else {
> +			if (amdgpu_ras_block_match_default(obj, block) == 0)
> +				return obj;
> +		}
> +	}
> +
> +	return NULL;
> +}
> 
>  void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
>  				       struct ras_common_if *ras_block, @@ -
> 2739,3 +2773,15 @@ static void
> amdgpu_register_bad_pages_mca_notifier(void)
>          }
>  }
>  #endif
> +/* Register each ip ras block into amdgpu ras */ int
> +amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
> +		struct amdgpu_ras_block_object* ras_block_obj) {
> +	if (!adev || !ras_block_obj)
> +		return -EINVAL;
> +
> +	INIT_LIST_HEAD(&ras_block_obj->node);
> +	list_add_tail(&ras_block_obj->node, &adev->ras_list);
> +
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index cdd0010a5389..9dbe8d49b891 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -469,6 +469,33 @@ struct ras_debug_if {
>  	};
>  	int op;
>  };
> +
> +struct amdgpu_ras_block_object {
> +	/* block name */
> +	char name[32];
> +
> +	enum amdgpu_ras_block block;
> +
> +	uint32_t sub_block_index;
> +
> +	/* ras block link */
> +	struct list_head node;
> +
> +	int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
> enum amdgpu_ras_block block, uint32_t sub_block_index);
> +	int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info);
> +	void (*ras_fini)(struct amdgpu_device *adev);
> +	const struct amdgpu_ras_block_hw_ops *hw_ops; };
> +
> +struct amdgpu_ras_block_hw_ops {
> +	int  (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
> +	void (*query_ras_error_count)(struct amdgpu_device *adev,void
> *ras_error_status);
> +	void (*query_ras_error_status)(struct amdgpu_device *adev);
> +	void (*query_ras_error_address)(struct amdgpu_device *adev, void
> *ras_error_status);
> +	void (*reset_ras_error_count)(struct amdgpu_device *adev);
> +	void (*reset_ras_error_status)(struct amdgpu_device *adev); };
> +
>  /* work flow
>   * vbios
>   * 1: ras feature enable (enabled by default) @@ -652,4 +679,5 @@ const char
> *get_ras_block_str(struct ras_common_if *ras_block);
> 
>  bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
> 
> +int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct
> +amdgpu_ras_block_object* ras_block_obj);
>  #endif
> --
> 2.25.1


More information about the amd-gfx mailing list