[PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops

Chai, Thomas YiPeng.Chai at amd.com
Tue Dec 7 03:37:06 UTC 2021


Hi tao:
     I add my comments behind your comments. Please review.

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com> 
Sent: Monday, December 6, 2021 2:58 PM
To: Chai, Thomas <YiPeng.Chai at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops

[AMD Official Use Only]

Please see my comments inline.

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai at amd.com>
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai at amd.com>; Zhang, Hawking 
> <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas 
> <YiPeng.Chai at amd.com>
> Subject: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the 
> unified ras block data and ops
> 
> 1.Modify gfx block to fit for the unified ras block data and ops 
> 2.Implement .ras_block_match function pointer for gfx block to identify itself.
> 3.Change amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and the corresponding 
> variable name remove _funcs suffix.
> 4.Remove the const flag of gfx ras variable so that gfx ras block can 
> be able to be insertted into amdgpu device ras block link list.
> 5.Invoke amdgpu_ras_register_ras_block function to register gfx ras 
> block into amdgpu device ras block link list.
> 6.Remove the redundant code about gfx in amdgpu_ras.c after using the 
> unified ras block.
> 
> Signed-off-by: yipechai <YiPeng.Chai at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  6 +- 
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 15 ++--- 
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 80 ++++++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 73 +++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 39 ++++++++----
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 42 +++++++++---- 
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h |  2 +-
>  8 files changed, 178 insertions(+), 81 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 1795d448c700..da8691259ac1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -696,9 +696,9 @@ int amdgpu_gfx_process_ras_data_cb(struct
> amdgpu_device *adev,
>  	 */
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>  		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_count)
> -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> err_data);
> +		if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> +		    adev->gfx.ras->ras_block.ops->query_ras_error_count)
> +			adev->gfx.ras->ras_block.ops-
> >query_ras_error_count(adev, err_data);
>  		amdgpu_ras_reset_gpu(adev);
>  	}
>  	return AMDGPU_RAS_SUCCESS;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 6b78b4a0e182..ff4a8428a84b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -31,6 +31,7 @@
>  #include "amdgpu_ring.h"
>  #include "amdgpu_rlc.h"
>  #include "soc15.h"
> +#include "amdgpu_ras.h"
> 
>  /* GFX current status */
>  #define AMDGPU_GFX_NORMAL_MODE			0x00000000L
> @@ -213,16 +214,8 @@ struct amdgpu_cu_info {
>  	uint32_t bitmap[4][4];
>  };
> 
> -struct amdgpu_gfx_ras_funcs {
> -	int (*ras_late_init)(struct amdgpu_device *adev);
> -	void (*ras_fini)(struct amdgpu_device *adev);
> -	int (*ras_error_inject)(struct amdgpu_device *adev,
> -				void *inject_if);
> -	int (*query_ras_error_count)(struct amdgpu_device *adev,
> -				     void *ras_error_status);
> -	void (*reset_ras_error_count)(struct amdgpu_device *adev);
> -	void (*query_ras_error_status)(struct amdgpu_device *adev);
> -	void (*reset_ras_error_status)(struct amdgpu_device *adev);
> +struct amdgpu_gfx_ras {
> +	struct amdgpu_ras_block_object  ras_block;
>  	void (*enable_watchdog_timer)(struct amdgpu_device *adev);  };

>[Tao] Can we add " enable_watchdog_timer" function into amdgpu_ras_block_ops structure?
>And I think using ras_block directly is more simple than amdgpu_gfx_ras gfx_v9_0_ras structure.

[Thomas] The ' enable_watchdog_timer ' function is not a common function. It is only defined by gfx_v9_4_2.c and called in gfx_v9_0.c. 
	   I think the function pointers in the amdgpu_ras_block_ops structure should be the functions used by most blocks and the final goal of amdgpu_ras_block_ops structure is to eliminate explicit calls to special blocks in amdgpu_ras.c file. 
                 So, I think it had better that the enable_watchdog_timer function only use in gfx but not move to amdgpu_ras_block_ops.

> 
> @@ -348,7 +341,7 @@ struct amdgpu_gfx {
> 
>  	/*ras */
>  	struct ras_common_if			*ras_if;
> -	const struct amdgpu_gfx_ras_funcs	*ras_funcs;
> +	struct amdgpu_gfx_ras	*ras;
>  };
> 
>  #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs-
> >get_gpu_clock_counter((adev))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 1cf1f6331db1..190a4a4e9d7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -862,6 +862,27 @@ static int amdgpu_ras_enable_all_features(struct
> amdgpu_device *adev,  }
>  /* feature ctl end */
> 
> +static struct amdgpu_ras_block_object* 
> +amdgpu_ras_get_ras_block(struct
> amdgpu_device *adev,
> +					enum amdgpu_ras_block block,
> uint32_t sub_block_index) {
> +	struct amdgpu_ras_block_object *obj, *tmp;
> +
> +	if (block >= AMDGPU_RAS_BLOCK__LAST) {
> +		return NULL;
> +	}
>[Tao] The "{}" can be dropped since only one line under the if.
    [Thomas] OK.

> +
> +	list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> +		if( !obj->ops || !obj->ops->ras_block_match) {
[Tao]  Need a space after "if" and the space before "!obj" can be removed. 

> +			dev_info(adev->dev, "%s don't config ops or
> ras_block_match\n", obj->name);
> +			continue;
> +		}
> +		if (!obj->ops->ras_block_match(obj, block, sub_block_index)) {
> +			return obj;
> +		}
>[Tao] The "{}" can be removed.
      [Thomas] OK.

> +	}
> +
> +	return NULL;
> +}
>[Tao] This is a generic ras function, not gfx specific, the code can be moved to patch #1.
    [Thomas] OK.
> 
>  void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
>  				       struct ras_common_if *ras_block, @@ -
> 892,6 +913,7 @@ void amdgpu_ras_mca_query_error_status(struct
> amdgpu_device *adev,  int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>  				  struct ras_query_if *info)
>  {
> +	struct amdgpu_ras_block_object* block_obj = NULL;
>  	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
>  	struct ras_err_data err_data = {0, 0, 0, NULL};
>  	int i;
> @@ -899,6 +921,8 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>  	if (!obj)
>  		return -EINVAL;
> 
> +	block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
> +
>  	switch (info->head.block) {
>  	case AMDGPU_RAS_BLOCK__UMC:
>  		if (adev->umc.ras_funcs &&
> @@ -919,13 +943,17 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>  		}
>  		break;
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_count)
> -			adev->gfx.ras_funcs->query_ras_error_count(adev,
> &err_data);
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> +				get_ras_block_str(&info->head));
> +			return -EINVAL;
> +		}
>[Tao] Can we put the check behind "block_obj = amdgpu_ras_get_ras_block"? The same suggestion to all similar code.
       [Thomas] OK.
> +
> +		if (block_obj->ops->query_ras_error_count)
> +			block_obj->ops->query_ras_error_count(adev,
> &err_data);
> 
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_status)
> -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> +		if (block_obj->ops->query_ras_error_status)
> +			block_obj->ops->query_ras_error_status(adev);
>  		break;
>  	case AMDGPU_RAS_BLOCK__MMHUB:
>  		if (adev->mmhub.ras_funcs &&
> @@ -1012,18 +1040,21 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,  int amdgpu_ras_reset_error_status(struct
> amdgpu_device *adev,
>  		enum amdgpu_ras_block block)
>  {
> +	struct amdgpu_ras_block_object* block_obj = 
> +amdgpu_ras_get_ras_block(adev, block, 0);
>  	if (!amdgpu_ras_is_supported(adev, block))
>  		return -EINVAL;
> 
>  	switch (block) {
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->reset_ras_error_count)
> -			adev->gfx.ras_funcs->reset_ras_error_count(adev);
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> ras_block_str(block));
> +			return -EINVAL;
> +		}
> +		if (block_obj->ops->reset_ras_error_count)
> +			block_obj->ops->reset_ras_error_count(adev);
> 
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->reset_ras_error_status)
> -			adev->gfx.ras_funcs->reset_ras_error_status(adev);
> +		if (block_obj->ops->reset_ras_error_status)
> +			block_obj->ops->reset_ras_error_status(adev);
>  		break;
>  	case AMDGPU_RAS_BLOCK__MMHUB:
>  		if (adev->mmhub.ras_funcs &&
> @@ -1088,7 +1119,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device 
> *adev,
>  		.address = info->address,
>  		.value = info->value,
>  	};
> -	int ret = 0;
> +	int ret = -EINVAL;
> +	struct amdgpu_ras_block_object* block_obj = 
> +amdgpu_ras_get_ras_block(adev, info->head.block,
> +info->head.sub_block_index);
> 
>  	if (!obj)
>  		return -EINVAL;
> @@ -1102,11 +1134,12 @@ int amdgpu_ras_error_inject(struct 
> amdgpu_device *adev,
> 
>  	switch (info->head.block) {
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->ras_error_inject)
> -			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
> -		else
> -			ret = -EINVAL;
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> +			return -EINVAL;
> +		}
> +		if (block_obj->ops->ras_error_inject)
> +			ret = block_obj->ops->ras_error_inject(adev, info);
>  		break;
>  	case AMDGPU_RAS_BLOCK__UMC:
>  	case AMDGPU_RAS_BLOCK__SDMA:
> @@ -1727,15 +1760,20 @@ static void 
> amdgpu_ras_log_on_err_counter(struct
> amdgpu_device *adev)  static void amdgpu_ras_error_status_query(struct
> amdgpu_device *adev,
>  					  struct ras_query_if *info)
>  {
> +	struct amdgpu_ras_block_object* block_obj = 
> +amdgpu_ras_get_ras_block(adev, info->head.block,
> +info->head.sub_block_index);
>  	/*
>  	 * Only two block need to query read/write
>  	 * RspStatus at current state
>  	 */
>  	switch (info->head.block) {
>  	case AMDGPU_RAS_BLOCK__GFX:
> -		if (adev->gfx.ras_funcs &&
> -		    adev->gfx.ras_funcs->query_ras_error_status)
> -			adev->gfx.ras_funcs->query_ras_error_status(adev);
> +		if (!block_obj || !block_obj->ops)	{
> +			dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> +			return ;
> +		}
> +
> +		if (block_obj->ops->query_ras_error_status)
> +			block_obj->ops->query_ras_error_status(adev);
>  		break;
>  	case AMDGPU_RAS_BLOCK__MMHUB:
>  		if (adev->mmhub.ras_funcs &&
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 08e91e7245df..2ffde223c4f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -817,7 +817,7 @@ static int gfx_v9_0_get_cu_info(struct 
> amdgpu_device *adev,  static uint64_t 
> gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);  static 
> void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring); static u64 
> gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); -static int 
> gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device 
> +*adev,
>  					  void *ras_error_status);
>  static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
>  				     void *inject_if);
> @@ -2118,6 +2118,18 @@ static void gfx_v9_0_select_me_pipe_q(struct 
> amdgpu_device *adev,
>  	soc15_grbm_select(adev, me, pipe, q, vm);  }
> 
> +static int gfx_v9_0_ras_block_match(struct amdgpu_ras_block_object* 
> +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if(block_obj->block == block) {
> +		return 0;
> +	}
> +
> +	return -EINVAL;
>[Tao] The return type can be changed to bool and return value is true or false instead of -EINVAL and 0.
       [Thomas] I think the return type is int maybe have more scalability for a unified ops interface.

> +}
>[Tao] It's better to implement a general ras block match function in amdgpu_ras.c
       [Thomas] The match method of mca block is different from other blocks. Others blocks only use block to match, but mac block should use block and sub block index to match.
	          But I can add a default match function in the amdgpu_ras.c, if ip block does't define .ras_block_match function, it will use the default match function in amdgpu_ras.c.
> +
>  static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
>          .get_gpu_clock_counter = &gfx_v9_0_get_gpu_clock_counter,
>          .select_se_sh = &gfx_v9_0_select_se_sh, @@ -2127,12 +2139,21 
> @@ static const struct amdgpu_gfx_funcs gfx_v9_0_gfx_funcs = {
>          .select_me_pipe_q = &gfx_v9_0_select_me_pipe_q,  };
> 
> -static const struct amdgpu_gfx_ras_funcs gfx_v9_0_ras_funcs = {
> -	.ras_late_init = amdgpu_gfx_ras_late_init,
> -	.ras_fini = amdgpu_gfx_ras_fini,
> -	.ras_error_inject = &gfx_v9_0_ras_error_inject,
> -	.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> -	.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count,
> +const struct amdgpu_ras_block_ops  gfx_v9_0_ras_ops = {

>[Tao]  static const?
    [Thomas] OK.
> +		.ras_block_match = gfx_v9_0_ras_block_match,
> +		.ras_late_init = amdgpu_gfx_ras_late_init,
> +		.ras_fini = amdgpu_gfx_ras_fini,
> +		.ras_error_inject = &gfx_v9_0_ras_error_inject,
> +		.query_ras_error_count = &gfx_v9_0_query_ras_error_count,
> +		.reset_ras_error_count = &gfx_v9_0_reset_ras_error_count, };
> +
> +static struct amdgpu_gfx_ras gfx_v9_0_ras = {
> +	.ras_block = {
> +		.name = "gfx",
> +		.block = AMDGPU_RAS_BLOCK__GFX,
> +		.ops = &gfx_v9_0_ras_ops,
> +	},
>  };
> 
>  static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) @@ 
> -2161,7
> +2182,7 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device 
> +*adev)
>  		DRM_INFO("fix gfx.config for vega12\n");
>  		break;
>  	case CHIP_VEGA20:
> -		adev->gfx.ras_funcs = &gfx_v9_0_ras_funcs;
> +		adev->gfx.ras = &gfx_v9_0_ras;
>  		adev->gfx.config.max_hw_contexts = 8;
>  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
>  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> 2187,7 +2208,7 @@ static int gfx_v9_0_gpu_early_init(struct 
> amdgpu_device
> *adev)
>  			gb_addr_config = RAVEN_GB_ADDR_CONFIG_GOLDEN;
>  		break;
>  	case CHIP_ARCTURUS:
> -		adev->gfx.ras_funcs = &gfx_v9_4_ras_funcs;
> +		adev->gfx.ras = &gfx_v9_4_ras;
>  		adev->gfx.config.max_hw_contexts = 8;
>  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
>  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> 2208,7 +2229,7 @@ static int gfx_v9_0_gpu_early_init(struct 
> amdgpu_device
> *adev)
>  		gb_addr_config |= 0x22010042;
>  		break;
>  	case CHIP_ALDEBARAN:
> -		adev->gfx.ras_funcs = &gfx_v9_4_2_ras_funcs;
> +		adev->gfx.ras = &gfx_v9_4_2_ras;
>  		adev->gfx.config.max_hw_contexts = 8;
>  		adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
>  		adev->gfx.config.sc_prim_fifo_size_backend = 0x100; @@ -
> 2227,6 +2248,14 @@ static int gfx_v9_0_gpu_early_init(struct 
> amdgpu_device
> *adev)
>  		break;
>  	}
> 
> +	if (adev->gfx.ras) {
> +		err = amdgpu_ras_register_ras_block(adev, &adev->gfx.ras-
> >ras_block);
> +		if (err) {
> +			DRM_ERROR("Failed to register gfx ras block!\n");
> +			return err;
> +		}
> +	}
> +
>  	adev->gfx.config.gb_addr_config = gb_addr_config;
> 
>  	adev->gfx.config.gb_addr_config_fields.num_pipes = 1 << @@ -2448,9
> +2477,9 @@ static int gfx_v9_0_sw_fini(void *handle)
>  	int i;
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> 
> -	if (adev->gfx.ras_funcs &&
> -	    adev->gfx.ras_funcs->ras_fini)
> -		adev->gfx.ras_funcs->ras_fini(adev);
> +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> +	    adev->gfx.ras->ras_block.ops->ras_fini)
> +		adev->gfx.ras->ras_block.ops->ras_fini(adev);
> 
>  	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>  		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
> @@ -4888,16 +4917,16 @@ static int gfx_v9_0_ecc_late_init(void *handle)
>  	if (r)
>  		return r;
> 
> -	if (adev->gfx.ras_funcs &&
> -	    adev->gfx.ras_funcs->ras_late_init) {
> -		r = adev->gfx.ras_funcs->ras_late_init(adev);
> +	if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> +	    adev->gfx.ras->ras_block.ops->ras_late_init) {
> +		r = adev->gfx.ras->ras_block.ops->ras_late_init(adev);
>  		if (r)
>  			return r;
>  	}
> 
> -	if (adev->gfx.ras_funcs &&
> -	    adev->gfx.ras_funcs->enable_watchdog_timer)
> -		adev->gfx.ras_funcs->enable_watchdog_timer(adev);
> +	if (adev->gfx.ras &&
> +	    adev->gfx.ras->enable_watchdog_timer)
> +		adev->gfx.ras->enable_watchdog_timer(adev);
> 
>  	return 0;
>  }
> @@ -6841,7 +6870,7 @@ static void 
> gfx_v9_0_reset_ras_error_count(struct
> amdgpu_device *adev)
>  	WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);  }
> 
> -static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_0_query_ras_error_count(struct amdgpu_device 
> +*adev,
>  					  void *ras_error_status)
>  {
>  	struct ras_err_data *err_data = (struct ras_err_data 
> *)ras_error_status; @@ -6850,7 +6879,7 @@ static int 
> gfx_v9_0_query_ras_error_count(struct
> amdgpu_device *adev,
>  	uint32_t reg_value;
> 
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> -		return -EINVAL;
> +		return;
> 
>  	err_data->ue_count = 0;
>  	err_data->ce_count = 0;
> @@ -6879,8 +6908,6 @@ static int gfx_v9_0_query_ras_error_count(struct
> amdgpu_device *adev,
>  	mutex_unlock(&adev->grbm_idx_mutex);
> 
>  	gfx_v9_0_query_utc_edc_status(adev, err_data);
> -
> -	return 0;
>  }
> 
>  static void gfx_v9_0_emit_mem_sync(struct amdgpu_ring *ring) diff 
> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> index b4789dfc2bb9..2d816addbd4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
> @@ -863,7 +863,7 @@ static int gfx_v9_4_ras_error_count(struct 
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> -static int gfx_v9_4_query_ras_error_count(struct amdgpu_device *adev,
> +static void gfx_v9_4_query_ras_error_count(struct amdgpu_device 
> +*adev,
>  					  void *ras_error_status)
>  {
>  	struct ras_err_data *err_data = (struct ras_err_data 
> *)ras_error_status; @@ -872,7 +872,7 @@ static int 
> gfx_v9_4_query_ras_error_count(struct
> amdgpu_device *adev,
>  	uint32_t reg_value;
> 
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> -		return -EINVAL;
> +		return;
> 
>  	err_data->ue_count = 0;
>  	err_data->ce_count = 0;
> @@ -903,7 +903,6 @@ static int gfx_v9_4_query_ras_error_count(struct
> amdgpu_device *adev,
> 
>  	gfx_v9_4_query_utc_edc_status(adev, err_data);
> 
> -	return 0;
>  }
> 
>  static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device 
> *adev) @@
> -1029,11 +1028,31 @@ static void 
> gfx_v9_4_query_ras_error_status(struct
> amdgpu_device *adev)
>  	mutex_unlock(&adev->grbm_idx_mutex);
>  }
> 
> -const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs = {
> -        .ras_late_init = amdgpu_gfx_ras_late_init,
> -        .ras_fini = amdgpu_gfx_ras_fini,
> -        .ras_error_inject = &gfx_v9_4_ras_error_inject,
> -        .query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> -        .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> -        .query_ras_error_status = &gfx_v9_4_query_ras_error_status,
> +static int gfx_v9_4_ras_block_match(struct amdgpu_ras_block_object* 
> +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if(block_obj->block == block) {
> +		return 0;
> +	}
> +	return -EINVAL;
> +}
> +
> +const struct amdgpu_ras_block_ops  gfx_v9_4_ras_ops = {
> +	.ras_block_match = gfx_v9_4_ras_block_match,
> +	.ras_late_init = amdgpu_gfx_ras_late_init,
> +	.ras_fini = amdgpu_gfx_ras_fini,
> +	.ras_error_inject = &gfx_v9_4_ras_error_inject,
> +	.query_ras_error_count = &gfx_v9_4_query_ras_error_count,
> +	.reset_ras_error_count = &gfx_v9_4_reset_ras_error_count,
> +	.query_ras_error_status = &gfx_v9_4_query_ras_error_status, };
> +
> +struct amdgpu_gfx_ras gfx_v9_4_ras = {
> +	.ras_block = {
> +		.name = "gfx",
> +		.block = AMDGPU_RAS_BLOCK__GFX,
> +		.ops = &gfx_v9_4_ras_ops,
> +	},
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> index bdd16b568021..ca520a767267 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h
> @@ -24,6 +24,6 @@
>  #ifndef __GFX_V9_4_H__
>  #define __GFX_V9_4_H__
> 
> -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_ras_funcs;
> +extern struct amdgpu_gfx_ras gfx_v9_4_ras;
> 
>  #endif /* __GFX_V9_4_H__ */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> index 54306fd45ff1..2744709fa09d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
> @@ -1644,14 +1644,14 @@ static int 
> gfx_v9_4_2_query_utc_edc_count(struct
> amdgpu_device *adev,
>  	return 0;
>  }
> 
> -static int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device 
> *adev,
> +static void gfx_v9_4_2_query_ras_error_count(struct amdgpu_device 
> +*adev,
>  					    void *ras_error_status)
>  {
>  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
>  	uint32_t sec_count = 0, ded_count = 0;
> 
>  	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> -		return -EINVAL;
> +		return;
> 
>  	err_data->ue_count = 0;
>  	err_data->ce_count = 0;
> @@ -1664,7 +1664,6 @@ static int 
> gfx_v9_4_2_query_ras_error_count(struct
> amdgpu_device *adev,
>  	err_data->ce_count += sec_count;
>  	err_data->ue_count += ded_count;
> 
> -	return 0;
>  }
> 
>  static void gfx_v9_4_2_reset_utc_err_status(struct amdgpu_device 
> *adev) @@
> -1934,13 +1933,34 @@ static void 
> gfx_v9_4_2_reset_sq_timeout_status(struct
> amdgpu_device *adev)
>  	mutex_unlock(&adev->grbm_idx_mutex);
>  }
> 
> -const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs = {
> -	.ras_late_init = amdgpu_gfx_ras_late_init,
> -	.ras_fini = amdgpu_gfx_ras_fini,
> -	.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> -	.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> -	.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> -	.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
> -	.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> +static int gfx_v9_4_2_ras_block_match(struct amdgpu_ras_block_object* 
> +block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index) {
> +	if(!block_obj)
> +		return -EINVAL;
> +
> +	if(block_obj->block == block) {
> +		return 0;
> +	}
> +
> +	return -EINVAL;
> +}
> +
> +struct amdgpu_ras_block_ops  gfx_v9_4_2_ras_ops ={
> +		.ras_block_match = gfx_v9_4_2_ras_block_match,
> +		.ras_late_init = amdgpu_gfx_ras_late_init,
> +		.ras_fini = amdgpu_gfx_ras_fini,
> +		.ras_error_inject = &gfx_v9_4_2_ras_error_inject,
> +		.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
> +		.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
> +		.query_ras_error_status =
> &gfx_v9_4_2_query_ras_error_status,
> +		.reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status,
> +};
> +
> +struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
> +	.ras_block = {
> +		.name = "gfx",
> +		.block = AMDGPU_RAS_BLOCK__GFX,
> +		.ops = &gfx_v9_4_2_ras_ops,
> +	},
>  	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> index 6db1f88509af..7584624b641c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h
> @@ -31,6 +31,6 @@ void gfx_v9_4_2_init_golden_registers(struct
> amdgpu_device *adev,  void gfx_v9_4_2_set_power_brake_sequence(struct
> amdgpu_device *adev);  int gfx_v9_4_2_do_edc_gpr_workarounds(struct
> amdgpu_device *adev);
> 
> -extern const struct amdgpu_gfx_ras_funcs gfx_v9_4_2_ras_funcs;
> +extern struct amdgpu_gfx_ras gfx_v9_4_2_ras;
> 
>  #endif /* __GFX_V9_4_2_H__ */
> --
> 2.25.1


More information about the amd-gfx mailing list