[PATCH V3 1/1] drm/amdkfd: fix set kfd node ras properties value

Chen, Guchun Guchun.Chen at amd.com
Tue Aug 25 02:44:48 UTC 2020


[AMD Public Use]

amd-gfx-bounces distribution list is not correct. Simply correct to amd-gfx DL in mail TO line.

Regards,
Guchun

-----Original Message-----
From: Yang, Stanley <Stanley.Yang at amd.com> 
Sent: Tuesday, August 25, 2020 9:53 AM
To: Chen, Guchun <Guchun.Chen at amd.com>; amd-gfx-bounces at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Tye, Tony <Tony.Tye at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>; Li, Dennis <Dennis.Li at amd.com>
Subject: RE: [PATCH V3 1/1] drm/amdkfd: fix set kfd node ras properties value

[AMD Public Use]

Hi Guchun,

Thanks, I will fix typos and push it to branch.

Regards,
Stanley
> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen at amd.com>
> Sent: Monday, August 24, 2020 9:48 PM
> To: Yang, Stanley <Stanley.Yang at amd.com>; amd-gfx- 
> bounces at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Tye, Tony 
> <Tony.Tye at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>; Li, 
> Dennis <Dennis.Li at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>
> Subject: RE: [PATCH V3 1/1] drm/amdkfd: fix set kfd node ras 
> properties value
> 
> [AMD Public Use]
> 
> Three comments inline.
> 
> With these comments fixed, the patch is:
> Reviewed-by: Guchun Chen <guchun.chen at amd.com>
> 
> Regards,
> Guchun
> 
> -----Original Message-----
> From: Stanley.Yang <Stanley.Yang at amd.com>
> Sent: Monday, August 24, 2020 6:34 PM
> To: amd-gfx-bounces at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Tye, Tony 
> <Tony.Tye at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>; Chen, 
> Guchun <Guchun.Chen at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Yang, 
> Stanley <Stanley.Yang at amd.com>
> Subject: [PATCH V3 1/1] drm/amdkfd: fix set kfd node ras properties 
> value
> 
> The ctx->features are new RAS implementation which is only available 
> for
> Vega20 and onwards, it is not available for vega10, vega10 should 
> follow legacy ECC implementation.
> 
> Changed from V1:
>     wrap function to initialize kfd node properties
> 
> Changed from V2:
>     remove wrap funcion, remove SRMA ECC check
> 
> [Guchun]Spelling typos, not funcion or SRMA. Moreover, it should be 
> more concise by 'remove wrap function and SDMA SRAM ECC check'
> 
> Change-Id: I1e3ff899bf066611fe5775e67104ce2e0bf8b7d0
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 16 ++++++++-------
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 24 
> +++++++++++------------
>  3 files changed, 21 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1f9d97f61aa5..573e2712df35 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -986,6 +986,7 @@ struct amdgpu_device {
> 
>  	atomic_t			throttling_logging_enabled;
>  	struct ratelimit_state		throttling_logging_rs;
> +	uint32_t			ras_features;
>  };
> 
>  static inline struct amdgpu_device *amdgpu_ttm_adev(struct 
> ttm_bo_device *bdev) diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index cd1403f83dcf..d462244863f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1974,7 +1974,8 @@ static void amdgpu_ras_check_supported(struct 
> amdgpu_device *adev,
>  	*supported = 0;
> 
>  	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
> -	    (adev->asic_type != CHIP_VEGA20   &&
> +	    (adev->asic_type != CHIP_VEGA10 &&
> +	     adev->asic_type != CHIP_VEGA20 &&
>  	     adev->asic_type != CHIP_ARCTURUS &&
>  	     adev->asic_type != CHIP_SIENNA_CICHLID)) [Guchun]I suggest 
> moving all ASIC check into one static function. This will benefit 
> later modification if user adds more ASICs to support RAS, and it's 
> indeed more readable.
> 
>  		return;
> @@ -1998,6 +1999,7 @@ static void amdgpu_ras_check_supported(struct 
> amdgpu_device *adev,
> 
>  	*supported = amdgpu_ras_enable == 0 ?
>  			0 : *hw_supported & amdgpu_ras_mask;
> +	adev->ras_features = *supported;
>  }
> 
>  int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2020,9 +2022,9 @@ 
> int amdgpu_ras_init(struct amdgpu_device *adev)
> 
>  	amdgpu_ras_check_supported(adev, &con->hw_supported,
>  			&con->supported);
> -	if (!con->hw_supported) {
> +	if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
>  		r = 0;
> -		goto err_out;
> +		goto release_con;
>  	}
> 
>  	con->features = 0;
> @@ -2033,25 +2035,25 @@ int amdgpu_ras_init(struct amdgpu_device
> *adev)
>  	if (adev->nbio.funcs->init_ras_controller_interrupt) {
>  		r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
>  		if (r)
> -			goto err_out;
> +			goto release_con;
>  	}
> 
>  	if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
>  		r = adev->nbio.funcs-
> >init_ras_err_event_athub_interrupt(adev);
>  		if (r)
> -			goto err_out;
> +			goto release_con;
>  	}
> 
>  	if (amdgpu_ras_fs_init(adev)) {
>  		r = -EINVAL;
> -		goto err_out;
> +		goto release_con;
>  	}
> 
>  	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
>  			"hardware ability[%x] ras_mask[%x]\n",
>  			con->hw_supported, con->supported);
>  	return 0;
> -err_out:
> +release_con:
>  	amdgpu_ras_set_context(adev, NULL);
>  	kfree(con);
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index f185f6cbc05c..0ba960a17ead 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1239,7 +1239,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>  	void *crat_image = NULL;
>  	size_t image_size = 0;
>  	int proximity_domain;
> -	struct amdgpu_ras *ctx;
> +	struct amdgpu_device *adev;
> 
>  	INIT_LIST_HEAD(&temp_topology_device_list);
> 
> @@ -1404,19 +1404,17 @@ int kfd_topology_add_device(struct kfd_dev
> *gpu)
>  		dev->node_props.max_waves_per_simd = 10;
>  	}
> 
> -	ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu-
> >kgd));
> -	if (ctx) {
> -		/* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on
> UMC */
> -		dev->node_props.capability |=
> -			(((ctx->features &
> BIT(AMDGPU_RAS_BLOCK__SDMA)) != 0) ||
> -			 ((ctx->features &
> BIT(AMDGPU_RAS_BLOCK__GFX)) != 0)) ?
> -			HSA_CAP_SRAM_EDCSUPPORTED : 0;
> -		dev->node_props.capability |= ((ctx->features &
> BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
> -			HSA_CAP_MEM_EDCSUPPORTED : 0;
> -
> -		dev->node_props.capability |= (ctx->features != 0) ?
> +	adev = (struct amdgpu_device *)(dev->gpu->kgd);
> +	/* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on UMC
> */
> [Guchun]Modify comment to drop SDMA.
> 
> +	dev->node_props.capability |=
> +		((adev->ras_features &
> BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
> +		HSA_CAP_SRAM_EDCSUPPORTED : 0;
> +	dev->node_props.capability |= ((adev->ras_features &
> BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
> +		HSA_CAP_MEM_EDCSUPPORTED : 0;
> +
> +	if (adev->asic_type != CHIP_VEGA10)
> +		dev->node_props.capability |= (adev->ras_features != 0) ?
>  			HSA_CAP_RASEVENTNOTIFY : 0;
> -	}
> 
>  	kfd_debug_print_topology();
> 
> --
> 2.25.1


More information about the amd-gfx mailing list