[PATCH V3 1/1] drm/amdkfd: fix set kfd node ras properties value

Felix Kuehling felix.kuehling at amd.com
Mon Aug 24 16:55:55 UTC 2020


The KFD part looks good to me, other than the SDMA comment that Guchun
pointed out. With that fixed this patch is

Acked-by: Felix Kuehling <Felix.Kuehling at amd.com>

Thanks,
  Felix


Am 2020-08-24 um 6:33 a.m. schrieb Stanley.Yang:
> The ctx->features are new RAS implementation which
> is only available for Vega20 and onwards, it is not
> available for vega10, vega10 should follow legacy
> ECC implementation.
>
> Changed from V1:
>     wrap function to initialize kfd node properties
>
> Changed from V2:
>     remove wrap funcion, remove SRMA ECC check
>
> Change-Id: I1e3ff899bf066611fe5775e67104ce2e0bf8b7d0
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 16 ++++++++-------
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 24 +++++++++++------------
>  3 files changed, 21 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1f9d97f61aa5..573e2712df35 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -986,6 +986,7 @@ struct amdgpu_device {
>  
>  	atomic_t			throttling_logging_enabled;
>  	struct ratelimit_state		throttling_logging_rs;
> +	uint32_t			ras_features;
>  };
>  
>  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index cd1403f83dcf..d462244863f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1974,7 +1974,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
>  	*supported = 0;
>  
>  	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
> -	    (adev->asic_type != CHIP_VEGA20   &&
> +	    (adev->asic_type != CHIP_VEGA10 &&
> +	     adev->asic_type != CHIP_VEGA20 &&
>  	     adev->asic_type != CHIP_ARCTURUS &&
>  	     adev->asic_type != CHIP_SIENNA_CICHLID))
>  		return;
> @@ -1998,6 +1999,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
>  
>  	*supported = amdgpu_ras_enable == 0 ?
>  			0 : *hw_supported & amdgpu_ras_mask;
> +	adev->ras_features = *supported;
>  }
>  
>  int amdgpu_ras_init(struct amdgpu_device *adev)
> @@ -2020,9 +2022,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>  
>  	amdgpu_ras_check_supported(adev, &con->hw_supported,
>  			&con->supported);
> -	if (!con->hw_supported) {
> +	if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
>  		r = 0;
> -		goto err_out;
> +		goto release_con;
>  	}
>  
>  	con->features = 0;
> @@ -2033,25 +2035,25 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>  	if (adev->nbio.funcs->init_ras_controller_interrupt) {
>  		r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
>  		if (r)
> -			goto err_out;
> +			goto release_con;
>  	}
>  
>  	if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
>  		r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
>  		if (r)
> -			goto err_out;
> +			goto release_con;
>  	}
>  
>  	if (amdgpu_ras_fs_init(adev)) {
>  		r = -EINVAL;
> -		goto err_out;
> +		goto release_con;
>  	}
>  
>  	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
>  			"hardware ability[%x] ras_mask[%x]\n",
>  			con->hw_supported, con->supported);
>  	return 0;
> -err_out:
> +release_con:
>  	amdgpu_ras_set_context(adev, NULL);
>  	kfree(con);
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index f185f6cbc05c..0ba960a17ead 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1239,7 +1239,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>  	void *crat_image = NULL;
>  	size_t image_size = 0;
>  	int proximity_domain;
> -	struct amdgpu_ras *ctx;
> +	struct amdgpu_device *adev;
>  
>  	INIT_LIST_HEAD(&temp_topology_device_list);
>  
> @@ -1404,19 +1404,17 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>  		dev->node_props.max_waves_per_simd = 10;
>  	}
>  
> -	ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu->kgd));
> -	if (ctx) {
> -		/* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on UMC */
> -		dev->node_props.capability |=
> -			(((ctx->features & BIT(AMDGPU_RAS_BLOCK__SDMA)) != 0) ||
> -			 ((ctx->features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0)) ?
> -			HSA_CAP_SRAM_EDCSUPPORTED : 0;
> -		dev->node_props.capability |= ((ctx->features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
> -			HSA_CAP_MEM_EDCSUPPORTED : 0;
> -
> -		dev->node_props.capability |= (ctx->features != 0) ?
> +	adev = (struct amdgpu_device *)(dev->gpu->kgd);
> +	/* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on UMC */
> +	dev->node_props.capability |=
> +		((adev->ras_features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
> +		HSA_CAP_SRAM_EDCSUPPORTED : 0;
> +	dev->node_props.capability |= ((adev->ras_features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
> +		HSA_CAP_MEM_EDCSUPPORTED : 0;
> +
> +	if (adev->asic_type != CHIP_VEGA10)
> +		dev->node_props.capability |= (adev->ras_features != 0) ?
>  			HSA_CAP_RASEVENTNOTIFY : 0;
> -	}
>  
>  	kfd_debug_print_topology();
>  


More information about the amd-gfx mailing list