[PATCH 1/2] drm/amdgpu: support RAS error inject for SRIOV

Thu Sep 1 02:41:51 UTC 2022

+ Andy and Jerry,

Hi Tao & Stanley,

Due to security concerns, we cannot define such a PF-VF mailbox message for ECC or RAS error injection. As Vignesh said, we might need a test script to communicate between host and guest.

Thanks,
HaiJun

-----Original Message-----
From: Yang, Stanley <Stanley.Yang at amd.com> 
Sent: Thursday, September 1, 2022 10:24 AM
To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Liu, Monk <Monk.Liu at amd.com>; Skvortsov, Victor <Victor.Skvortsov at amd.com>; Chang, HaiJun <HaiJun.Chang at amd.com>; Chander, Vignesh <Vignesh.Chander at amd.com>; Wan, Gavin <Gavin.Wan at amd.com>; Liu, Shaoyun <Shaoyun.Liu at amd.com>
Subject: RE: [PATCH 1/2] drm/amdgpu: support RAS error inject for SRIOV

[AMD Official Use Only - General]

The series looks fine to me; these patches also need to be reviewed by the virtualization group.

Regards,
Stanley
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Sent: Wednesday, August 31, 2022 4:39 PM
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking 
> <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Liu, 
> Monk <Monk.Liu at amd.com>; Skvortsov, Victor <Victor.Skvortsov at amd.com>; 
> Chang, HaiJun <HaiJun.Chang at amd.com>; Chander, Vignesh 
> <Vignesh.Chander at amd.com>; Wan, Gavin <Gavin.Wan at amd.com>; Liu, 
> Shaoyun <Shaoyun.Liu at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 1/2] drm/amdgpu: support RAS error inject for SRIOV
> 
> In SRIOV, a RAS error injection request is sent to the PF via mailbox, and
> the injection input information should also be transferred to the PF.
> 
> Generally, error injection is performed directly on the PF side, but
> for the RAS poison test, since the workload is launched on the VF side,
> the VF has to tell the PF about the injection information.
> 
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 26 ++++++++++++++++------
> --  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c    | 24
> ++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h    |  9 ++++++++
>  4 files changed, 53 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index ab9ba5a9c33d..498642eb5fb7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1103,15 +1103,25 @@ int amdgpu_ras_error_inject(struct 
> amdgpu_device *adev,
>  							  block_info.address);
>  	}
> 
> -	if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
> -		if (block_obj->hw_ops->ras_error_inject)
> -			ret = block_obj->hw_ops->ras_error_inject(adev,
> info);
> +	if (!amdgpu_sriov_vf(adev)) {
> +		if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
> +			if (block_obj->hw_ops->ras_error_inject)
> +				ret = block_obj->hw_ops-
> >ras_error_inject(adev, info);
> +		} else {
> +			/* If defined special ras_error_inject(e.g: xgmi),
> implement special ras_error_inject */
> +			if (block_obj->hw_ops->ras_error_inject)
> +				ret = block_obj->hw_ops-
> >ras_error_inject(adev, &block_info);
> +			else  /*If not defined .ras_error_inject, use default
> ras_error_inject*/
> +				ret = psp_ras_trigger_error(&adev->psp,
> &block_info);
> +		}
>  	} else {
> -		/* If defined special ras_error_inject(e.g: xgmi), implement
> special ras_error_inject */
> -		if (block_obj->hw_ops->ras_error_inject)
> -			ret = block_obj->hw_ops->ras_error_inject(adev,
> &block_info);
> -		else  /*If not defined .ras_error_inject, use default
> ras_error_inject*/
> -			ret = psp_ras_trigger_error(&adev->psp,
> &block_info);
> +		if (adev->virt.ops && adev->virt.ops->ras_trigger_error) {
> +			adev->virt.ops->ras_trigger_error(adev, &block_info);
> +			ret = 0;
> +		} else {
> +			dev_warn(adev->dev,
> +				"No ras_trigger_error interface in SRIOV!\n");
> +		}
>  	}
> 
>  	if (ret)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 239f232f9c02..4534e6f70a4b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -84,6 +84,8 @@ struct amdgpu_virt_ops {
>  	int (*reset_gpu)(struct amdgpu_device *adev);
>  	int (*wait_reset)(struct amdgpu_device *adev);
>  	void (*trans_msg)(struct amdgpu_device *adev, u32 req, u32 data1,
> u32 data2, u32 data3);
> +	void (*ras_trigger_error)(struct amdgpu_device *adev,
> +				struct ta_ras_trigger_error_input *info);
>  };
> 
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index a2f04b249132..3b4c5162a237 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -33,6 +33,7 @@
>  #include "mxgpu_ai.h"
> 
>  #include "amdgpu_reset.h"
> +#include "ta_ras_if.h"
> 
>  static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)  { 
> @@ -
> 405,6 +406,28 @@ static int xgpu_ai_request_init_data(struct 
> amdgpu_device *adev)
>  	return xgpu_ai_send_access_requests(adev,
> IDH_REQ_GPU_INIT_DATA);  }
> 
> +void amdgpu_virt_ras_trigger_error(struct amdgpu_device *adev,
> +               struct ta_ras_trigger_error_input *info) {
> +       uint32_t addr_lo, addr_hi, data1 = 0;
> +
> +       addr_lo = lower_32_bits(info->address);
> +       addr_hi = upper_32_bits(info->address);
> +
> +       /* block id : bits[0:4], inject_error_type : bits[5:8]
> +        * sub_block_index : bits[9:17], value : bits[18:19]
> +        */
> +       data1 = info->block_id & RAS_BLOCK_ID_MASK;
> +       data1 |= (info->inject_error_type & 
> + RAS_INJECT_ERROR_TYPE_MASK)
> <<
> +			RAS_INJECT_ERROR_TYPE_SHIFT;
> +       data1 |= (info->sub_block_index & RAS_SUB_BLOCK_INDEX_MASK) <<
> +			RAS_SUB_BLOCK_INDEX_SHIFT;
> +       data1 |= (info->value & RAS_VALUE_MASK) << RAS_VALUE_SHIFT;
> +
> +       xgpu_ai_mailbox_trans_msg(adev, IDH_RAS_ERROR_INJECT, data1,
> +				addr_lo, addr_hi);
> +}
> +
>  const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
>  	.req_full_gpu	= xgpu_ai_request_full_gpu_access,
>  	.rel_full_gpu	= xgpu_ai_release_full_gpu_access,
> @@ -412,4 +435,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
>  	.wait_reset = NULL,
>  	.trans_msg = xgpu_ai_mailbox_trans_msg,
>  	.req_init_data  = xgpu_ai_request_init_data,
> +	.ras_trigger_error = amdgpu_virt_ras_trigger_error,
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> index fa7e13e0459e..0841d6632328 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> @@ -29,6 +29,14 @@
>  #define AI_MAILBOX_POLL_FLR_TIMEDOUT	10000
>  #define AI_MAILBOX_POLL_MSG_REP_MAX	11
> 
> +#define RAS_BLOCK_ID_MASK 0x1f
> +#define RAS_INJECT_ERROR_TYPE_MASK 0xf #define 
> +RAS_INJECT_ERROR_TYPE_SHIFT 5 #define RAS_SUB_BLOCK_INDEX_MASK 0x1ff 
> +#define RAS_SUB_BLOCK_INDEX_SHIFT 9 #define RAS_VALUE_MASK 0x3 
> +#define RAS_VALUE_SHIFT 18
> +
>  enum idh_request {
>  	IDH_REQ_GPU_INIT_ACCESS = 1,
>  	IDH_REL_GPU_INIT_ACCESS,
> @@ -39,6 +47,7 @@ enum idh_request {
> 
>  	IDH_LOG_VF_ERROR       = 200,
>  	IDH_READY_TO_RESET 	= 201,
> +	IDH_RAS_ERROR_INJECT = 202,
>  };
> 
>  enum idh_event {
> --
> 2.35.1