[PATCH 1/2] drm/amdgpu: support RAS error inject for SRIOV
Chang, HaiJun
HaiJun.Chang at amd.com
Thu Sep 1 02:41:51 UTC 2022
+ Andy and Jerry,
Hi Tao & Stanley,
Due to security concerns, we cannot define such a PF-VF mailbox message for ECC or RAS error injection. As Vignesh said, we might need a test script to communicate between host and guest.
Thanks,
HaiJun
-----Original Message-----
From: Yang, Stanley <Stanley.Yang at amd.com>
Sent: Thursday, September 1, 2022 10:24 AM
To: Zhou1, Tao <Tao.Zhou1 at amd.com>; amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Liu, Monk <Monk.Liu at amd.com>; Skvortsov, Victor <Victor.Skvortsov at amd.com>; Chang, HaiJun <HaiJun.Chang at amd.com>; Chander, Vignesh <Vignesh.Chander at amd.com>; Wan, Gavin <Gavin.Wan at amd.com>; Liu, Shaoyun <Shaoyun.Liu at amd.com>
Subject: RE: [PATCH 1/2] drm/amdgpu: support RAS error inject for SRIOV
[AMD Official Use Only - General]
The series looks fine to me; these patches also need to be reviewed by the virtualization group.
Regards,
Stanley
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Sent: Wednesday, August 31, 2022 4:39 PM
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Liu,
> Monk <Monk.Liu at amd.com>; Skvortsov, Victor <Victor.Skvortsov at amd.com>;
> Chang, HaiJun <HaiJun.Chang at amd.com>; Chander, Vignesh
> <Vignesh.Chander at amd.com>; Wan, Gavin <Gavin.Wan at amd.com>; Liu,
> Shaoyun <Shaoyun.Liu at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 1/2] drm/amdgpu: support RAS error inject for SRIOV
>
> In SRIOV, RAS error injection request will be sent to PF via mailbox,
> the injection input information should also be transferred to PF.
>
> Generally, the error injection is operated on PF side directly, but
> for RAS poison test, since workload is launched on VF side, VF has to
> tell PF about the injection information.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 26 ++++++++++++++++--------
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 2 ++
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 24 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 9 ++++++++
> 4 files changed, 53 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index ab9ba5a9c33d..498642eb5fb7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1103,15 +1103,25 @@ int amdgpu_ras_error_inject(struct
> amdgpu_device *adev,
> block_info.address);
> }
>
> - if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
> - if (block_obj->hw_ops->ras_error_inject)
> - ret = block_obj->hw_ops->ras_error_inject(adev,
> info);
> + if (!amdgpu_sriov_vf(adev)) {
> + if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
> + if (block_obj->hw_ops->ras_error_inject)
> + ret = block_obj->hw_ops->ras_error_inject(adev, info);
> + } else {
> + /* If defined special ras_error_inject(e.g: xgmi),
> implement special ras_error_inject */
> + if (block_obj->hw_ops->ras_error_inject)
> + ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
> + else /*If not defined .ras_error_inject, use default
> ras_error_inject*/
> + ret = psp_ras_trigger_error(&adev->psp,
> &block_info);
> + }
> } else {
> - /* If defined special ras_error_inject(e.g: xgmi), implement
> special ras_error_inject */
> - if (block_obj->hw_ops->ras_error_inject)
> - ret = block_obj->hw_ops->ras_error_inject(adev,
> &block_info);
> - else /*If not defined .ras_error_inject, use default
> ras_error_inject*/
> - ret = psp_ras_trigger_error(&adev->psp,
> &block_info);
> + if (adev->virt.ops && adev->virt.ops->ras_trigger_error) {
> + adev->virt.ops->ras_trigger_error(adev, &block_info);
> + ret = 0;
> + } else {
> + dev_warn(adev->dev,
> + "No ras_trigger_error interface in SRIOV!\n");
> + }
> }
>
> if (ret)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 239f232f9c02..4534e6f70a4b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -84,6 +84,8 @@ struct amdgpu_virt_ops {
> int (*reset_gpu)(struct amdgpu_device *adev);
> int (*wait_reset)(struct amdgpu_device *adev);
> void (*trans_msg)(struct amdgpu_device *adev, u32 req, u32 data1,
> u32 data2, u32 data3);
> + void (*ras_trigger_error)(struct amdgpu_device *adev,
> + struct ta_ras_trigger_error_input *info);
> };
>
> /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index a2f04b249132..3b4c5162a237 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -33,6 +33,7 @@
> #include "mxgpu_ai.h"
>
> #include "amdgpu_reset.h"
> +#include "ta_ras_if.h"
>
> static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)
> {
> @@ -405,6 +406,28 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
> return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);
> }
>
> +void amdgpu_virt_ras_trigger_error(struct amdgpu_device *adev,
> + struct ta_ras_trigger_error_input *info)
> +{
> + uint32_t addr_lo, addr_hi, data1 = 0;
> +
> + addr_lo = lower_32_bits(info->address);
> + addr_hi = upper_32_bits(info->address);
> +
> + /* block id : bits[0:4], inject_error_type : bits[5:8]
> + * sub_block_index : bits[9:17], value : bits[18:19]
> + */
> + data1 = info->block_id & RAS_BLOCK_ID_MASK;
> + data1 |= (info->inject_error_type & RAS_INJECT_ERROR_TYPE_MASK) <<
> + RAS_INJECT_ERROR_TYPE_SHIFT;
> + data1 |= (info->sub_block_index & RAS_SUB_BLOCK_INDEX_MASK) <<
> + RAS_SUB_BLOCK_INDEX_SHIFT;
> + data1 |= (info->value & RAS_VALUE_MASK) << RAS_VALUE_SHIFT;
> +
> + xgpu_ai_mailbox_trans_msg(adev, IDH_RAS_ERROR_INJECT, data1,
> + addr_lo, addr_hi);
> +}
> +
> const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
> .req_full_gpu = xgpu_ai_request_full_gpu_access,
> .rel_full_gpu = xgpu_ai_release_full_gpu_access,
> @@ -412,4 +435,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
> .wait_reset = NULL,
> .trans_msg = xgpu_ai_mailbox_trans_msg,
> .req_init_data = xgpu_ai_request_init_data,
> + .ras_trigger_error = amdgpu_virt_ras_trigger_error,
> };
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> index fa7e13e0459e..0841d6632328 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
> @@ -29,6 +29,14 @@
> #define AI_MAILBOX_POLL_FLR_TIMEDOUT 10000
> #define AI_MAILBOX_POLL_MSG_REP_MAX 11
>
> +#define RAS_BLOCK_ID_MASK 0x1f
> +#define RAS_INJECT_ERROR_TYPE_MASK 0xf
> +#define RAS_INJECT_ERROR_TYPE_SHIFT 5
> +#define RAS_SUB_BLOCK_INDEX_MASK 0x1ff
> +#define RAS_SUB_BLOCK_INDEX_SHIFT 9
> +#define RAS_VALUE_MASK 0x3
> +#define RAS_VALUE_SHIFT 18
> +
> enum idh_request {
> IDH_REQ_GPU_INIT_ACCESS = 1,
> IDH_REL_GPU_INIT_ACCESS,
> @@ -39,6 +47,7 @@ enum idh_request {
>
> IDH_LOG_VF_ERROR = 200,
> IDH_READY_TO_RESET = 201,
> + IDH_RAS_ERROR_INJECT = 202,
> };
>
> enum idh_event {
> --
> 2.35.1
More information about the amd-gfx
mailing list