答复: [PATCH Review 1/1] drm/amdgpu: support ras on SRIOV

Yang, Stanley Stanley.Yang at amd.com
Thu May 19 02:32:45 UTC 2022


[AMD Official Use Only - General]


[AMD Official Use Only - General]
Thanks Tao, will update before submitting.

Regards,
Stanley
发件人: Zhou1, Tao <Tao.Zhou1 at amd.com>
日期: 星期四, 2022年5月19日 上午10:30
收件人: Yang, Stanley <Stanley.Yang at amd.com>, amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>, Zhang, Hawking <Hawking.Zhang at amd.com>
抄送: Yang, Stanley <Stanley.Yang at amd.com>
主题: RE: [PATCH Review 1/1] drm/amdgpu: support ras on SRIOV


> -----Original Message-----
> From: Stanley.Yang <Stanley.Yang at amd.com>
> Sent: Wednesday, May 18, 2022 11:44 PM
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Cc: Yang, Stanley <Stanley.Yang at amd.com>
> Subject: [PATCH Review 1/1] drm/amdgpu: support ras on SRIOV
>
> support umc/gfx/sdma ras on guest side
>
> Changed from V1:
>     move sriov judgment in amdgpu_ras_interrupt_fatal_error_handler
>
> Change-Id: Ic7dda45d8f8cf2d5f1abc7705abc153d558da8a1
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 42 ++++++++++++++++------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   |  4 +++
>  drivers/gpu/drm/amd/amdgpu/psp_v13_0.c     |  9 +++--
>  4 files changed, 45 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b583026dc893..ba7990d0dc0e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5218,6 +5218,10 @@ int amdgpu_device_gpu_recover_imp(struct
> amdgpu_device *adev,
>                r = amdgpu_device_reset_sriov(adev, job ? false : true);
>                if (r)
>                        adev->asic_reset_res = r;
> +
> +             /* Aldebaran supports ras in SRIOV, so need resume ras during
> reset */
> +             if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
> +                     amdgpu_ras_resume(adev);
>        } else {
>                r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
>                if (r && r == -EAGAIN)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index a653cf3b3d13..2b28210c4994 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -726,7 +726,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device
> *adev,
>        /* Do not enable if it is not allowed. */
>        WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
>
> -     if (!amdgpu_ras_intr_triggered()) {
> +     /* Only enable ras feature operation handle on host side */
> +     if (!amdgpu_sriov_vf(adev) &&
> +             !amdgpu_ras_intr_triggered()) {
>                ret = psp_ras_enable_features(&adev->psp, info, enable);
>                if (ret) {
>                        dev_err(adev->dev, "ras %s %s failed poison:%d
> ret:%d\n", @@ -1523,6 +1525,10 @@ static int amdgpu_ras_fs_fini(struct
> amdgpu_device *adev)
>   */
>  void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)  {
> +     /* Fatal error events are handled on host side */
> +     if (amdgpu_sriov_vf(adev))
> +             return;
> +
>        if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
>                return;

[Tao] The two conditions above can be merged; other than that, the patch is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

>
> @@ -2270,10 +2276,14 @@ static void amdgpu_ras_check_supported(struct
> amdgpu_device *adev)  {
>        adev->ras_hw_enabled = adev->ras_enabled = 0;
>
> -     if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
> +     if (!adev->is_atom_fw ||
>            !amdgpu_ras_asic_supported(adev))
>                return;
>
> +     if (!(amdgpu_sriov_vf(adev) &&
> +             (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))))
> +             return;
> +
>        if (!adev->gmc.xgmi.connected_to_cpu) {
>                if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
>                        dev_info(adev->dev, "MEM ECC is active.\n"); @@ -
> 2285,15 +2295,21 @@ static void amdgpu_ras_check_supported(struct
> amdgpu_device *adev)
>
>                if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
>                        dev_info(adev->dev, "SRAM ECC is active.\n");
> -                     adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> -                                                 1 <<
> AMDGPU_RAS_BLOCK__DF);
> -
> -                     if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2,
> 6, 0))
> -                             adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> -                                             1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> -                     else
> -                             adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> -                                             1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +                     if (!amdgpu_sriov_vf(adev)) {
> +                             adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> +                                                         1 <<
> AMDGPU_RAS_BLOCK__DF);
> +
> +                             if (adev->ip_versions[VCN_HWIP][0] ==
> IP_VERSION(2, 6, 0))
> +                                     adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +                                                     1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +                             else
> +                                     adev->ras_hw_enabled &= ~(1 <<
> AMDGPU_RAS_BLOCK__VCN |
> +                                                     1 <<
> AMDGPU_RAS_BLOCK__JPEG);
> +                     } else {
> +                             adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> +                                                             1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> +                                                             1 <<
> AMDGPU_RAS_BLOCK__GFX);
> +                     }
>                } else {
>                        dev_info(adev->dev, "SRAM ECC is not presented.\n");
>                }
> @@ -2637,6 +2653,10 @@ int amdgpu_ras_late_init(struct amdgpu_device
> *adev)
>        struct amdgpu_ras_block_object *obj;
>        int r;
>
> +     /* Guest side doesn't need init ras feature */
> +     if (amdgpu_sriov_vf(adev))
> +             return 0;
> +
>        list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
>                if (!node->ras_obj) {
>                        dev_warn(adev->dev, "Warning: abnormal ras list
> node.\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 8e221a1ba937..42c1f050542f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -124,6 +124,10 @@ int amdgpu_sdma_process_ras_data_cb(struct
> amdgpu_device *adev,
>                struct amdgpu_iv_entry *entry)
>  {
>        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +
> +     if (amdgpu_sriov_vf(adev))
> +             return AMDGPU_RAS_SUCCESS;
> +
>        amdgpu_ras_reset_gpu(adev);
>
>        return AMDGPU_RAS_SUCCESS;
> diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> index d6d79e97def9..18014ed0e853 100644
> --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> @@ -85,9 +85,12 @@ static int psp_v13_0_init_microcode(struct psp_context
> *psp)
>                err = psp_init_sos_microcode(psp, chip_name);
>                if (err)
>                        return err;
> -             err = psp_init_ta_microcode(&adev->psp, chip_name);
> -             if (err)
> -                     return err;
> +             /* It's not necessary to load ras ta on Guest side */
> +             if (!amdgpu_sriov_vf(adev)) {
> +                     err = psp_init_ta_microcode(&adev->psp, chip_name);
> +                     if (err)
> +                             return err;
> +             }
>                break;
>        case IP_VERSION(13, 0, 1):
>        case IP_VERSION(13, 0, 3):
> --
> 2.17.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20220519/e5948812/attachment-0001.htm>


More information about the amd-gfx mailing list