[PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma

Joshi, Mukul Mukul.Joshi at amd.com
Thu Sep 23 14:18:52 UTC 2021


[AMD Official Use Only]



> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Thursday, September 23, 2021 6:05 AM
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang at amd.com>; Clements, John <John.Clements at amd.com>;
> Yang, Stanley <Stanley.Yang at amd.com>
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma
> 
> [CAUTION: External Email]
> 
> In ras poison mode, page retirement will be handled by the irq handler of the
> module which consumes corrupted data.
> 
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>  .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c    | 17 ++++++++++++++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c |  6 ++++--
> drivers/gpu/drm/amd/include/kgd_kfd_interface.h |  2 ++
>  3 files changed, 22 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index 46cd4ee6bafb..27fc4e52aba9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -23,6 +23,20 @@
>  #include "amdgpu_amdkfd.h"
>  #include "amdgpu_amdkfd_arcturus.h"
>  #include "amdgpu_amdkfd_gfx_v9.h"
> +#include "amdgpu_ras.h"
> +#include "amdgpu_umc.h"
> +
> +int kgd_aldebaran_ras_process_cb(struct kgd_dev *kgd) {
> +       struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> +       struct ras_err_data err_data = {0, 0, 0, NULL};
> +
> +       /* cpu mca will handle it if connected_to_cpu is 1 */
> +       if (!adev->gmc.xgmi.connected_to_cpu)
> +               return amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
> +       else
> +               return 0;
> +}
> 
>  const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>         .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
> @@ -44,5 +58,6 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>         .get_atc_vmid_pasid_mapping_info =
>                                 kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
>         .set_vm_context_page_table_base =
> kgd_gfx_v9_set_vm_context_page_table_base,
> -       .program_trap_handler_settings =
> kgd_gfx_v9_program_trap_handler_settings
> +       .program_trap_handler_settings =
> kgd_gfx_v9_program_trap_handler_settings,
> +       .ras_process_cb = kgd_aldebaran_ras_process_cb
>  };
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index 12d91e53556c..4a48b78f918e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -231,7 +231,8 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>                                 if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST
> &&
>                                         sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
>                                         kfd_signal_poison_consumed_event(dev, pasid);
> -                                       amdgpu_amdkfd_gpu_reset(dev->kgd);
> +                                       if (dev->kfd2kgd->ras_process_cb)
> +
> + dev->kfd2kgd->ras_process_cb(dev->kgd);

Firstly, removing the call to amdgpu_amdkfd_gpu_reset() will prevent the GPU from performing a mode-2 reset when an application consumes poison in GFX/SDMA.
That is not correct.

Secondly, page retirement and poison consumption are independent events. You can have page retirement without the application consuming poison.
So, we should not be calling page retirement every time an application consumes poison.

>                                         return;
>                                 }
>                                 break;
> @@ -253,7 +254,8 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
>                         kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
>                 } else if (source_id == SOC15_INTSRC_SDMA_ECC) {
>                         kfd_signal_poison_consumed_event(dev, pasid);
> -                       amdgpu_amdkfd_gpu_reset(dev->kgd);
> +                       if (dev->kfd2kgd->ras_process_cb)
> +                               dev->kfd2kgd->ras_process_cb(dev->kgd);

The same comment as before.

Regards,
Mukul

>                         return;
>                 }
>         } else if (client_id == SOC15_IH_CLIENTID_VMC || diff --git
> a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> index c84bd7b2cf59..828131415901 100644
> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> @@ -301,6 +301,8 @@ struct kfd2kgd_calls {
>                         int *max_waves_per_cu);
>         void (*program_trap_handler_settings)(struct kgd_dev *kgd,
>                         uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr);
> +
> +       int (*ras_process_cb)(struct kgd_dev *kgd);
>  };
> 
>  #endif /* KGD_KFD_INTERFACE_H_INCLUDED */
> --
> 2.17.1


More information about the amd-gfx mailing list