[PATCH] drm/amdgpu: Add debug option to enable mode2 for poison recovery

Zhou1, Tao Tao.Zhou1 at amd.com
Mon Aug 12 06:03:58 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang at amd.com>
> Sent: Monday, August 12, 2024 11:26 AM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: Add debug option to enable mode2 for poison
> recovery
>
> Add a debug option to enable mode2 reset for poison recovery, for testing purposes only.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h             |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c         |  6 ++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 16 ++++++++++------
>  3 files changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e6b641cb362a..c34819f947ed 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1201,6 +1201,7 @@ struct amdgpu_device {
>       bool                            debug_disable_soft_recovery;
>       bool                            debug_use_vram_fw_buf;
>       bool                            debug_enable_ras_aca;
> +     bool                            debug_mode2_for_poison_recovery;
>  };
>
>  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index afe3b8bd35a1..be6b920933d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -133,6 +133,7 @@ enum AMDGPU_DEBUG_MASK {
>       AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
>       AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
>       AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
> +     AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY = BIT(5),
>  };
>
>  unsigned int amdgpu_vram_limit = UINT_MAX; @@ -2229,6 +2230,11 @@ static
> void amdgpu_init_debug_options(struct amdgpu_device *adev)
>               pr_info("debug: enable RAS ACA\n");
>               adev->debug_enable_ras_aca = true;
>       }
> +
> +     if (amdgpu_debug_mask &
> AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY) {
> +             pr_info("debug: enable mode2 reset for poison consumption
> recovery");
> +             adev->debug_mode2_for_poison_recovery = true;
> +     }
>  }
>
>  static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long
> flags) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index 816800555f7f..a355b2bc2214 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -164,10 +164,12 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>       case SOC15_IH_CLIENTID_SE3SH:
>       case SOC15_IH_CLIENTID_UTCL2:
>               block = AMDGPU_RAS_BLOCK__GFX;
> -             if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) ==
> IP_VERSION(9, 4, 3))
> -                     reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> -             else
> +             if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) ==
> IP_VERSION(9, 4, 3)) {
> +                     reset = ((dev->adev-
> >debug_mode2_for_poison_recovery) ?
> +                              AMDGPU_RAS_GPU_RESET_MODE2_RESET :
> AMDGPU_RAS_GPU_RESET_MODE1_RESET);
> +             } else {
>                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +             }
>               break;
>       case SOC15_IH_CLIENTID_VMC:
>       case SOC15_IH_CLIENTID_VMC1:
> @@ -180,10 +182,12 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>       case SOC15_IH_CLIENTID_SDMA3:
>       case SOC15_IH_CLIENTID_SDMA4:
>               block = AMDGPU_RAS_BLOCK__SDMA;
> -             if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) ==
> IP_VERSION(9, 4, 3))
> -                     reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> -             else
> +             if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) ==
> IP_VERSION(9, 4, 3)) {
> +                     reset = ((dev->adev-
> >debug_mode2_for_poison_recovery) ?
> +                              AMDGPU_RAS_GPU_RESET_MODE2_RESET :
> AMDGPU_RAS_GPU_RESET_MODE1_RESET);
> +             } else {
>                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +             }
>               break;
>       default:
>               dev_warn(dev->adev->dev,
> --
> 2.17.1



More information about the amd-gfx mailing list