[PATCH] drm/amdgpu: Add debug option to enable mode2 for poison recovery
Zhou1, Tao
Tao.Zhou1 at amd.com
Mon Aug 12 03:52:15 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang at amd.com>
> Sent: Monday, August 12, 2024 11:26 AM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: Add debug option to enable mode2 for poison
> recovery
>
> Add debug option to enable mode2 for poison recovery for testing purposes only.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++++++
> drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 16 ++++++++++------
> 3 files changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e6b641cb362a..c34819f947ed 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1201,6 +1201,7 @@ struct amdgpu_device {
> bool debug_disable_soft_recovery;
> bool debug_use_vram_fw_buf;
> bool debug_enable_ras_aca;
> + bool debug_mode2_for_poison_recovery;
> };
>
> static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, diff
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index afe3b8bd35a1..be6b920933d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -133,6 +133,7 @@ enum AMDGPU_DEBUG_MASK {
> AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
> AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
> AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
> + AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY = BIT(5),
> };
>
> unsigned int amdgpu_vram_limit = UINT_MAX; @@ -2229,6 +2230,11 @@ static
> void amdgpu_init_debug_options(struct amdgpu_device *adev)
> pr_info("debug: enable RAS ACA\n");
> adev->debug_enable_ras_aca = true;
> }
> +
> + if (amdgpu_debug_mask &
> AMDGPU_DEBUG_MODE2_FOR_POISON_RECOVERY) {
> + pr_info("debug: enable mode2 reset for poison consumption
> recovery");
> + adev->debug_mode2_for_poison_recovery = true;
> + }
> }
>
> static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long
> flags) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index 816800555f7f..a355b2bc2214 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -164,10 +164,12 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> case SOC15_IH_CLIENTID_SE3SH:
> case SOC15_IH_CLIENTID_UTCL2:
> block = AMDGPU_RAS_BLOCK__GFX;
> - if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) ==
> IP_VERSION(9, 4, 3))
> - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> - else
> + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) ==
> IP_VERSION(9, 4, 3)) {
> + reset = ((dev->adev-
> >debug_mode2_for_poison_recovery) ?
> + AMDGPU_RAS_GPU_RESET_MODE2_RESET :
> AMDGPU_RAS_GPU_RESET_MODE1_RESET);
[Tao] Can we apply the debug option to all ASICs?
> + } else {
> reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + }
> break;
> case SOC15_IH_CLIENTID_VMC:
> case SOC15_IH_CLIENTID_VMC1:
> @@ -180,10 +182,12 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> case SOC15_IH_CLIENTID_SDMA3:
> case SOC15_IH_CLIENTID_SDMA4:
> block = AMDGPU_RAS_BLOCK__SDMA;
> - if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) ==
> IP_VERSION(9, 4, 3))
> - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> - else
> + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) ==
> IP_VERSION(9, 4, 3)) {
> + reset = ((dev->adev-
> >debug_mode2_for_poison_recovery) ?
> + AMDGPU_RAS_GPU_RESET_MODE2_RESET :
> AMDGPU_RAS_GPU_RESET_MODE1_RESET);
> + } else {
> reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + }
> break;
> default:
> dev_warn(dev->adev->dev,
> --
> 2.17.1
More information about the amd-gfx mailing list