[PATCH] drm/amdgpu: Use driver mode reset for data poison
Zhou1, Tao
Tao.Zhou1 at amd.com
Tue Apr 16 06:21:56 UTC 2024
[AMD Official Use Only - General]
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang at amd.com>
> Sent: Tuesday, April 16, 2024 2:16 PM
> To: amd-gfx at lists.freedesktop.org; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison
>
> Mode-2 reset is the only reliable method that can get GC/SDMA back when
> poison is consumed. MMHUB requires mode-1 reset.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 27 ++++++-------------
> 1 file changed, 8 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index c368c70df3f4a..c3beb872adf8d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -144,7 +144,7 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> uint16_t pasid, uint16_t client_id) {
> enum amdgpu_ras_block block = 0;
> - int old_poison, ret = -EINVAL;
> + int old_poison;
> uint32_t reset = 0;
> struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>
> @@ -163,17 +163,13 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> case SOC15_IH_CLIENTID_SE2SH:
> case SOC15_IH_CLIENTID_SE3SH:
> case SOC15_IH_CLIENTID_UTCL2:
> - ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
> block = AMDGPU_RAS_BLOCK__GFX;
> - if (ret)
> - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> break;
> case SOC15_IH_CLIENTID_VMC:
> case SOC15_IH_CLIENTID_VMC1:
> - ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
> block = AMDGPU_RAS_BLOCK__MMHUB;
> - if (ret)
> - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> break;
> case SOC15_IH_CLIENTID_SDMA0:
> case SOC15_IH_CLIENTID_SDMA1:
> @@ -184,22 +180,15 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
> reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> break;
> default:
> - break;
> + dev_warn(dev->adev->dev,
> + "client %d does not support poison consumption\n",
> client_id);
> + return;
> }
>
> kfd_signal_poison_consumed_event(dev, pasid);
>
> - /* resetting queue passes, do page retirement without gpu reset
> - * resetting queue fails, fallback to gpu reset solution
> - */
> - if (!ret)
> - dev_warn(dev->adev->dev,
> - "RAS poison consumption, unmap queue flow
> succeeded: client id %d\n",
> - client_id);
> - else
> - dev_warn(dev->adev->dev,
> - "RAS poison consumption, fall back to gpu reset flow:
> client id %d\n",
> - client_id);
> + dev_warn(dev->adev->dev,
> + "poison is consumed by client %d, kick off gpu reset flow\n",
> +client_id);
>
> amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block,
> reset); }
> --
> 2.17.1
More information about the amd-gfx mailing list.