[PATCH 2/3] drm/amdgpu: add RAS poison consumption handler
Lazar, Lijo
lijo.lazar at amd.com
Wed Apr 20 04:33:14 UTC 2022
On 4/20/2022 9:23 AM, Tao Zhou wrote:
> Add support for general RAS poison consumption handler.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 ++++++++++++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
> 2 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2d066cff70ea..4bd3c25be809 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1515,6 +1515,44 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
> /* ras fs end */
>
> /* ih begin */
> +static void ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
> + struct amdgpu_iv_entry *entry)
> +{
> + bool poison_stat = true, need_reset = true;
> + struct amdgpu_device *adev = obj->adev;
> + struct ras_err_data err_data = {0, 0, 0, NULL};
> + struct ras_ih_data *data = &obj->ih_data;
> + struct amdgpu_ras_block_object *block_obj =
> + amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
> +
> + if (!adev->gmc.xgmi.connected_to_cpu)
> + amdgpu_umc_poison_handler(adev, &err_data, false);
> +
> + /* Two ways for RAS block's poison consumption implementation:
> + * 1. define IH callback, or
> + * 2. implement query and consumption interfaces.
> + */
This doesn't look appropriate. Better to have one standard way. What if
an IP has call back implemented to handle errors in non-poison mode?
> + if (data->cb) {
> + need_reset = !!data->cb(obj->adev, &err_data, entry);
> + } else if (block_obj && block_obj->hw_ops) {
> + if (block_obj->hw_ops->query_poison_status) {
> + poison_stat = block_obj->hw_ops->query_poison_status(adev);
> + if (!poison_stat)
> + dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
> + block_obj->ras_comm.name);
> + }
> +
> + if (poison_stat && block_obj->hw_ops->handle_poison_consumption) {
> + poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
> + need_reset = poison_stat;
> + }
> + }
> +
> + /* gpu reset is fallback for all failed cases */
> + if (need_reset)
> + amdgpu_ras_reset_gpu(adev);
> +}
> +
> static void ras_interrupt_poison_creation_handler(struct ras_manager *obj,
> struct amdgpu_iv_entry *entry)
> {
> @@ -1563,7 +1601,10 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
> data->rptr = (data->aligned_element_size +
> data->rptr) % data->ring_size;
>
> - ras_interrupt_poison_creation_handler(obj, &entry);
> + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> + ras_interrupt_poison_creation_handler(obj, &entry);
> + else
> + ras_interrupt_poison_consumption_handler(obj, &entry);
Same problem - poison mode is implicitly assumed. Poison consumption is
relevant only in poison mode.
Thanks,
Lijo
> }
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 606df8869b89..c4b61785ab5c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -509,6 +509,7 @@ struct amdgpu_ras_block_hw_ops {
> void (*reset_ras_error_count)(struct amdgpu_device *adev);
> void (*reset_ras_error_status)(struct amdgpu_device *adev);
> bool (*query_poison_status)(struct amdgpu_device *adev);
> + bool (*handle_poison_consumption)(struct amdgpu_device *adev);
> };
>
> /* work flow
>
More information about the amd-gfx
mailing list