[PATCH Review V2 1/1] drm/amdgpu: Remove redundant poison consumption handler function

Zhou1, Tao Tao.Zhou1 at amd.com
Tue Jun 20 07:57:40 UTC 2023


[AMD Official Use Only - General]

> -----Original Message-----
> From: Stanley.Yang <Stanley.Yang at amd.com>
> Sent: Monday, June 19, 2023 9:50 PM
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>;
> Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
> Cc: Yang, Stanley <Stanley.Yang at amd.com>
> Subject: [PATCH Review V2 1/1] drm/amdgpu: Remove redundant poison
> consumption handler function
>
> The function callback handle_poison_consumption and callback function
> poison_consumption_handler are almost same to handle poison consumption,
> remove poison_consumption_handler.
>
> Changed from V1:
>       Add handle poison consumption function for VCN2.6, VCN4.0,
>       JPEG2.6 and JPEG4.0, return false when handle VCN/JPEGP poison
>       consumption.
>
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  9 ---------
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  4 ----
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  8 +++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  3 ++-
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 12 +++++++++---
>  drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c   |  9 +++++++++
>  drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c   |  9 +++++++++
>  drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c    |  9 +++++++++
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c    |  9 +++++++++
>  9 files changed, 50 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a33d4bc34cee..c15dbdb2e0f9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -840,15 +840,6 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device
> *adev)
>       return 0;
>  }
>
> -int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
> -                                             struct amdgpu_iv_entry *entry)
> -{
> -     if (adev->gfx.ras && adev->gfx.ras->poison_consumption_handler)
> -             return adev->gfx.ras->poison_consumption_handler(adev,
> entry);
> -
> -     return 0;
> -}
> -
>  int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
>               void *err_data,
>               struct amdgpu_iv_entry *entry)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index d0c3f2955821..95b80bc8cdb9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -273,8 +273,6 @@ struct amdgpu_gfx_ras {
>       int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
>                               struct amdgpu_irq_src *source,
>                               struct amdgpu_iv_entry *entry);
> -     int (*poison_consumption_handler)(struct amdgpu_device *adev,
> -                                             struct amdgpu_iv_entry
> *entry);
>  };
>
>  struct amdgpu_gfx_shadow_info {
> @@ -538,8 +536,6 @@ int amdgpu_gfx_get_num_kcq(struct amdgpu_device
> *adev);  void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev,
> uint32_t ucode_id);
>
>  int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev); -int
> amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
> -                                             struct amdgpu_iv_entry
> *entry);
>
>  bool amdgpu_gfx_is_master_xcc(struct amdgpu_device *adev, int xcc_id);  int
> amdgpu_gfx_sysfs_init(struct amdgpu_device *adev); diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5b6525d8dace..9ce7c7537751 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1668,7 +1668,7 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct
> amdgpu_device *adev)  static void
> amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
>                               struct amdgpu_iv_entry *entry)
>  {
> -     bool poison_stat = false;
> +     bool poison_stat = true;
>       struct amdgpu_device *adev = obj->adev;
>       struct amdgpu_ras_block_object *block_obj =
>               amdgpu_ras_get_ras_block(adev, obj->head.block, 0); @@ -
> 1694,15 +1694,13 @@ static void
> amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
>       amdgpu_umc_poison_handler(adev, false);
>
>       if (block_obj->hw_ops && block_obj->hw_ops-
> >handle_poison_consumption)
> -             poison_stat = block_obj->hw_ops-
> >handle_poison_consumption(adev);
> +             poison_stat = block_obj->hw_ops-
> >handle_poison_consumption(adev,
> +entry);

[Tao] !block_obj->hw_ops->handle_poison_consumption is allowed, we can add the following code to avoid adding handle_poison_consumption for vcn and jpeg.

else
       poison_stat = false;

For the change in GFX, it's better for Thomas to take a look.

>
>       /* gpu reset is fallback for failed and default cases */
> -     if (poison_stat) {
> +     if (poison_stat != true) {
>               dev_info(adev->dev, "GPU reset for %s RAS poison consumption
> is issued!\n",
>                               block_obj->ras_comm.name);
>               amdgpu_ras_reset_gpu(adev);
> -     } else {
> -             amdgpu_gfx_poison_consumption_handler(adev, entry);
>       }
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 46bf1889a9d7..03f3b3774b85 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -564,7 +564,8 @@ struct amdgpu_ras_block_hw_ops {
>       void (*reset_ras_error_count)(struct amdgpu_device *adev);
>       void (*reset_ras_error_status)(struct amdgpu_device *adev);
>       bool (*query_poison_status)(struct amdgpu_device *adev);
> -     bool (*handle_poison_consumption)(struct amdgpu_device *adev);
> +     bool (*handle_poison_consumption)(struct amdgpu_device *adev,
> +                     struct amdgpu_iv_entry *entry);
>  };
>
>  /* work flow
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
> index 26d6286d86c9..5b7eac547a05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
> @@ -78,7 +78,7 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct
> amdgpu_device *adev,
>       return 0;
>  }
>
> -static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device
> *adev,
> +static bool gfx_v11_0_3_handle_poison_consumption(struct amdgpu_device
> +*adev,
>                                       struct amdgpu_iv_entry *entry)
>  {
>       /* Workaround: when vmid and pasid are both zero, trigger gpu reset in
> KGD. */ @@ -99,10 +99,16 @@ static int
> gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
>               amdgpu_ras_reset_gpu(adev);
>       }
>
> -     return 0;
> +     return true;
>  }
>
> +struct amdgpu_ras_block_hw_ops gfx_v11_0_3_ras_ops = {
> +     .handle_poison_consumption =
> gfx_v11_0_3_handle_poison_consumption,
> +};
> +
>  struct amdgpu_gfx_ras gfx_v11_0_3_ras = {
> +     .ras_block = {
> +             .hw_ops = &gfx_v11_0_3_ras_ops,
> +     },
>       .rlc_gc_fed_irq = gfx_v11_0_3_rlc_gc_fed_irq,
> -     .poison_consumption_handler =
> gfx_v11_0_3_poison_consumption_handler,
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
> b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
> index aadb74de52bc..3d60610a9c70 100644
> --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
> +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
> @@ -809,8 +809,17 @@ static bool jpeg_v2_6_query_ras_poison_status(struct
> amdgpu_device *adev)
>       return !!poison_stat;
>  }
>
> +static bool jpeg_v2_6_handle_poison_consumption(struct amdgpu_device
> *adev,
> +                                             struct amdgpu_iv_entry *entry)
> +{
> +     /* TODO: reset IP engine instead of reset GPU */
> +
> +     return false;
> +}
> +
>  const struct amdgpu_ras_block_hw_ops jpeg_v2_6_ras_hw_ops = {
>       .query_poison_status = jpeg_v2_6_query_ras_poison_status,
> +     .handle_poison_consumption = jpeg_v2_6_handle_poison_consumption,
>  };
>
>  static struct amdgpu_jpeg_ras jpeg_v2_6_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
> b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
> index a707d407fbd0..328e3de14e35 100644
> --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
> @@ -818,8 +818,17 @@ static bool jpeg_v4_0_query_ras_poison_status(struct
> amdgpu_device *adev)
>       return !!poison_stat;
>  }
>
> +static bool jpeg_v4_0_handle_poison_consumption(struct amdgpu_device
> *adev,
> +                                             struct amdgpu_iv_entry *entry)
> +{
> +     /* TODO: reset IP engine instead of reset GPU */
> +
> +     return false;
> +}
> +
>  const struct amdgpu_ras_block_hw_ops jpeg_v4_0_ras_hw_ops = {
>       .query_poison_status = jpeg_v4_0_query_ras_poison_status,
> +     .handle_poison_consumption = jpeg_v4_0_handle_poison_consumption,
>  };
>
>  static struct amdgpu_jpeg_ras jpeg_v4_0_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> index bb1875f926f1..e6dbea441caa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> @@ -1974,8 +1974,17 @@ static bool vcn_v2_6_query_poison_status(struct
> amdgpu_device *adev)
>       return !!poison_stat;
>  }
>
> +static bool vcn_v2_6_handle_poison_consumption(struct amdgpu_device *adev,
> +                                             struct amdgpu_iv_entry *entry)
> +{
> +     /* TODO: reset IP engine instead of reset GPU */
> +
> +     return false;
> +}
> +
>  const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = {
>       .query_poison_status = vcn_v2_6_query_poison_status,
> +     .handle_poison_consumption = vcn_v2_6_handle_poison_consumption,
>  };
>
>  static struct amdgpu_vcn_ras vcn_v2_6_ras = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> index b48bb5212488..eb8625a6df92 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> @@ -2134,8 +2134,17 @@ static bool vcn_v4_0_query_ras_poison_status(struct
> amdgpu_device *adev)
>       return !!poison_stat;
>  }
>
> +static bool vcn_v4_0_handle_poison_consumption(struct amdgpu_device *adev,
> +                                             struct amdgpu_iv_entry *entry)
> +{
> +     /* TODO: reset IP engine instead of reset GPU */
> +
> +     return false;
> +}
> +
>  const struct amdgpu_ras_block_hw_ops vcn_v4_0_ras_hw_ops = {
>       .query_poison_status = vcn_v4_0_query_ras_poison_status,
> +     .handle_poison_consumption = vcn_v4_0_handle_poison_consumption,
>  };
>
>  static struct amdgpu_vcn_ras vcn_v4_0_ras = {
> --
> 2.25.1



More information about the amd-gfx mailing list