[PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
Paul Menzel
pmenzel at molgen.mpg.de
Mon Mar 28 07:52:06 UTC 2022
Dear Mohammad, dear Tao,
Tao, it’s hard to find your reply in the quote, as the message is not
quoted correctly (> prepended). Is it possible to use a different email
client?
Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
> -----Original Message-----
> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya at amd.com>
> Sent: Monday, March 28, 2022 2:25 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Lazar, Lijo <Lijo.Lazar at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Ziya, Mohammad zafar <Mohammadzafar.Ziya at amd.com>
> Subject: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
>
> RAS error query support addition for VCN 2.6
>
> V2: removed unused option and corrected comment format Moved the register definition under header file
Please wrap lines after 75 characters. (`scripts/checkpatch.pl` should
have warned you about that).
> V3: poison query status check added.
> Removed error query interface
>
> V4: MMSCH poison check option removed, return true/false refactored.
>
> Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya at amd.com>
> Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
> drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 71 +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h | 6 +++
> 3 files changed, 78 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 1e1a3b736859..606df8869b89 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -508,6 +508,7 @@ struct amdgpu_ras_block_hw_ops {
> void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
> void (*reset_ras_error_count)(struct amdgpu_device *adev);
> void (*reset_ras_error_status)(struct amdgpu_device *adev);
> + bool (*query_poison_status)(struct amdgpu_device *adev);
> };
>
> /* work flow
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> index 1869bae4104b..3988fc647741 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> @@ -31,6 +31,7 @@
> #include "soc15d.h"
> #include "vcn_v2_0.h"
> #include "mmsch_v1_0.h"
> +#include "vcn_v2_5.h"
>
> #include "vcn/vcn_2_5_offset.h"
> #include "vcn/vcn_2_5_sh_mask.h"
> @@ -59,6 +60,7 @@ static int vcn_v2_5_set_powergating_state(void *handle, static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device *adev,
> int inst_idx, struct dpg_pause_state *new_state); static int vcn_v2_5_sriov_start(struct amdgpu_device *adev);
> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev);
>
> static int amdgpu_ih_clientid_vcns[] = {
> SOC15_IH_CLIENTID_VCN,
> @@ -100,6 +102,7 @@ static int vcn_v2_5_early_init(void *handle)
> vcn_v2_5_set_dec_ring_funcs(adev);
> vcn_v2_5_set_enc_ring_funcs(adev);
> vcn_v2_5_set_irq_funcs(adev);
> + vcn_v2_5_set_ras_funcs(adev);
>
> return 0;
> }
> @@ -1930,3 +1933,71 @@ const struct amdgpu_ip_block_version vcn_v2_6_ip_block =
> .rev = 0,
> .funcs = &vcn_v2_6_ip_funcs,
> };
> +
> +static uint32_t vcn_v2_6_query_poison_by_instance(struct amdgpu_device *adev,
> + uint32_t instance, uint32_t sub_block) {
> + uint32_t poison_stat = 0, reg_value = 0;
> +
> + switch (sub_block) {
> + case AMDGPU_VCN_V2_6_VCPU_VCODEC:
> + reg_value = RREG32_SOC15(VCN, instance, mmUVD_RAS_VCPU_VCODEC_STATUS);
> + poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
> + break;
> + default:
> + break;
> + };
> +
> + if (poison_stat)
> + dev_info(adev->dev, "Poison detected in VCN%d, sub_block%d\n",
> + instance, sub_block);
What should a user do with that information? Faulty hardware, …?
> +
> + return poison_stat;
> +}
> +
> +static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev) {
> + uint32_t inst, sub;
> + uint32_t poison_stat = 0;
> +
> + for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
> + for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK; sub++)
> + poison_stat +=
> + vcn_v2_6_query_poison_by_instance(adev, inst, sub);
> +
> + return poison_stat ? true : false;
>
> [Tao] just want to confirm the logic, if the POISONED_PF of one instance is 1 and another is 0, the poison_stat is true, correct?
> Can we add a print message for this case? Same question to JPEG.
Is the `dev_info` message in `vcn_v2_6_query_poison_by_instance()` doing
what you want?
Also, instead of `poison_stat ? true : false;` a common pattern is
`!!poison_stat` I believe.
Kind regards,
Paul
> +}
> +
> +const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = {
> + .query_poison_status = vcn_v2_6_query_poison_status, };
> +
> +static struct amdgpu_vcn_ras vcn_v2_6_ras = {
> + .ras_block = {
> + .hw_ops = &vcn_v2_6_ras_hw_ops,
> + },
> +};
> +
> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev) {
> + switch (adev->ip_versions[VCN_HWIP][0]) {
> + case IP_VERSION(2, 6, 0):
> + adev->vcn.ras = &vcn_v2_6_ras;
> + break;
> + default:
> + break;
> + }
> +
> + if (adev->vcn.ras) {
> + amdgpu_ras_register_ras_block(adev, &adev->vcn.ras->ras_block);
> +
> + strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
> + adev->vcn.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
> + adev->vcn.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
> + adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
> +
> + /* If don't define special ras_late_init function, use default ras_late_init */
> + if (!adev->vcn.ras->ras_block.ras_late_init)
> + adev->vcn.ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
> + }
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
> index e72f799ed0fd..1c19af74e4fd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
> @@ -24,6 +24,12 @@
> #ifndef __VCN_V2_5_H__
> #define __VCN_V2_5_H__
>
> +enum amdgpu_vcn_v2_6_sub_block {
> + AMDGPU_VCN_V2_6_VCPU_VCODEC = 0,
> +
> + AMDGPU_VCN_V2_6_MAX_SUB_BLOCK,
> +};
> +
> extern const struct amdgpu_ip_block_version vcn_v2_5_ip_block; extern const struct amdgpu_ip_block_version vcn_v2_6_ip_block;
>
> --
> 2.25.1
More information about the amd-gfx
mailing list