[PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support

Mon Mar 28 07:52:06 UTC 2022

Dear Mohammad, dear Tao,

Tao, it’s hard to find your reply in the quote, as the message is not 
quoted correctly (> prepended). Is it possible to use a different email 
client?

Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
> -----Original Message-----
> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya at amd.com>
> Sent: Monday, March 28, 2022 2:25 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Lazar, Lijo <Lijo.Lazar at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Ziya, Mohammad zafar <Mohammadzafar.Ziya at amd.com>
> Subject: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
> 
> RAS error query support addition for VCN 2.6
> 
> V2: removed unused option and corrected comment format Moved the register definition under header file

Please wrap lines after 75 characters. (`scripts/checkpatch.pl` should 
have warned you about that).

> V3: poison query status check added.
> Removed error query interface
> 
> V4: MMSCH poison check option removed, return true/false refactored.
> 
> Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya at amd.com>
> Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 71 +++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h   |  6 +++
>   3 files changed, 78 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 1e1a3b736859..606df8869b89 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -508,6 +508,7 @@ struct amdgpu_ras_block_hw_ops {
>   	void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
>   	void (*reset_ras_error_count)(struct amdgpu_device *adev);
>   	void (*reset_ras_error_status)(struct amdgpu_device *adev);
> +	bool (*query_poison_status)(struct amdgpu_device *adev);
>   };
>   
>   /* work flow
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> index 1869bae4104b..3988fc647741 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
> @@ -31,6 +31,7 @@
>   #include "soc15d.h"
>   #include "vcn_v2_0.h"
>   #include "mmsch_v1_0.h"
> +#include "vcn_v2_5.h"
>   
>   #include "vcn/vcn_2_5_offset.h"
>   #include "vcn/vcn_2_5_sh_mask.h"
> @@ -59,6 +60,7 @@ static int vcn_v2_5_set_powergating_state(void *handle,  static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device *adev,
>   				int inst_idx, struct dpg_pause_state *new_state);  static int vcn_v2_5_sriov_start(struct amdgpu_device *adev);
> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev);
>   
>   static int amdgpu_ih_clientid_vcns[] = {
>   	SOC15_IH_CLIENTID_VCN,
> @@ -100,6 +102,7 @@ static int vcn_v2_5_early_init(void *handle)
>   	vcn_v2_5_set_dec_ring_funcs(adev);
>   	vcn_v2_5_set_enc_ring_funcs(adev);
>   	vcn_v2_5_set_irq_funcs(adev);
> +	vcn_v2_5_set_ras_funcs(adev);
>   
>   	return 0;
>   }
> @@ -1930,3 +1933,71 @@ const struct amdgpu_ip_block_version vcn_v2_6_ip_block =
>   		.rev = 0,
>   		.funcs = &vcn_v2_6_ip_funcs,
>   };
> +
> +static uint32_t vcn_v2_6_query_poison_by_instance(struct amdgpu_device *adev,
> +			uint32_t instance, uint32_t sub_block) {
> +	uint32_t poison_stat = 0, reg_value = 0;
> +
> +	switch (sub_block) {
> +	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
> +		reg_value = RREG32_SOC15(VCN, instance, mmUVD_RAS_VCPU_VCODEC_STATUS);
> +		poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
> +		break;
> +	default:
> +		break;
> +	};
> +
> +	if (poison_stat)
> +		dev_info(adev->dev, "Poison detected in VCN%d, sub_block%d\n",
> +			instance, sub_block);

What should a user do with that information? Faulty hardware, …?

> +
> +	return poison_stat;
> +}
> +
> +static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev) {
> +	uint32_t inst, sub;
> +	uint32_t poison_stat = 0;
> +
> +	for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
> +		for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK; sub++)
> +			poison_stat +=
> +			vcn_v2_6_query_poison_by_instance(adev, inst, sub);
> +
> +	return poison_stat ? true : false;
> 
> [Tao] just want to confirm the logic, if the POISONED_PF of one instance is 1 and another is 0, the poison_stat is true, correct?
> Can we add a print message for this case? Same question to JPEG.

Is the `dev_info` message in `vcn_v2_6_query_poison_by_instance()` doing 
what you want?

Also, instead of `poison_stat ? true : false;` a common pattern is 
`!!poison_stat` I believe.

Kind regards,

Paul

> +}
> +
> +const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = {
> +	.query_poison_status = vcn_v2_6_query_poison_status, };
> +
> +static struct amdgpu_vcn_ras vcn_v2_6_ras = {
> +	.ras_block = {
> +		.hw_ops = &vcn_v2_6_ras_hw_ops,
> +	},
> +};
> +
> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev) {
> +	switch (adev->ip_versions[VCN_HWIP][0]) {
> +	case IP_VERSION(2, 6, 0):
> +		adev->vcn.ras = &vcn_v2_6_ras;
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	if (adev->vcn.ras) {
> +		amdgpu_ras_register_ras_block(adev, &adev->vcn.ras->ras_block);
> +
> +		strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
> +		adev->vcn.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
> +		adev->vcn.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
> +		adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
> +
> +		/* If don't define special ras_late_init function, use default ras_late_init */
> +		if (!adev->vcn.ras->ras_block.ras_late_init)
> +			adev->vcn.ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
> +	}
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
> index e72f799ed0fd..1c19af74e4fd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
> @@ -24,6 +24,12 @@
>   #ifndef __VCN_V2_5_H__
>   #define __VCN_V2_5_H__
>   
> +enum amdgpu_vcn_v2_6_sub_block {
> +	AMDGPU_VCN_V2_6_VCPU_VCODEC = 0,
> +
> +	AMDGPU_VCN_V2_6_MAX_SUB_BLOCK,
> +};
> +
>   extern const struct amdgpu_ip_block_version vcn_v2_5_ip_block;  extern const struct amdgpu_ip_block_version vcn_v2_6_ip_block;
>   
> --
> 2.25.1