[PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support

Ziya, Mohammad zafar Mohammadzafar.Ziya at amd.com
Mon Mar 28 08:00:56 UTC 2022


[AMD Official Use Only]

Dear Paul and Tao,

Comments are added inline.

Regards,
Zafar

>-----Original Message-----
>From: Paul Menzel <pmenzel at molgen.mpg.de>
>Sent: Monday, March 28, 2022 1:22 PM
>To: Zhou1, Tao <Tao.Zhou1 at amd.com>; Ziya, Mohammad zafar
><Mohammadzafar.Ziya at amd.com>
>Cc: amd-gfx at lists.freedesktop.org; Lazar, Lijo <Lijo.Lazar at amd.com>; Zhang,
>Hawking <Hawking.Zhang at amd.com>
>Subject: Re: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
>
>Dear Mohammad, dear Tao,
>
>
>Tao, it’s hard to find your reply in the quote, as the message is not quoted
>correctly (> prepended). Is it possible to use a different email client?
>
>
>Am 28.03.22 um 09:43 schrieb Zhou1, Tao:
>> -----Original Message-----
>> From: Ziya, Mohammad zafar <Mohammadzafar.Ziya at amd.com>
>> Sent: Monday, March 28, 2022 2:25 PM
>> To: amd-gfx at lists.freedesktop.org
>> Cc: Lazar, Lijo <Lijo.Lazar at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>;
>> Zhang, Hawking <Hawking.Zhang at amd.com>; Ziya, Mohammad zafar
>> <Mohammadzafar.Ziya at amd.com>
>> Subject: [PATCH v4 5/6] drm/amdgpu/vcn: VCN ras error query support
>>
>> RAS error query support addition for VCN 2.6
>>
>> V2: removed unused option and corrected comment format Moved the
>> register definition under header file
>
>Please wrap lines after 75 characters. (`scripts/checkpatch.pl` should have
>warned you about that).
>
>> V3: poison query status check added.
>> Removed error query interface
>>
>> V4: MMSCH poison check option removed, return true/false refactored.
>>
>> Signed-off-by: Mohammad Zafar Ziya <Mohammadzafar.ziya at amd.com>
>> Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 71
>+++++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h   |  6 +++
>>   3 files changed, 78 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 1e1a3b736859..606df8869b89 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -508,6 +508,7 @@ struct amdgpu_ras_block_hw_ops {
>>   	void (*query_ras_error_address)(struct amdgpu_device *adev, void
>*ras_error_status);
>>   	void (*reset_ras_error_count)(struct amdgpu_device *adev);
>>   	void (*reset_ras_error_status)(struct amdgpu_device *adev);
>> +	bool (*query_poison_status)(struct amdgpu_device *adev);
>>   };
>>
>>   /* work flow
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> index 1869bae4104b..3988fc647741 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
>> @@ -31,6 +31,7 @@
>>   #include "soc15d.h"
>>   #include "vcn_v2_0.h"
>>   #include "mmsch_v1_0.h"
>> +#include "vcn_v2_5.h"
>>
>>   #include "vcn/vcn_2_5_offset.h"
>>   #include "vcn/vcn_2_5_sh_mask.h"
>> @@ -59,6 +60,7 @@ static int vcn_v2_5_set_powergating_state(void
>*handle,  static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device
>*adev,
>>   				int inst_idx, struct dpg_pause_state
>*new_state);  static int
>> vcn_v2_5_sriov_start(struct amdgpu_device *adev);
>> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev);
>>
>>   static int amdgpu_ih_clientid_vcns[] = {
>>   	SOC15_IH_CLIENTID_VCN,
>> @@ -100,6 +102,7 @@ static int vcn_v2_5_early_init(void *handle)
>>   	vcn_v2_5_set_dec_ring_funcs(adev);
>>   	vcn_v2_5_set_enc_ring_funcs(adev);
>>   	vcn_v2_5_set_irq_funcs(adev);
>> +	vcn_v2_5_set_ras_funcs(adev);
>>
>>   	return 0;
>>   }
>> @@ -1930,3 +1933,71 @@ const struct amdgpu_ip_block_version
>vcn_v2_6_ip_block =
>>   		.rev = 0,
>>   		.funcs = &vcn_v2_6_ip_funcs,
>>   };
>> +
>> +static uint32_t vcn_v2_6_query_poison_by_instance(struct
>amdgpu_device *adev,
>> +			uint32_t instance, uint32_t sub_block) {
>> +	uint32_t poison_stat = 0, reg_value = 0;
>> +
>> +	switch (sub_block) {
>> +	case AMDGPU_VCN_V2_6_VCPU_VCODEC:
>> +		reg_value = RREG32_SOC15(VCN, instance,
>mmUVD_RAS_VCPU_VCODEC_STATUS);
>> +		poison_stat = REG_GET_FIELD(reg_value,
>UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF);
>> +		break;
>> +	default:
>> +		break;
>> +	};
>> +
>> +	if (poison_stat)
>> +		dev_info(adev->dev, "Poison detected in VCN%d,
>sub_block%d\n",
>> +			instance, sub_block);
>
>What should a user do with that information? Faulty hardware, …?

[Mohammad]: This message will help to identify the faulty hardware, the hardware ID will also log along with poison, help to identify among multiple hardware installed on the system.

>
>> +
>> +	return poison_stat;
>> +}
>> +
>> +static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev) {
>> +	uint32_t inst, sub;
>> +	uint32_t poison_stat = 0;
>> +
>> +	for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++)
>> +		for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK;
>sub++)
>> +			poison_stat +=
>> +			vcn_v2_6_query_poison_by_instance(adev, inst,
>sub);
>> +
>> +	return poison_stat ? true : false;
>>
>> [Tao] just want to confirm the logic, if the POISONED_PF of one instance is 1
>and another is 0, the poison_stat is true, correct?
>> Can we add a print message for this case? Same question to JPEG.

[Mohammad]: Message will be printed on function block ahead of the function.

>
>Is the `dev_info` message in `vcn_v2_6_query_poison_by_instance()` doing
>what you want?
>
>Also, instead of `poison_stat ? true : false;` a common pattern is
>`!!poison_stat` I believe.
>
>
>Kind regards,
>
>Paul

[Mohammad]: Noted the change. Will make to return !!poison_stat ? true : false;

>
>
>> +}
>> +
>> +const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = {
>> +	.query_poison_status = vcn_v2_6_query_poison_status, };
>> +
>> +static struct amdgpu_vcn_ras vcn_v2_6_ras = {
>> +	.ras_block = {
>> +		.hw_ops = &vcn_v2_6_ras_hw_ops,
>> +	},
>> +};
>> +
>> +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev) {
>> +	switch (adev->ip_versions[VCN_HWIP][0]) {
>> +	case IP_VERSION(2, 6, 0):
>> +		adev->vcn.ras = &vcn_v2_6_ras;
>> +		break;
>> +	default:
>> +		break;
>> +	}
>> +
>> +	if (adev->vcn.ras) {
>> +		amdgpu_ras_register_ras_block(adev, &adev->vcn.ras-
>>ras_block);
>> +
>> +		strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
>> +		adev->vcn.ras->ras_block.ras_comm.block =
>AMDGPU_RAS_BLOCK__VCN;
>> +		adev->vcn.ras->ras_block.ras_comm.type =
>AMDGPU_RAS_ERROR__POISON;
>> +		adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
>> +
>> +		/* If don't define special ras_late_init function, use default
>ras_late_init */
>> +		if (!adev->vcn.ras->ras_block.ras_late_init)
>> +			adev->vcn.ras->ras_block.ras_late_init =
>amdgpu_ras_block_late_init;
>> +	}
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>> index e72f799ed0fd..1c19af74e4fd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h
>> @@ -24,6 +24,12 @@
>>   #ifndef __VCN_V2_5_H__
>>   #define __VCN_V2_5_H__
>>
>> +enum amdgpu_vcn_v2_6_sub_block {
>> +	AMDGPU_VCN_V2_6_VCPU_VCODEC = 0,
>> +
>> +	AMDGPU_VCN_V2_6_MAX_SUB_BLOCK,
>> +};
>> +
>>   extern const struct amdgpu_ip_block_version vcn_v2_5_ip_block;  extern
>const struct amdgpu_ip_block_version vcn_v2_6_ip_block;
>>
>> --
>> 2.25.1


More information about the amd-gfx mailing list