[PATCH] drm/amdgpu/vcn: reset fw_shared when VCPU buffers corrupted on vcn v4.0.3

Tue Nov 19 12:22:18 UTC 2024

On 11/19/2024 4:27 PM, Xiang Liu wrote:
> In the case of RAS err_event_athub, the VCPU buffers are corrupted and

For a better description -

It is not necessarily corrupted. When there is a RAS fatal error, device
memory access is blocked. Hence the VCPU BO cannot be saved to system
memory before going for reset, as it is in a regular suspend sequence.
In other full device reset cases, it gets saved and restored during resume.

Thanks,
Lijo

> cannot be restored in amdgpu_vcn_resume(), the buffers are cleared to 0
> for good. However, the fw_shared stored in the buffers needs to be reset,
> or the firmware cannot work properly.
> 
> v2: Remove redundant code like vcn_v4_0 did
> v2: Refine commit message
> 
> Signed-off-by: Xiang Liu <xiang.liu at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 32 ++++++++++++++++++-------
>  1 file changed, 23 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> index d011e4678ca1..cf8264bf45c5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> @@ -123,6 +123,20 @@ static int vcn_v4_0_3_early_init(struct amdgpu_ip_block *ip_block)
>  	return amdgpu_vcn_early_init(adev);
>  }
>  
> +static int vcn_v4_0_3_fw_shared_init(struct amdgpu_device *adev, int inst_idx)
> +{
> +	volatile struct amdgpu_vcn4_fw_shared *fw_shared;
> +
> +	fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
> +	fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
> +	fw_shared->sq.is_enabled = 1;
> +
> +	if (amdgpu_vcnfw_log)
> +		amdgpu_vcn_fwlog_init(&adev->vcn.inst[inst_idx]);
> +
> +	return 0;
> +}
> +
>  /**
>   * vcn_v4_0_3_sw_init - sw init for VCN block
>   *
> @@ -155,8 +169,6 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>  		return r;
>  
>  	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
> -		volatile struct amdgpu_vcn4_fw_shared *fw_shared;
> -
>  		vcn_inst = GET_INST(VCN, i);
>  
>  		ring = &adev->vcn.inst[i].ring_enc[0];
> @@ -179,12 +191,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>  		if (r)
>  			return r;
>  
> -		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
> -		fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
> -		fw_shared->sq.is_enabled = true;
> -
> -		if (amdgpu_vcnfw_log)
> -			amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
> +		vcn_v4_0_3_fw_shared_init(adev, i);
>  	}
>  
>  	if (amdgpu_sriov_vf(adev)) {
> @@ -234,7 +241,7 @@ static int vcn_v4_0_3_sw_fini(struct amdgpu_ip_block *ip_block)
>  
>  			fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
>  			fw_shared->present_flag_0 = 0;
> -			fw_shared->sq.is_enabled = cpu_to_le32(false);
> +			fw_shared->sq.is_enabled = 0;
>  		}
>  		drm_dev_exit(idx);
>  	}
> @@ -280,6 +287,8 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block)
>  		}
>  	} else {
>  		for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> +			volatile struct amdgpu_vcn4_fw_shared *fw_shared;
> +
>  			vcn_inst = GET_INST(VCN, i);
>  			ring = &adev->vcn.inst[i].ring_enc[0];
>  
> @@ -303,6 +312,11 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block)
>  					regVCN_RB1_DB_CTRL);
>  			}
>  
> +			/* Re-init fw_shared when RAS err_event_athub corrupt the VCPU buffers */
> +			fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
> +			if (!fw_shared->sq.is_enabled)
> +				vcn_v4_0_3_fw_shared_init(adev, i);
> +
>  			r = amdgpu_ring_test_helper(ring);
>  			if (r)
>  				return r;