[PATCH] drm/amdgpu/vcn: reset fw_shared when VCPU buffers corrupted on vcn v4.0.3
Christian König
christian.koenig at amd.com
Wed Nov 20 12:48:29 UTC 2024
Am 20.11.24 um 13:34 schrieb Xiang Liu:
> It is not necessarily corrupted. When there is a RAS fatal error, device
> memory access is blocked. Hence the VCPU BO cannot be saved to system memory
> as in a regular suspend sequence before going for reset. In other full
> device reset cases, the VCPU BO gets saved and restored during resume.
>
> v2: Remove redundant code like vcn_v4_0 did
> v2: Refine commit message
> v3: Drop the volatile
> v3: Refine commit message
>
> Signed-off-by: Xiang Liu <xiang.liu at amd.com>
Acked-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 30 ++++++++++++++++++-------
> 1 file changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> index d011e4678ca1..c678631c6887 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> @@ -123,6 +123,20 @@ static int vcn_v4_0_3_early_init(struct amdgpu_ip_block *ip_block)
> return amdgpu_vcn_early_init(adev);
> }
>
> +static int vcn_v4_0_3_fw_shared_init(struct amdgpu_device *adev, int inst_idx)
> +{
> + struct amdgpu_vcn4_fw_shared *fw_shared;
> +
> + fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
> + fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
> + fw_shared->sq.is_enabled = 1;
> +
> + if (amdgpu_vcnfw_log)
> + amdgpu_vcn_fwlog_init(&adev->vcn.inst[inst_idx]);
> +
> + return 0;
> +}
> +
> /**
> * vcn_v4_0_3_sw_init - sw init for VCN block
> *
> @@ -155,8 +169,6 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
> return r;
>
> for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
> - volatile struct amdgpu_vcn4_fw_shared *fw_shared;
> -
> vcn_inst = GET_INST(VCN, i);
>
> ring = &adev->vcn.inst[i].ring_enc[0];
> @@ -179,12 +191,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> - fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
> - fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
> - fw_shared->sq.is_enabled = true;
> -
> - if (amdgpu_vcnfw_log)
> - amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
> + vcn_v4_0_3_fw_shared_init(adev, i);
> }
>
> if (amdgpu_sriov_vf(adev)) {
> @@ -280,6 +287,8 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block)
> }
> } else {
> for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> + struct amdgpu_vcn4_fw_shared *fw_shared;
> +
> vcn_inst = GET_INST(VCN, i);
> ring = &adev->vcn.inst[i].ring_enc[0];
>
> @@ -303,6 +312,11 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block)
> regVCN_RB1_DB_CTRL);
> }
>
> + /* Re-init fw_shared when RAS fatal error occurred */
> + fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
> + if (!fw_shared->sq.is_enabled)
> + vcn_v4_0_3_fw_shared_init(adev, i);
> +
> r = amdgpu_ring_test_helper(ring);
> if (r)
> return r;
More information about the amd-gfx
mailing list