[PATCH] drm/amd/amdgpu: Clean up VCN fw_shared and set flag bits during hw_init
Bokun Zhang
bokun.zhang at amd.com
Tue Nov 28 21:25:17 UTC 2023
- After a whole GPU reset (WGR), the VCN fw_shared data is cleared.
  This is new behavior, and it is the safer one, since we do not
  want to keep residual data around.
- However, the existing driver code still assumes that the residual
  data survives. For example, sw_init sets the UNIFIED_QUEUE flag
  bit, and this flag bit is never set anywhere else. When a WGR
  happens, sw_init is not called again, so the bit is lost, which
  leads to a crash in the VCN firmware.
- The proper fix is to explicitly set the flag bits in hw_init every
  time, so the data is repopulated after a WGR rather than assumed
  to have survived it. (A standalone sketch of this lifecycle
  follows the '---' separator below.)
Signed-off-by: Bokun Zhang <bokun.zhang at amd.com>
Change-Id: I783ff826f12e12eaf48d046ff9f1cef65558c7b9
---
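Note for reviewers: below is a minimal standalone C sketch of the
lifecycle problem this patch fixes, not driver code. Every name in it
(fake_fw_shared, whole_gpu_reset, the flag value) is hypothetical. It
only models the two facts that matter: sw_init runs once at driver
load, while hw_init also runs after every WGR, so any flag written
only in sw_init is lost once the reset clears the shared buffer.

/* Hypothetical model of the sw_init/hw_init lifecycle around a WGR. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define UNIFIED_QUEUE_FLAG (1u << 0)	/* stand-in value, not the real bit */

struct fake_fw_shared {
	uint32_t present_flag_0;
};

static struct fake_fw_shared shared;

/* Runs once, at driver load only. */
static void sw_init(void)
{
	memset(&shared, 0, sizeof(shared));
}

/* Runs at load AND again after every whole GPU reset. */
static void hw_init(void)
{
	/* Repopulating the flag here means a reset cannot lose it. */
	shared.present_flag_0 |= UNIFIED_QUEUE_FLAG;
}

/* A WGR clears the shared buffer, then re-runs only hw_init. */
static void whole_gpu_reset(void)
{
	memset(&shared, 0, sizeof(shared));
	hw_init();
}

int main(void)
{
	sw_init();
	hw_init();
	printf("after load:  flags=%#x\n", (unsigned)shared.present_flag_0);
	whole_gpu_reset();
	printf("after reset: flags=%#x\n", (unsigned)shared.present_flag_0);
	return 0;
}

Both printf lines report flags=0x1. With the flag set in sw_init
instead (the old placement), the second line would report flags=0,
which is the state the VCN firmware crashed on.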
drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 45 +++++++++++++++++----------
1 file changed, 28 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index bf07aa200030..0cf3e639c480 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -111,6 +111,7 @@ static int vcn_v4_0_sw_init(void *handle)
 {
 	struct amdgpu_ring *ring;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	volatile struct amdgpu_vcn4_fw_shared *fw_shared;
 	int i, r;
 
 	r = amdgpu_vcn_sw_init(adev);
@@ -124,11 +125,12 @@ static int vcn_v4_0_sw_init(void *handle)
 		return r;
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
-		volatile struct amdgpu_vcn4_fw_shared *fw_shared;
-
 		if (adev->vcn.harvest_config & (1 << i))
 			continue;
 
+		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
+		memset(fw_shared, 0, sizeof(struct amdgpu_vcn4_fw_shared));
+
 		/* Init instance 0 sched_score to 1, so it's scheduled after other instances */
 		if (i == 0)
 			atomic_set(&adev->vcn.inst[i].sched_score, 1);
@@ -161,21 +163,6 @@ static int vcn_v4_0_sw_init(void *handle)
 		if (r)
 			return r;
 
-		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
-		fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
-		fw_shared->sq.is_enabled = 1;
-
-		fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_SMU_DPM_INTERFACE_FLAG);
-		fw_shared->smu_dpm_interface.smu_interface_type = (adev->flags & AMD_IS_APU) ?
-			AMDGPU_VCN_SMU_DPM_INTERFACE_APU : AMDGPU_VCN_SMU_DPM_INTERFACE_DGPU;
-
-		if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
-		    IP_VERSION(4, 0, 2)) {
-			fw_shared->present_flag_0 |= AMDGPU_FW_SHARED_FLAG_0_DRM_KEY_INJECT;
-			fw_shared->drm_key_wa.method =
-				AMDGPU_DRM_KEY_INJECT_WORKAROUND_VCNFW_ASD_HANDSHAKING;
-		}
-
 		if (amdgpu_vcnfw_log)
 			amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
 	}
@@ -245,9 +232,33 @@ static int vcn_v4_0_sw_fini(void *handle)
 static int vcn_v4_0_hw_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	volatile struct amdgpu_vcn4_fw_shared *fw_shared;
 	struct amdgpu_ring *ring;
 	int i, r;
 
+	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
+		if (adev->vcn.harvest_config & (1 << i))
+			continue;
+
+		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
+		fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
+		fw_shared->sq.is_enabled = 1;
+
+		fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_SMU_DPM_INTERFACE_FLAG);
+		fw_shared->smu_dpm_interface.smu_interface_type = (adev->flags & AMD_IS_APU) ?
+			AMDGPU_VCN_SMU_DPM_INTERFACE_APU : AMDGPU_VCN_SMU_DPM_INTERFACE_DGPU;
+
+		if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
+		    IP_VERSION(4, 0, 2)) {
+			fw_shared->present_flag_0 |= AMDGPU_FW_SHARED_FLAG_0_DRM_KEY_INJECT;
+			fw_shared->drm_key_wa.method =
+				AMDGPU_DRM_KEY_INJECT_WORKAROUND_VCNFW_ASD_HANDSHAKING;
+		}
+
+		if (amdgpu_vcnfw_log)
+			fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_FW_LOGGING_FLAG);
+	}
+
 	if (amdgpu_sriov_vf(adev)) {
 		r = vcn_v4_0_start_sriov(adev);
 		if (r)
--
2.34.1