[PATCH] drm/amdgpu: fix double ucode load by PSP(v3)

Zhang, Hawking Hawking.Zhang at amd.com
Thu Aug 1 05:35:51 UTC 2019


No objection from me to this patch. I was really shocked at first glance at the subject, though, and wondered how the amdgpu driver could survive with this bug in the bare-metal case... It then turned out that this is an SRIOV-specific bug, because PSP is initialized ahead of IH in the SRIOV use case. The status.hw fix in the suspend/resume call stack looks reasonable to me. Patch is

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: Liu, Monk <Monk.Liu at amd.com> 
Sent: 2019年8月1日 11:43
To: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
Cc: Deng, Emily <Emily.Deng at amd.com>; amd-gfx at lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu: fix double ucode load by PSP(v3)

If there is no objection, I will submit these three patches. Thanks.

_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Deng, Emily <Emily.Deng at amd.com>
Sent: Wednesday, July 31, 2019 5:04 PM
To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Liu, Monk <Monk.Liu at amd.com>
Subject: RE: [PATCH] drm/amdgpu: fix double ucode load by PSP(v3)

All looks good to me. Reviewed-by: Emily Deng <Emily.Deng at amd.com>.

>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Monk 
>Liu
>Sent: Wednesday, July 31, 2019 4:54 PM
>To: amd-gfx at lists.freedesktop.org
>Cc: Liu, Monk <Monk.Liu at amd.com>
>Subject: [PATCH] drm/amdgpu: fix double ucode load by PSP(v3)
>
>Previously the ucode loading by PSP was repeated: once in
>phase_1 init/re-init/resume and once more in the fw_loading routine.
>
>Avoid this double loading by clearing ip_blocks.status.hw in suspend or 
>reset prior to the FW loading and any block's hw_init/resume
>
>v2:
>still do the smu fw loading since it is needed by bare-metal
>
>v3:
>drop the change in reinit_early_sriov; just clearing every block's status.hw 
>at the start and setting status.hw once hw_init is done is enough
>
>Signed-off-by: Monk Liu <Monk.Liu at amd.com>
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59
>+++++++++++++++++++-----------
> 1 file changed, 38 insertions(+), 21 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>index 6cb358c..30436ba 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>@@ -1673,28 +1673,34 @@ static int amdgpu_device_fw_loading(struct 
>amdgpu_device *adev)
>
> 	if (adev->asic_type >= CHIP_VEGA10) {
> 		for (i = 0; i < adev->num_ip_blocks; i++) {
>-			if (adev->ip_blocks[i].version->type ==
>AMD_IP_BLOCK_TYPE_PSP) {
>-				if (adev->in_gpu_reset || adev->in_suspend) {
>-					if (amdgpu_sriov_vf(adev) && adev-
>>in_gpu_reset)
>-						break; /* sriov gpu reset, psp
>need to do hw_init before IH because of hw limit */
>-					r = adev->ip_blocks[i].version->funcs-
>>resume(adev);
>-					if (r) {
>-						DRM_ERROR("resume of IP
>block <%s> failed %d\n",
>+			if (adev->ip_blocks[i].version->type !=
>AMD_IP_BLOCK_TYPE_PSP)
>+				continue;
>+
>+			/* no need to do the fw loading again if already
>done*/
>+			if (adev->ip_blocks[i].status.hw == true)
>+				break;
>+
>+			if (adev->in_gpu_reset || adev->in_suspend) {
>+				r = adev->ip_blocks[i].version->funcs-
>>resume(adev);
>+				if (r) {
>+					DRM_ERROR("resume of IP block <%s>
>failed %d\n",
> 							  adev-
>>ip_blocks[i].version->funcs->name, r);
>-						return r;
>-					}
>-				} else {
>-					r = adev->ip_blocks[i].version->funcs-
>>hw_init(adev);
>-					if (r) {
>-						DRM_ERROR("hw_init of IP
>block <%s> failed %d\n",
>-						  adev->ip_blocks[i].version-
>>funcs->name, r);
>-						return r;
>-					}
>+					return r;
>+				}
>+			} else {
>+				r = adev->ip_blocks[i].version->funcs-
>>hw_init(adev);
>+				if (r) {
>+					DRM_ERROR("hw_init of IP block <%s>
>failed %d\n",
>+							  adev-
>>ip_blocks[i].version->funcs->name, r);
>+					return r;
> 				}
>-				adev->ip_blocks[i].status.hw = true;
> 			}
>+
>+			adev->ip_blocks[i].status.hw = true;
>+			break;
> 		}
> 	}
>+
> 	r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
>
> 	return r;
>@@ -2136,7 +2142,9 @@ static int
>amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
> 			if (r) {
> 				DRM_ERROR("suspend of IP block <%s> failed %d\n",
> 					  adev->ip_blocks[i].version->funcs-
>>name, r);
>+				return r;
> 			}
>+			adev->ip_blocks[i].status.hw = false;
> 		}
> 	}
>
>@@ -2176,14 +2184,16 @@ static int
>amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
> 			if (is_support_sw_smu(adev)) {
> 				/* todo */
> 			} else if (adev->powerplay.pp_funcs &&
>-				   adev->powerplay.pp_funcs->set_mp1_state)
>{
>+					   adev->powerplay.pp_funcs-
>>set_mp1_state) {
> 				r = adev->powerplay.pp_funcs-
>>set_mp1_state(
> 					adev->powerplay.pp_handle,
> 					adev->mp1_state);
> 				if (r) {
> 					DRM_ERROR("SMC failed to set mp1
>state %d, %d\n",
> 						  adev->mp1_state, r);
>+					return r;
> 				}
>+				adev->ip_blocks[i].status.hw = false;
> 			}
> 		}
> 	}
>@@ -2238,6 +2248,7 @@ static int
>amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
> 		for (j = 0; j < adev->num_ip_blocks; j++) {
> 			block = &adev->ip_blocks[j];
>
>+			block->status.hw = false;
> 			if (block->version->type != ip_order[i] ||
> 				!block->status.valid)
> 				continue;
>@@ -2246,6 +2257,7 @@ static int
>amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
> 			DRM_INFO("RE-INIT-early: %s %s\n", block->version-
>>funcs->name, r?"failed":"succeeded");
> 			if (r)
> 				return r;
>+			block->status.hw = true;
> 		}
> 	}
>
>@@ -2273,13 +2285,15 @@ static int
>amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
> 			block = &adev->ip_blocks[j];
>
> 			if (block->version->type != ip_order[i] ||
>-				!block->status.valid)
>+				!block->status.valid ||
>+				block->status.hw)
> 				continue;
>
> 			r = block->version->funcs->hw_init(adev);
> 			DRM_INFO("RE-INIT-late: %s %s\n", block->version-
>>funcs->name, r?"failed":"succeeded");
> 			if (r)
> 				return r;
>+			block->status.hw = true;
> 		}
> 	}
>
>@@ -2303,17 +2317,19 @@ static int
>amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
> 	int i, r;
>
> 	for (i = 0; i < adev->num_ip_blocks; i++) {
>-		if (!adev->ip_blocks[i].status.valid)
>+		if (!adev->ip_blocks[i].status.valid || adev-
>>ip_blocks[i].status.hw)
> 			continue;
> 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> 		    adev->ip_blocks[i].version->type ==
>AMD_IP_BLOCK_TYPE_IH) {
>+
> 			r = adev->ip_blocks[i].version->funcs->resume(adev);
> 			if (r) {
> 				DRM_ERROR("resume of IP block <%s> failed %d\n",
> 					  adev->ip_blocks[i].version->funcs-
>>name, r);
> 				return r;
> 			}
>+			adev->ip_blocks[i].status.hw = true;
> 		}
> 	}
>
>@@ -2338,7 +2354,7 @@ static int amdgpu_device_ip_resume_phase2(struct
>amdgpu_device *adev)
> 	int i, r;
>
> 	for (i = 0; i < adev->num_ip_blocks; i++) {
>-		if (!adev->ip_blocks[i].status.valid)
>+		if (!adev->ip_blocks[i].status.valid || adev-
>>ip_blocks[i].status.hw)
> 			continue;
> 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || @@
>-2351,6 +2367,7 @@ static int amdgpu_device_ip_resume_phase2(struct
>amdgpu_device *adev)
> 				  adev->ip_blocks[i].version->funcs->name, r);
> 			return r;
> 		}
>+		adev->ip_blocks[i].status.hw = true;
> 	}
>
> 	return 0;
>--
>2.7.4
>
>_______________________________________________
>amd-gfx mailing list
>amd-gfx at lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/amd-gfx


More information about the amd-gfx mailing list