[PATCH] drm/amdgpu: fix double ucode load by PSP(v3)
Zhang, Hawking
Hawking.Zhang at amd.com
Thu Aug 1 05:35:51 UTC 2019
No objection from me for this patch. But I was really shocked at first glance at the subject and wondered how the amdgpu driver could survive with this bug in the bare-metal case... It then turned out that this is an SRIOV-specific bug, because psp is initialized ahead of ih in the sriov use case. The status.hw fix in the suspend/resume call stack looks reasonable to me. Patch is
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
Regards,
Hawking
-----Original Message-----
From: Liu, Monk <Monk.Liu at amd.com>
Sent: 2019年8月1日 11:43
To: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
Cc: Deng, Emily <Emily.Deng at amd.com>; amd-gfx at lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu: fix double ucode load by PSP(v3)
If no objection I would submit those three patches, thanks
_____________________________________
Monk Liu|GPU Virtualization Team |AMD
-----Original Message-----
From: Deng, Emily <Emily.Deng at amd.com>
Sent: Wednesday, July 31, 2019 5:04 PM
To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Liu, Monk <Monk.Liu at amd.com>
Subject: RE: [PATCH] drm/amdgpu: fix double ucode load by PSP(v3)
All looks good to me. Reviewed-by: Emily Deng <Emily.Deng at amd.com>.
>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Monk
>Liu
>Sent: Wednesday, July 31, 2019 4:54 PM
>To: amd-gfx at lists.freedesktop.org
>Cc: Liu, Monk <Monk.Liu at amd.com>
>Subject: [PATCH] drm/amdgpu: fix double ucode load by PSP(v3)
>
>previously the ucode loading of PSP was repeated, one executed in
>phase_1 init/re-init/resume and the other in fw_loading routine
>
>Avoid this double loading by clearing ip_blocks.status.hw in suspend or
>reset prior to the FW loading and any block's hw_init/resume
>
>v2:
>still do the smu fw loading since it is needed by bare-metal
>
>v3:
>drop the change in reinit_early_sriov, just clear all block's status.hw
>in the head place and set the status.hw after hw_init done is enough
>
>Signed-off-by: Monk Liu <Monk.Liu at amd.com>
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59
>+++++++++++++++++++-----------
> 1 file changed, 38 insertions(+), 21 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>index 6cb358c..30436ba 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>@@ -1673,28 +1673,34 @@ static int amdgpu_device_fw_loading(struct
>amdgpu_device *adev)
>
> if (adev->asic_type >= CHIP_VEGA10) {
> for (i = 0; i < adev->num_ip_blocks; i++) {
>- if (adev->ip_blocks[i].version->type ==
>AMD_IP_BLOCK_TYPE_PSP) {
>- if (adev->in_gpu_reset || adev->in_suspend) {
>- if (amdgpu_sriov_vf(adev) && adev-
>>in_gpu_reset)
>- break; /* sriov gpu reset, psp
>need to do hw_init before IH because of hw limit */
>- r = adev->ip_blocks[i].version->funcs-
>>resume(adev);
>- if (r) {
>- DRM_ERROR("resume of IP
>block <%s> failed %d\n",
>+ if (adev->ip_blocks[i].version->type !=
>AMD_IP_BLOCK_TYPE_PSP)
>+ continue;
>+
>+ /* no need to do the fw loading again if already
>done*/
>+ if (adev->ip_blocks[i].status.hw == true)
>+ break;
>+
>+ if (adev->in_gpu_reset || adev->in_suspend) {
>+ r = adev->ip_blocks[i].version->funcs-
>>resume(adev);
>+ if (r) {
>+ DRM_ERROR("resume of IP block <%s>
>failed %d\n",
> adev-
>>ip_blocks[i].version->funcs->name, r);
>- return r;
>- }
>- } else {
>- r = adev->ip_blocks[i].version->funcs-
>>hw_init(adev);
>- if (r) {
>- DRM_ERROR("hw_init of IP
>block <%s> failed %d\n",
>- adev->ip_blocks[i].version-
>>funcs->name, r);
>- return r;
>- }
>+ return r;
>+ }
>+ } else {
>+ r = adev->ip_blocks[i].version->funcs-
>>hw_init(adev);
>+ if (r) {
>+ DRM_ERROR("hw_init of IP block <%s>
>failed %d\n",
>+ adev-
>>ip_blocks[i].version->funcs->name, r);
>+ return r;
> }
>- adev->ip_blocks[i].status.hw = true;
> }
>+
>+ adev->ip_blocks[i].status.hw = true;
>+ break;
> }
> }
>+
> r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
>
> return r;
>@@ -2136,7 +2142,9 @@ static int
>amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
> if (r) {
> DRM_ERROR("suspend of IP block <%s> failed %d\n",
> adev->ip_blocks[i].version->funcs-
>>name, r);
>+ return r;
> }
>+ adev->ip_blocks[i].status.hw = false;
> }
> }
>
>@@ -2176,14 +2184,16 @@ static int
>amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
> if (is_support_sw_smu(adev)) {
> /* todo */
> } else if (adev->powerplay.pp_funcs &&
>- adev->powerplay.pp_funcs->set_mp1_state)
>{
>+ adev->powerplay.pp_funcs-
>>set_mp1_state) {
> r = adev->powerplay.pp_funcs-
>>set_mp1_state(
> adev->powerplay.pp_handle,
> adev->mp1_state);
> if (r) {
> DRM_ERROR("SMC failed to set mp1
>state %d, %d\n",
> adev->mp1_state, r);
>+ return r;
> }
>+ adev->ip_blocks[i].status.hw = false;
> }
> }
> }
>@@ -2238,6 +2248,7 @@ static int
>amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
> for (j = 0; j < adev->num_ip_blocks; j++) {
> block = &adev->ip_blocks[j];
>
>+ block->status.hw = false;
> if (block->version->type != ip_order[i] ||
> !block->status.valid)
> continue;
>@@ -2246,6 +2257,7 @@ static int
>amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
> DRM_INFO("RE-INIT-early: %s %s\n", block->version-
>>funcs->name, r?"failed":"succeeded");
> if (r)
> return r;
>+ block->status.hw = true;
> }
> }
>
>@@ -2273,13 +2285,15 @@ static int
>amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
> block = &adev->ip_blocks[j];
>
> if (block->version->type != ip_order[i] ||
>- !block->status.valid)
>+ !block->status.valid ||
>+ block->status.hw)
> continue;
>
> r = block->version->funcs->hw_init(adev);
> DRM_INFO("RE-INIT-late: %s %s\n", block->version-
>>funcs->name, r?"failed":"succeeded");
> if (r)
> return r;
>+ block->status.hw = true;
> }
> }
>
>@@ -2303,17 +2317,19 @@ static int
>amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
> int i, r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
>- if (!adev->ip_blocks[i].status.valid)
>+ if (!adev->ip_blocks[i].status.valid || adev-
>>ip_blocks[i].status.hw)
> continue;
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> adev->ip_blocks[i].version->type ==
>AMD_IP_BLOCK_TYPE_IH) {
>+
> r = adev->ip_blocks[i].version->funcs->resume(adev);
> if (r) {
> DRM_ERROR("resume of IP block <%s> failed %d\n",
> adev->ip_blocks[i].version->funcs-
>>name, r);
> return r;
> }
>+ adev->ip_blocks[i].status.hw = true;
> }
> }
>
>@@ -2338,7 +2354,7 @@ static int amdgpu_device_ip_resume_phase2(struct
>amdgpu_device *adev)
> int i, r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
>- if (!adev->ip_blocks[i].status.valid)
>+ if (!adev->ip_blocks[i].status.valid || adev-
>>ip_blocks[i].status.hw)
> continue;
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || @@
>-2351,6 +2367,7 @@ static int amdgpu_device_ip_resume_phase2(struct
>amdgpu_device *adev)
> adev->ip_blocks[i].version->funcs->name, r);
> return r;
> }
>+ adev->ip_blocks[i].status.hw = true;
> }
>
> return 0;
>--
>2.7.4
>
>_______________________________________________
>amd-gfx mailing list
>amd-gfx at lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list