[PATCH 5/5] drm/amdgpu: fix calltrace during kmd unload

Yuan, Xiaojie Xiaojie.Yuan at amd.com
Thu Nov 28 03:32:01 UTC 2019


[AMD Official Use Only - Internal Distribution Only]

Hi Monk,

As long as the content of CSIB won't be changed by CP FW in runtime, I have no objection to 're-initialize after S3 resume'.
I am not quite sure about the actual behavior, let me do an experiment to confirm that and add Hawking / Jack who adds the original CSIB code for comment.

BTW, note that I recently has a patch to re-initialize CSIB in baco sequence, please consider to squash it when making your final fix:

commit c8494497feb0050a66128ca626f3883d6f08d783
Author: Xiaojie Yuan <xiaojie.yuan at amd.com>
Date:   Wed Nov 20 14:02:22 2019 +0800

    drm/amdgpu/gfx10: re-init clear state buffer after gpu reset

    This patch fixes 2nd baco reset failure with gfxoff enabled on navi1x.

    clear state buffer (resides in vram) is corrupted after 1st baco reset,
    upon gfxoff exit, CPF gets garbage header in CSIB and hangs.

    Signed-off-by: Xiaojie Yuan <xiaojie.yuan at amd.com>
    Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
    Signed-off-by: Alex Deucher <alexander.deucher at amd.com>

BR,
Xiaojie

________________________________________
From: Liu, Monk <Monk.Liu at amd.com>
Sent: Thursday, November 28, 2019 10:53 AM
To: Yuan, Xiaojie; Deucher, Alexander; Koenig, Christian
Cc: amd-gfx at lists.freedesktop.org
Subject: RE: [PATCH 5/5] drm/amdgpu: fix calltrace during kmd unload

Hi Xiaojie

For SRIOV we don't use suspend so I didn't think to that part, thanks for the remind !
But we still need to fix this call trace issue anyway (our jenkins testing  system consider such call trace as an error )

How about we do "  adev->gfx.rlc.funcs->get_csb_buffer(adev, dst_ptr);" in the hw_init() ? this way
You don't need to evict the CSIB during suspend and the CSIB always will be re-initialized after S3 resume ?

@Deucher, Alexander @Koenig, Christian what's your opinion ?
_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Yuan, Xiaojie <Xiaojie.Yuan at amd.com>
Sent: Tuesday, November 26, 2019 9:10 PM
To: Liu, Monk <Monk.Liu at amd.com>
Cc: amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH 5/5] drm/amdgpu: fix calltrace during kmd unload

Hi Monk,

hw_fini() is called in suspend code path as well. I'm wondering how csb can be evicted if it's not unpined before suspend.

BR,
Xiaojie

> On Nov 26, 2019, at 7:50 PM, Monk Liu <Monk.Liu at amd.com> wrote:
>
> kernel would report a warning on double unpin on the csb BO because we
> unpin it during hw_fini but actually we don't need to pin/unpin it
> during hw_init/fini since it is created with kernel pinned
>
> remove all those useless code for gfx9/10
>
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c |  1 -
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 38 --------------------------------
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 39 ---------------------------------
> 3 files changed, 78 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
> index c8793e6..289fada 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
> @@ -145,7 +145,6 @@ int amdgpu_gfx_rlc_init_csb(struct amdgpu_device *adev)
>    dst_ptr = adev->gfx.rlc.cs_ptr;
>    adev->gfx.rlc.funcs->get_csb_buffer(adev, dst_ptr);
>    amdgpu_bo_kunmap(adev->gfx.rlc.clear_state_obj);
> -    amdgpu_bo_unpin(adev->gfx.rlc.clear_state_obj);
>    amdgpu_bo_unreserve(adev->gfx.rlc.clear_state_obj);
>
>    return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index a56cba9..5ee7467 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -996,39 +996,6 @@ static int gfx_v10_0_rlc_init(struct amdgpu_device *adev)
>    return 0;
> }
>
> -static int gfx_v10_0_csb_vram_pin(struct amdgpu_device *adev) -{
> -    int r;
> -
> -    r = amdgpu_bo_reserve(adev->gfx.rlc.clear_state_obj, false);
> -    if (unlikely(r != 0))
> -        return r;
> -
> -    r = amdgpu_bo_pin(adev->gfx.rlc.clear_state_obj,
> -            AMDGPU_GEM_DOMAIN_VRAM);
> -    if (!r)
> -        adev->gfx.rlc.clear_state_gpu_addr =
> -            amdgpu_bo_gpu_offset(adev->gfx.rlc.clear_state_obj);
> -
> -    amdgpu_bo_unreserve(adev->gfx.rlc.clear_state_obj);
> -
> -    return r;
> -}
> -
> -static void gfx_v10_0_csb_vram_unpin(struct amdgpu_device *adev) -{
> -    int r;
> -
> -    if (!adev->gfx.rlc.clear_state_obj)
> -        return;
> -
> -    r = amdgpu_bo_reserve(adev->gfx.rlc.clear_state_obj, true);
> -    if (likely(r == 0)) {
> -        amdgpu_bo_unpin(adev->gfx.rlc.clear_state_obj);
> -        amdgpu_bo_unreserve(adev->gfx.rlc.clear_state_obj);
> -    }
> -}
> -
> static void gfx_v10_0_mec_fini(struct amdgpu_device *adev) {
>    amdgpu_bo_free_kernel(&adev->gfx.mec.hpd_eop_obj, NULL, NULL); @@
> -3780,10 +3747,6 @@ static int gfx_v10_0_hw_init(void *handle)
>    int r;
>    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
> -    r = gfx_v10_0_csb_vram_pin(adev);
> -    if (r)
> -        return r;
> -
>    if (!amdgpu_emu_mode)
>        gfx_v10_0_init_golden_registers(adev);
>
> @@ -3871,7 +3834,6 @@ static int gfx_v10_0_hw_fini(void *handle)
>    }
>    gfx_v10_0_cp_enable(adev, false);
>    gfx_v10_0_enable_gui_idle_interrupt(adev, false);
> -    gfx_v10_0_csb_vram_unpin(adev);
>
>    return 0;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 4cc2e50..524a7ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -1683,39 +1683,6 @@ static int gfx_v9_0_rlc_init(struct amdgpu_device *adev)
>    return 0;
> }
>
> -static int gfx_v9_0_csb_vram_pin(struct amdgpu_device *adev) -{
> -    int r;
> -
> -    r = amdgpu_bo_reserve(adev->gfx.rlc.clear_state_obj, false);
> -    if (unlikely(r != 0))
> -        return r;
> -
> -    r = amdgpu_bo_pin(adev->gfx.rlc.clear_state_obj,
> -            AMDGPU_GEM_DOMAIN_VRAM);
> -    if (!r)
> -        adev->gfx.rlc.clear_state_gpu_addr =
> -            amdgpu_bo_gpu_offset(adev->gfx.rlc.clear_state_obj);
> -
> -    amdgpu_bo_unreserve(adev->gfx.rlc.clear_state_obj);
> -
> -    return r;
> -}
> -
> -static void gfx_v9_0_csb_vram_unpin(struct amdgpu_device *adev) -{
> -    int r;
> -
> -    if (!adev->gfx.rlc.clear_state_obj)
> -        return;
> -
> -    r = amdgpu_bo_reserve(adev->gfx.rlc.clear_state_obj, true);
> -    if (likely(r == 0)) {
> -        amdgpu_bo_unpin(adev->gfx.rlc.clear_state_obj);
> -        amdgpu_bo_unreserve(adev->gfx.rlc.clear_state_obj);
> -    }
> -}
> -
> static void gfx_v9_0_mec_fini(struct amdgpu_device *adev) {
>    amdgpu_bo_free_kernel(&adev->gfx.mec.hpd_eop_obj, NULL, NULL); @@
> -3694,10 +3661,6 @@ static int gfx_v9_0_hw_init(void *handle)
>
>    gfx_v9_0_constants_init(adev);
>
> -    r = gfx_v9_0_csb_vram_pin(adev);
> -    if (r)
> -        return r;
> -
>    r = adev->gfx.rlc.funcs->resume(adev);
>    if (r)
>        return r;
> @@ -3779,8 +3742,6 @@ static int gfx_v9_0_hw_fini(void *handle)
>    gfx_v9_0_cp_enable(adev, false);
>    adev->gfx.rlc.funcs->stop(adev);
>
> -    gfx_v9_0_csb_vram_unpin(adev);
> -
>    return 0;
> }
>
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CXi
> aojie.Yuan%40amd.com%7C65e162e509ea4a90f79308d77266de65%7C3dd8961fe488
> 4e608e11a82d994e183d%7C0%7C0%7C637103658464512751&sdata=r5fpid5IsP
> 8anzg%2FZIYHn0N8xceBvG7rtRG80%2B7868o%3D&reserved=0


More information about the amd-gfx mailing list