[PATCH] drm/amdgpu: fix wrong vram lost counter increment
Quan, Evan
Evan.Quan at amd.com
Mon Apr 13 03:09:30 UTC 2020
Thanks. Updated this in V2.
-----Original Message-----
From: Alex Deucher <alexdeucher at gmail.com>
Sent: Friday, April 10, 2020 8:45 PM
To: Quan, Evan <Evan.Quan at amd.com>
Cc: amd-gfx list <amd-gfx at lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher at amd.com>
Subject: Re: [PATCH] drm/amdgpu: fix wrong vram lost counter increment
On Fri, Apr 10, 2020 at 4:02 AM Evan Quan <evan.quan at amd.com> wrote:
>
> Vram lost counter is wrongly increased by two during baco reset.
>
> Change-Id: I8b9959a5d1632abc774ba07d56cf295bdd8288eb
> Signed-off-by: Evan Quan <evan.quan at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 36 ++++++++++++++++++++--
> drivers/gpu/drm/amd/amdgpu/cik.c | 2 --
> drivers/gpu/drm/amd/amdgpu/nv.c | 4 ---
> drivers/gpu/drm/amd/amdgpu/soc15.c | 4 ---
> drivers/gpu/drm/amd/amdgpu/vi.c | 2 --
> 5 files changed, 34 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a2a4e4b28d00..c9317975c46e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2087,8 +2087,40 @@ static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
> */
> static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
> {
> - return !!memcmp(adev->gart.ptr, adev->reset_magic,
> - AMDGPU_RESET_MAGIC_NUM);
> + if (memcmp(adev->gart.ptr, adev->reset_magic,
> + AMDGPU_RESET_MAGIC_NUM))
> + return true;
> +
> + if (!adev->in_gpu_reset)
> + return false;
> +
> + /*
> + * For all ASICs with baco reset, the VRAM is assumed to be
> + * lost.
> + * For SOC15 and NV ASICs with mode1 reset, the VRAM is also
> + * assumed to be lost.
> + */
> + switch (amdgpu_asic_reset_method(adev)) {
> + case AMD_RESET_METHOD_BACO:
> + return true;
> + case AMD_RESET_METHOD_MODE1:
> + switch (adev->asic_type) {
> + case CHIP_VEGA10:
> + case CHIP_VEGA12:
> + case CHIP_VEGA20:
> + case CHIP_RAVEN:
> + case CHIP_ARCTURUS:
> + case CHIP_RENOIR:
> + case CHIP_NAVI10:
> + case CHIP_NAVI14:
> + case CHIP_NAVI12:
I think we can probably just drop the asic check and always return true for MODE1 reset. The UMC block gets reset, so memory is not reliable.
Alex
> + return true;
> + default:
> + return false;
> + }
> + default:
> + return false;
> + }
> }
>
> /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c
> b/drivers/gpu/drm/amd/amdgpu/cik.c
> index db68ffa27984..fe306d0f73f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> @@ -1358,8 +1358,6 @@ static int cik_asic_reset(struct amdgpu_device *adev)
> int r;
>
> if (cik_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
> - if (!adev->in_suspend)
> - amdgpu_inc_vram_lost(adev);
> r = amdgpu_dpm_baco_reset(adev);
> } else {
> r = cik_asic_pci_config_reset(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c
> b/drivers/gpu/drm/amd/amdgpu/nv.c
> index 7768880fcccf..995bdec9fa7d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
> @@ -351,8 +351,6 @@ static int nv_asic_reset(struct amdgpu_device *adev)
> struct smu_context *smu = &adev->smu;
>
> if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
> - if (!adev->in_suspend)
> - amdgpu_inc_vram_lost(adev);
> ret = smu_baco_enter(smu);
> if (ret)
> return ret;
> @@ -360,8 +358,6 @@ static int nv_asic_reset(struct amdgpu_device *adev)
> if (ret)
> return ret;
> } else {
> - if (!adev->in_suspend)
> - amdgpu_inc_vram_lost(adev);
> ret = nv_asic_mode1_reset(adev);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
> b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index a597ad22b675..58a440a15525 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -569,14 +569,10 @@ static int soc15_asic_reset(struct amdgpu_device
> *adev)
>
> switch (soc15_asic_reset_method(adev)) {
> case AMD_RESET_METHOD_BACO:
> - if (!adev->in_suspend)
> - amdgpu_inc_vram_lost(adev);
> return soc15_asic_baco_reset(adev);
> case AMD_RESET_METHOD_MODE2:
> return amdgpu_dpm_mode2_reset(adev);
> default:
> - if (!adev->in_suspend)
> - amdgpu_inc_vram_lost(adev);
> return soc15_asic_mode1_reset(adev);
> }
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c
> b/drivers/gpu/drm/amd/amdgpu/vi.c
> index 0a90c296409b..af8986a55354 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> @@ -744,8 +744,6 @@ static int vi_asic_reset(struct amdgpu_device *adev)
> int r;
>
> if (vi_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
> - if (!adev->in_suspend)
> - amdgpu_inc_vram_lost(adev);
> r = amdgpu_dpm_baco_reset(adev);
> } else {
> r = vi_asic_pci_config_reset(adev);
> --
> 2.26.0
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cev
> an.quan%40amd.com%7C22eb212ade824eb3fc5c08d7dd4cfe75%7C3dd8961fe4884e6
> 08e11a82d994e183d%7C0%7C0%7C637221195076651531&sdata=2DazCSnEqgcdV
> pRpmyEBZ9k%2BawbTdciixdhCdNIij4g%3D&reserved=0
More information about the amd-gfx
mailing list