[PATCH] drm/amdgpu: fix wrong vram lost counter increment

Mon Apr 13 03:09:30 UTC 2020

Thanks. Updated this in V2.

-----Original Message-----
From: Alex Deucher <alexdeucher at gmail.com> 
Sent: Friday, April 10, 2020 8:45 PM
To: Quan, Evan <Evan.Quan at amd.com>
Cc: amd-gfx list <amd-gfx at lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher at amd.com>
Subject: Re: [PATCH] drm/amdgpu: fix wrong vram lost counter increment

On Fri, Apr 10, 2020 at 4:02 AM Evan Quan <evan.quan at amd.com> wrote:
>
> Vram lost counter is wrongly increased by two during baco reset.
>
> Change-Id: I8b9959a5d1632abc774ba07d56cf295bdd8288eb
> Signed-off-by: Evan Quan <evan.quan at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 36 ++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/cik.c           |  2 --
>  drivers/gpu/drm/amd/amdgpu/nv.c            |  4 ---
>  drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 ---
>  drivers/gpu/drm/amd/amdgpu/vi.c            |  2 --
>  5 files changed, 34 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a2a4e4b28d00..c9317975c46e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2087,8 +2087,40 @@ static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
>   */
>  static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)  
> {
> -       return !!memcmp(adev->gart.ptr, adev->reset_magic,
> -                       AMDGPU_RESET_MAGIC_NUM);
> +       if (memcmp(adev->gart.ptr, adev->reset_magic,
> +                       AMDGPU_RESET_MAGIC_NUM))
> +               return true;
> +
> +       if (!adev->in_gpu_reset)
> +               return false;
> +
> +       /*
> +        * For all ASICs with baco reset, the VRAM is assumed to be
> +        * lost.
> +        * For SOC15 and NV ASICs with mode1 reset, the VRAM is also
> +        * assumed to be lost.
> +        */
> +       switch (amdgpu_asic_reset_method(adev)) {
> +       case AMD_RESET_METHOD_BACO:
> +               return true;
> +       case AMD_RESET_METHOD_MODE1:
> +               switch (adev->asic_type) {
> +               case CHIP_VEGA10:
> +               case CHIP_VEGA12:
> +               case CHIP_VEGA20:
> +               case CHIP_RAVEN:
> +               case CHIP_ARCTURUS:
> +               case CHIP_RENOIR:
> +               case CHIP_NAVI10:
> +               case CHIP_NAVI14:
> +               case CHIP_NAVI12:

I think we can probably just drop the asic check and always return true for MODE1 reset.  The UMC block gets reset, so memory is not reliable.

Alex

> +                       return true;
> +               default:
> +                       return false;
> +               }
> +       default:
> +               return false;
> +       }
>  }
>
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c 
> b/drivers/gpu/drm/amd/amdgpu/cik.c
> index db68ffa27984..fe306d0f73f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> @@ -1358,8 +1358,6 @@ static int cik_asic_reset(struct amdgpu_device *adev)
>         int r;
>
>         if (cik_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
> -               if (!adev->in_suspend)
> -                       amdgpu_inc_vram_lost(adev);
>                 r = amdgpu_dpm_baco_reset(adev);
>         } else {
>                 r = cik_asic_pci_config_reset(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
> index 7768880fcccf..995bdec9fa7d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
> @@ -351,8 +351,6 @@ static int nv_asic_reset(struct amdgpu_device *adev)
>         struct smu_context *smu = &adev->smu;
>
>         if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
> -               if (!adev->in_suspend)
> -                       amdgpu_inc_vram_lost(adev);
>                 ret = smu_baco_enter(smu);
>                 if (ret)
>                         return ret;
> @@ -360,8 +358,6 @@ static int nv_asic_reset(struct amdgpu_device *adev)
>                 if (ret)
>                         return ret;
>         } else {
> -               if (!adev->in_suspend)
> -                       amdgpu_inc_vram_lost(adev);
>                 ret = nv_asic_mode1_reset(adev);
>         }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
> b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index a597ad22b675..58a440a15525 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -569,14 +569,10 @@ static int soc15_asic_reset(struct amdgpu_device 
> *adev)
>
>         switch (soc15_asic_reset_method(adev)) {
>                 case AMD_RESET_METHOD_BACO:
> -                       if (!adev->in_suspend)
> -                               amdgpu_inc_vram_lost(adev);
>                         return soc15_asic_baco_reset(adev);
>                 case AMD_RESET_METHOD_MODE2:
>                         return amdgpu_dpm_mode2_reset(adev);
>                 default:
> -                       if (!adev->in_suspend)
> -                               amdgpu_inc_vram_lost(adev);
>                         return soc15_asic_mode1_reset(adev);
>         }
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
> index 0a90c296409b..af8986a55354 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> @@ -744,8 +744,6 @@ static int vi_asic_reset(struct amdgpu_device *adev)
>         int r;
>
>         if (vi_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
> -               if (!adev->in_suspend)
> -                       amdgpu_inc_vram_lost(adev);
>                 r = amdgpu_dpm_baco_reset(adev);
>         } else {
>                 r = vi_asic_pci_config_reset(adev);
> --
> 2.26.0
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cev
> an.quan%40amd.com%7C22eb212ade824eb3fc5c08d7dd4cfe75%7C3dd8961fe4884e6
> 08e11a82d994e183d%7C0%7C0%7C637221195076651531&sdata=2DazCSnEqgcdV
> pRpmyEBZ9k%2BawbTdciixdhCdNIij4g%3D&reserved=0