[PATCH v2] drm/amd/pm: fix the high voltage and temperature issue

Alex Deucher alexdeucher at gmail.com
Wed Oct 25 14:15:10 UTC 2023


On Tue, Oct 24, 2023 at 11:57 PM Kenneth Feng <kenneth.feng at amd.com> wrote:
>
> Fix the high voltage and temperature issue that occurs after the driver is
> unloaded on SMU 13.0.0, SMU 13.0.7 and SMU 13.0.10.
> v2 - fix the code formatting and make sure the fix is applied in the driver unload case only.
>
> Signed-off-by: Kenneth Feng <kenneth.feng at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 36 +++++++++++++++----
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 33 +++++++++++++++--
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  1 +
>  drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h  |  2 ++
>  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    | 13 +++++++
>  .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  |  8 ++++-
>  .../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c  |  8 ++++-
>  7 files changed, 90 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 31f8c3ead161..c5c892a8b3f9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3986,13 +3986,23 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>                                 }
>                         }
>                 } else {
> -                       tmp = amdgpu_reset_method;
> -                       /* It should do a default reset when loading or reloading the driver,
> -                        * regardless of the module parameter reset_method.
> -                        */
> -                       amdgpu_reset_method = AMD_RESET_METHOD_NONE;
> -                       r = amdgpu_asic_reset(adev);
> -                       amdgpu_reset_method = tmp;
> +                       switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
> +                       case IP_VERSION(13, 0, 0):
> +                       case IP_VERSION(13, 0, 7):
> +                       case IP_VERSION(13, 0, 10):
> +                               r = psp_gpu_reset(adev);
> +                               break;
> +                       default:
> +                               tmp = amdgpu_reset_method;
> +                               /* It should do a default reset when loading or reloading the driver,
> +                                * regardless of the module parameter reset_method.
> +                                */
> +                               amdgpu_reset_method = AMD_RESET_METHOD_NONE;
> +                               r = amdgpu_asic_reset(adev);
> +                               amdgpu_reset_method = tmp;
> +                               break;
> +                       }
> +
>                         if (r) {
>                                 dev_err(adev->dev, "asic reset on init failed\n");
>                                 goto failed;
> @@ -5945,6 +5955,18 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>                 return -ENOTSUPP;
>
>         ret = amdgpu_dpm_baco_exit(adev);
> +
> +       if (!ret)
> +               switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
> +               case IP_VERSION(13, 0, 0):
> +               case IP_VERSION(13, 0, 7):
> +               case IP_VERSION(13, 0, 10):
> +                       adev->gfx.is_poweron = false;
> +                       break;
> +               default:
> +                       break;
> +               }

Was it not possible to put this in the smu13 baco exit code?

> +
>         if (ret)
>                 return ret;
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 7c3356d6da5e..2e82172ba250 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -733,7 +733,7 @@ static int smu_early_init(void *handle)
>         smu->adev = adev;
>         smu->pm_enabled = !!amdgpu_dpm;
>         smu->is_apu = false;
> -       smu->smu_baco.state = SMU_BACO_STATE_EXIT;
> +       smu->smu_baco.state = SMU_BACO_STATE_NONE;
>         smu->smu_baco.platform_support = false;
>         smu->user_dpm_profile.fan_mode = -1;
>
> @@ -1740,10 +1740,31 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
>         return 0;
>  }
>
> +static int smu_reset_mp1_state(struct smu_context *smu)
> +{
> +       struct amdgpu_device *adev = smu->adev;
> +       int ret = 0;
> +
> +       if ((!adev->in_runpm) && (!adev->in_suspend) &&
> +               (!amdgpu_in_reset(adev)))
> +               switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
> +                 case IP_VERSION(13, 0, 0):
> +                 case IP_VERSION(13, 0, 7):
> +                 case IP_VERSION(13, 0, 10):
> +                       ret = smu_set_mp1_state(smu, PP_MP1_STATE_UNLOAD);
> +                       break;

Is there any reason not to enable this on all dGPUs?

Alex

> +                 default:
> +                       break;
> +               }
> +
> +       return ret;
> +}
> +
>  static int smu_hw_fini(void *handle)
>  {
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>         struct smu_context *smu = adev->powerplay.pp_handle;
> +       int ret;
>
>         if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
>                 return 0;
> @@ -1761,7 +1782,15 @@ static int smu_hw_fini(void *handle)
>
>         adev->pm.dpm_enabled = false;
>
> -       return smu_smc_hw_cleanup(smu);
> +       ret = smu_smc_hw_cleanup(smu);
> +       if (ret)
> +               return ret;
> +
> +       ret = smu_reset_mp1_state(smu);
> +       if (ret)
> +               return ret;
> +
> +       return 0;
>  }
>
>  static void smu_late_fini(void *handle)
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index 1454eed76604..9f2dbc90b606 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -419,6 +419,7 @@ enum smu_reset_mode {
>  enum smu_baco_state {
>         SMU_BACO_STATE_ENTER = 0,
>         SMU_BACO_STATE_EXIT,
> +       SMU_BACO_STATE_NONE,
>  };
>
>  struct smu_baco_context {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> index cc02f979e9e9..43c7ba68eb50 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h
> @@ -299,5 +299,7 @@ int smu_v13_0_update_pcie_parameters(struct smu_context *smu,
>                                      uint8_t pcie_gen_cap,
>                                      uint8_t pcie_width_cap);
>
> +int smu_v13_0_disable_pmfw_state(struct smu_context* smu);
> +
>  #endif
>  #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index bcb7ab9d2221..0724441e53ef 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -2473,3 +2473,16 @@ int smu_v13_0_update_pcie_parameters(struct smu_context *smu,
>
>         return 0;
>  }
> +
> +int smu_v13_0_disable_pmfw_state(struct smu_context* smu)
> +{
> +       int ret;
> +       struct amdgpu_device *adev = smu->adev;
> +
> +       WREG32_PCIE(MP1_Public | (smnMP1_FIRMWARE_FLAGS & 0xffffffff), 0);
> +
> +       ret = RREG32_PCIE(MP1_Public |
> +                                          (smnMP1_FIRMWARE_FLAGS & 0xffffffff));
> +
> +       return ret == 0 ? 0 : -EINVAL;
> +}
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
> index 47d008cbc186..e2a09fe29e2f 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
> @@ -2758,7 +2758,13 @@ static int smu_v13_0_0_set_mp1_state(struct smu_context *smu,
>
>         switch (mp1_state) {
>         case PP_MP1_STATE_UNLOAD:
> -               ret = smu_cmn_set_mp1_state(smu, mp1_state);
> +               ret = smu_cmn_send_smc_msg_with_param(smu,
> +                                                               SMU_MSG_PrepareMp1ForUnload,
> +                                                               0x55, NULL);
> +
> +               if (!ret && smu->smu_baco.state == SMU_BACO_STATE_EXIT)
> +                       ret = smu_v13_0_disable_pmfw_state(smu);
> +
>                 break;
>         default:
>                 /* Ignore others */
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
> index b8a7a1d853df..2a0d1da18a9b 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
> @@ -2429,7 +2429,13 @@ static int smu_v13_0_7_set_mp1_state(struct smu_context *smu,
>
>         switch (mp1_state) {
>         case PP_MP1_STATE_UNLOAD:
> -               ret = smu_cmn_set_mp1_state(smu, mp1_state);
> +               ret = smu_cmn_send_smc_msg_with_param(smu,
> +                                                               SMU_MSG_PrepareMp1ForUnload,
> +                                                               0x55, NULL);
> +
> +               if (!ret && smu->smu_baco.state == SMU_BACO_STATE_EXIT)
> +                       ret = smu_v13_0_disable_pmfw_state(smu);
> +
>                 break;
>         default:
>                 /* Ignore others */
> --
> 2.34.1
>


More information about the amd-gfx mailing list