[PATCH 07/12] drm/amd/pm: correct the checks for granting gpu reset APIs
Lazar, Lijo
lijo.lazar at amd.com
Thu Feb 17 05:00:47 UTC 2022
On 2/17/2022 8:18 AM, Quan, Evan wrote:
> [AMD Official Use Only]
>
>
>
>> -----Original Message-----
>> From: Lazar, Lijo <Lijo.Lazar at amd.com>
>> Sent: Monday, February 14, 2022 12:04 PM
>> To: Quan, Evan <Evan.Quan at amd.com>; amd-gfx at lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>;
>> rui.huang at amd.com
>> Subject: Re: [PATCH 07/12] drm/amd/pm: correct the checks for granting gpu
>> reset APIs
>>
>>
>>
>> On 2/11/2022 1:22 PM, Evan Quan wrote:
>>> Those gpu reset APIs can be granted when:
>>> - System is up and dpm features are enabled.
>>> - System is under resuming and dpm features are not yet enabled.
>>> Under such scenario, the PMFW is already alive and can support
>>> those gpu reset functionalities.
>>>
>>> Signed-off-by: Evan Quan <evan.quan at amd.com>
>>> Change-Id: I8c2f07138921eb53a2bd7fb94f9b3622af0eacf8
>>> ---
>>> .../gpu/drm/amd/include/kgd_pp_interface.h | 1 +
>>> drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 34 +++++++++++++++
>>> .../gpu/drm/amd/pm/powerplay/amd_powerplay.c | 42
>> +++++++++++++++----
>>> .../drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c | 1 +
>>> .../drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c | 17 ++++++++
>>> drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h | 1 +
>>> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 32 +++++++-------
>>> 7 files changed, 101 insertions(+), 27 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
>>> b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
>>> index a4c267f15959..892648a4a353 100644
>>> --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
>>> +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
>>> @@ -409,6 +409,7 @@ struct amd_pm_funcs {
>>> struct dpm_clocks *clock_table);
>>> int (*get_smu_prv_buf_details)(void *handle, void **addr, size_t
>> *size);
>>> void (*pm_compute_clocks)(void *handle);
>>> + bool (*is_smc_alive)(void *handle);
>>> };
>>>
>>> struct metrics_table_header {
>>> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
>>> b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
>>> index b46ae0063047..5f1d3342f87b 100644
>>> --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
>>> +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
>>> @@ -120,12 +120,25 @@ int
>> amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev,
>> uint32_t block
>>> return ret;
>>> }
>>>
>>> +static bool amdgpu_dpm_is_smc_alive(struct amdgpu_device *adev) {
>>> + const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
>>> +
>>> + if (!pp_funcs || !pp_funcs->is_smc_alive)
>>> + return false;
>>> +
>>> + return pp_funcs->is_smc_alive;
>>> +}
>>> +
>>> int amdgpu_dpm_baco_enter(struct amdgpu_device *adev)
>>> {
>>> const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
>>> void *pp_handle = adev->powerplay.pp_handle;
>>> int ret = 0;
>>>
>>> + if (!amdgpu_dpm_is_smc_alive(adev))
>>> + return -EOPNOTSUPP;
>>> +
>>> if (!pp_funcs || !pp_funcs->set_asic_baco_state)
>>> return -ENOENT;
>>>
>>> @@ -145,6 +158,9 @@ int amdgpu_dpm_baco_exit(struct amdgpu_device
>> *adev)
>>> void *pp_handle = adev->powerplay.pp_handle;
>>> int ret = 0;
>>>
>>> + if (!amdgpu_dpm_is_smc_alive(adev))
>>> + return -EOPNOTSUPP;
>>> +
>>> if (!pp_funcs || !pp_funcs->set_asic_baco_state)
>>> return -ENOENT;
>>>
>>> @@ -164,6 +180,9 @@ int amdgpu_dpm_set_mp1_state(struct
>> amdgpu_device *adev,
>>> int ret = 0;
>>> const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
>>>
>>> + if (!amdgpu_dpm_is_smc_alive(adev))
>>> + return -EOPNOTSUPP;
>>> +
>>> if (pp_funcs && pp_funcs->set_mp1_state) {
>>> mutex_lock(&adev->pm.mutex);
>>>
>>> @@ -184,6 +203,9 @@ bool amdgpu_dpm_is_baco_supported(struct
>> amdgpu_device *adev)
>>> bool baco_cap;
>>> int ret = 0;
>>>
>>> + if (!amdgpu_dpm_is_smc_alive(adev))
>>> + return false;
>>> +
>>> if (!pp_funcs || !pp_funcs->get_asic_baco_capability)
>>> return false;
>>>
>>> @@ -203,6 +225,9 @@ int amdgpu_dpm_mode2_reset(struct
>> amdgpu_device *adev)
>>> void *pp_handle = adev->powerplay.pp_handle;
>>> int ret = 0;
>>>
>>> + if (!amdgpu_dpm_is_smc_alive(adev))
>>> + return -EOPNOTSUPP;
>>> +
>>> if (!pp_funcs || !pp_funcs->asic_reset_mode_2)
>>> return -ENOENT;
>>>
>>> @@ -221,6 +246,9 @@ int amdgpu_dpm_baco_reset(struct
>> amdgpu_device *adev)
>>> void *pp_handle = adev->powerplay.pp_handle;
>>> int ret = 0;
>>>
>>> + if (!amdgpu_dpm_is_smc_alive(adev))
>>> + return -EOPNOTSUPP;
>>> +
>>> if (!pp_funcs || !pp_funcs->set_asic_baco_state)
>>> return -ENOENT;
>>>
>>> @@ -244,6 +272,9 @@ bool
>> amdgpu_dpm_is_mode1_reset_supported(struct amdgpu_device *adev)
>>> struct smu_context *smu = adev->powerplay.pp_handle;
>>> bool support_mode1_reset = false;
>>>
>>> + if (!amdgpu_dpm_is_smc_alive(adev))
>>> + return false;
>>> +
>>> if (is_support_sw_smu(adev)) {
>>> mutex_lock(&adev->pm.mutex);
>>> support_mode1_reset =
>> smu_mode1_reset_is_support(smu); @@ -258,6
>>> +289,9 @@ int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev)
>>> struct smu_context *smu = adev->powerplay.pp_handle;
>>> int ret = -EOPNOTSUPP;
>>>
>>> + if (!amdgpu_dpm_is_smc_alive(adev))
>>> + return -EOPNOTSUPP;
>>> +
>>> if (is_support_sw_smu(adev)) {
>>> mutex_lock(&adev->pm.mutex);
>>> ret = smu_mode1_reset(smu);
>>> diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
>>> b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
>>> index bba923cfe08c..4c709f7bcd51 100644
>>> --- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
>>> +++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
>>> @@ -844,9 +844,6 @@ static int pp_dpm_set_mp1_state(void *handle,
>> enum pp_mp1_state mp1_state)
>>> if (!hwmgr)
>>> return -EINVAL;
>>>
>>> - if (!hwmgr->pm_en)
>>> - return 0;
>>> -
>>> if (hwmgr->hwmgr_func->set_mp1_state)
>>> return hwmgr->hwmgr_func->set_mp1_state(hwmgr,
>> mp1_state);
>>>
>>> @@ -1305,8 +1302,7 @@ static int pp_get_asic_baco_capability(void
>> *handle, bool *cap)
>>> if (!hwmgr)
>>> return -EINVAL;
>>>
>>> - if (!(hwmgr->not_vf && amdgpu_dpm) ||
>>> - !hwmgr->hwmgr_func->get_asic_baco_capability)
>>> + if (!hwmgr->hwmgr_func->get_asic_baco_capability)
>>> return 0;
>>>
>>> hwmgr->hwmgr_func->get_asic_baco_capability(hwmgr, cap); @@ -
>> 1321,7
>>> +1317,7 @@ static int pp_get_asic_baco_state(void *handle, int *state)
>>> if (!hwmgr)
>>> return -EINVAL;
>>>
>>> - if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state)
>>> + if (!hwmgr->hwmgr_func->get_asic_baco_state)
>>> return 0;
>>>
>>> hwmgr->hwmgr_func->get_asic_baco_state(hwmgr, (enum
>> BACO_STATE
>>> *)state); @@ -1336,8 +1332,7 @@ static int pp_set_asic_baco_state(void
>> *handle, int state)
>>> if (!hwmgr)
>>> return -EINVAL;
>>>
>>> - if (!(hwmgr->not_vf && amdgpu_dpm) ||
>>> - !hwmgr->hwmgr_func->set_asic_baco_state)
>>> + if (!hwmgr->hwmgr_func->set_asic_baco_state)
>>> return 0;
>>>
>>> hwmgr->hwmgr_func->set_asic_baco_state(hwmgr, (enum
>>> BACO_STATE)state); @@ -1379,7 +1374,7 @@ static int
>> pp_asic_reset_mode_2(void *handle)
>>> {
>>> struct pp_hwmgr *hwmgr = handle;
>>>
>>> - if (!hwmgr || !hwmgr->pm_en)
>>> + if (!hwmgr)
>>> return -EINVAL;
>>>
>>> if (hwmgr->hwmgr_func->asic_reset == NULL) { @@ -1517,6
>> +1512,34 @@
>>> static void pp_pm_compute_clocks(void *handle)
>>> NULL);
>>> }
>>>
>>> +/* MP Apertures */
>>> +#define MP1_Public 0x03b00000
>>> +#define smnMP1_FIRMWARE_FLAGS
>> 0x3010028
>>> +#define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK
>> 0x00000001L
>>> +
>>> +static bool pp_is_smc_alive(void *handle) {
>>> + struct pp_hwmgr *hwmgr = handle;
>>> + struct amdgpu_device *adev = hwmgr->adev;
>>> + uint32_t mp1_fw_flags;
>>> +
>>> + /*
>>> + * If some ASIC(e.g. smu7/smu8) needs special handling for
>>> + * checking smc alive, it should have its own implementation
>>> + * for ->is_smc_alive.
>>> + */
>>> + if (hwmgr->hwmgr_func->is_smc_alive)
>>> + return hwmgr->hwmgr_func->is_smc_alive(hwmgr);
>>> +
>>> + mp1_fw_flags = RREG32_PCIE(MP1_Public |
>>> + (smnMP1_FIRMWARE_FLAGS & 0xffffffff));
>>> +
>>
>> The flags check doesn't tell whether PMFW is hung or not. It is a minimal
>> thing that is set after PMFW boot. To call the API this condition is necessary in
>> an implicit way. Driver always check this on boot, if not driver aborts smu init.
>>
>> So better thing is to go ahead and send the message without any check, it will
>> tell the result whether PMFW is really working or not.
>>
>> In short this API is not needed.
> [Quan, Evan] It was not designed to cover "PMFW hung". Instead, it was designed to be support early phase of post-silicon bringup.
> At that time, the SMU may be not enabled/up. We need to prevent this API from wrongly called.
>
One of the first things done, atleast in swsmu, is hw_init/resume ->
smu_start_smc_engine -> check_fw_status.
If smu is not up/enabled, this call shouldn't even happen as init itself
will fail.
Thanks,
Lijo
> BR
> Evan
>>
>> Thanks,
>> Lijo
>>
>>> + if (mp1_fw_flags &
>> MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK)
>>> + return true;
>>> +
>>> + return false;
>>> +}
>>> +
>>> static const struct amd_pm_funcs pp_dpm_funcs = {
>>> .load_firmware = pp_dpm_load_fw,
>>> .wait_for_fw_loading_complete = pp_dpm_fw_loading_complete,
>> @@
>>> -1582,4 +1605,5 @@ static const struct amd_pm_funcs pp_dpm_funcs = {
>>> .gfx_state_change_set = pp_gfx_state_change_set,
>>> .get_smu_prv_buf_details = pp_get_prv_buffer_details,
>>> .pm_compute_clocks = pp_pm_compute_clocks,
>>> + .is_smc_alive = pp_is_smc_alive,
>>> };
>>> diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
>>> b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
>>> index a1e11037831a..118039b96524 100644
>>> --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
>>> +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
>>> @@ -5735,6 +5735,7 @@ static const struct pp_hwmgr_func
>> smu7_hwmgr_funcs = {
>>> .get_asic_baco_state = smu7_baco_get_state,
>>> .set_asic_baco_state = smu7_baco_set_state,
>>> .power_off_asic = smu7_power_off_asic,
>>> + .is_smc_alive = smu7_is_smc_ram_running,
>>> };
>>>
>>> uint8_t smu7_get_sleep_divider_id_from_clock(uint32_t clock, diff
>>> --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
>>> b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
>>> index b50fd4a4a3d1..fc4d58329f6d 100644
>>> --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
>>> +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
>>> @@ -2015,6 +2015,22 @@ static void smu8_dpm_powergate_vce(struct
>> pp_hwmgr *hwmgr, bool bgate)
>>> }
>>> }
>>>
>>> +#define ixMP1_FIRMWARE_FLAGS
>> 0x3008210
>>> +#define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK
>> 0x00000001L
>>> +
>>> +static bool smu8_is_smc_running(struct pp_hwmgr *hwmgr) {
>>> + struct amdgpu_device *adev = hwmgr->adev;
>>> + uint32_t mp1_fw_flags;
>>> +
>>> + mp1_fw_flags = RREG32_SMC(ixMP1_FIRMWARE_FLAGS);
>>> +
>>> + if (mp1_fw_flags &
>> MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK)
>>> + return true;
>>> +
>>> + return false;
>>> +}
>>> +
>>> static const struct pp_hwmgr_func smu8_hwmgr_funcs = {
>>> .backend_init = smu8_hwmgr_backend_init,
>>> .backend_fini = smu8_hwmgr_backend_fini, @@ -2047,6 +2063,7
>> @@
>>> static const struct pp_hwmgr_func smu8_hwmgr_funcs = {
>>> .dynamic_state_management_disable = smu8_disable_dpm_tasks,
>>> .notify_cac_buffer_info = smu8_notify_cac_buffer_info,
>>> .get_thermal_temperature_range =
>>> smu8_get_thermal_temperature_range,
>>> + .is_smc_alive = smu8_is_smc_running,
>>> };
>>>
>>> int smu8_init_function_pointers(struct pp_hwmgr *hwmgr) diff --git
>>> a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
>>> b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
>>> index 4f7f2f455301..790fc387752c 100644
>>> --- a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
>>> +++ b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
>>> @@ -364,6 +364,7 @@ struct pp_hwmgr_func {
>>> bool disable);
>>> ssize_t (*get_gpu_metrics)(struct pp_hwmgr *hwmgr, void **table);
>>> int (*gfx_state_change)(struct pp_hwmgr *hwmgr, uint32_t state);
>>> + bool (*is_smc_alive)(struct pp_hwmgr *hwmgr);
>>> };
>>>
>>> struct pp_table_func {
>>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>> index 8b8feaf7aa0e..27a453fb4db7 100644
>>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>>> @@ -1845,9 +1845,6 @@ static int smu_set_mp1_state(void *handle,
>>> struct smu_context *smu = handle;
>>> int ret = 0;
>>>
>>> - if (!smu->pm_enabled)
>>> - return -EOPNOTSUPP;
>>> -
>>> if (smu->ppt_funcs &&
>>> smu->ppt_funcs->set_mp1_state)
>>> ret = smu->ppt_funcs->set_mp1_state(smu, mp1_state);
>> @@ -2513,9
>>> +2510,6 @@ static int smu_get_baco_capability(void *handle, bool *cap)
>>>
>>> *cap = false;
>>>
>>> - if (!smu->pm_enabled)
>>> - return 0;
>>> -
>>> if (smu->ppt_funcs && smu->ppt_funcs->baco_is_support)
>>> *cap = smu->ppt_funcs->baco_is_support(smu);
>>>
>>> @@ -2527,9 +2521,6 @@ static int smu_baco_set_state(void *handle, int
>> state)
>>> struct smu_context *smu = handle;
>>> int ret = 0;
>>>
>>> - if (!smu->pm_enabled)
>>> - return -EOPNOTSUPP;
>>> -
>>> if (state == 0) {
>>> if (smu->ppt_funcs->baco_exit)
>>> ret = smu->ppt_funcs->baco_exit(smu); @@ -2551,9
>> +2542,6 @@ bool
>>> smu_mode1_reset_is_support(struct smu_context *smu)
>>> {
>>> bool ret = false;
>>>
>>> - if (!smu->pm_enabled)
>>> - return false;
>>> -
>>> if (smu->ppt_funcs && smu->ppt_funcs->mode1_reset_is_support)
>>> ret = smu->ppt_funcs->mode1_reset_is_support(smu);
>>>
>>> @@ -2564,9 +2552,6 @@ int smu_mode1_reset(struct smu_context *smu)
>>> {
>>> int ret = 0;
>>>
>>> - if (!smu->pm_enabled)
>>> - return -EOPNOTSUPP;
>>> -
>>> if (smu->ppt_funcs->mode1_reset)
>>> ret = smu->ppt_funcs->mode1_reset(smu);
>>>
>>> @@ -2578,9 +2563,6 @@ static int smu_mode2_reset(void *handle)
>>> struct smu_context *smu = handle;
>>> int ret = 0;
>>>
>>> - if (!smu->pm_enabled)
>>> - return -EOPNOTSUPP;
>>> -
>>> if (smu->ppt_funcs->mode2_reset)
>>> ret = smu->ppt_funcs->mode2_reset(smu);
>>>
>>> @@ -2712,6 +2694,19 @@ static int smu_get_prv_buffer_details(void
>> *handle, void **addr, size_t *size)
>>> return 0;
>>> }
>>>
>>> +static bool smu_is_smc_alive(void *handle) {
>>> + struct smu_context *smu = handle;
>>> +
>>> + if (!smu->ppt_funcs->check_fw_status)
>>> + return false;
>>> +
>>> + if (!smu->ppt_funcs->check_fw_status(smu))
>>> + return true;
>>> +
>>> + return false;
>>> +}
>>> +
>>> static const struct amd_pm_funcs swsmu_pm_funcs = {
>>> /* export for sysfs */
>>> .set_fan_control_mode = smu_set_fan_control_mode,
>>> @@ -2765,6 +2760,7 @@ static const struct amd_pm_funcs
>> swsmu_pm_funcs = {
>>> .get_uclk_dpm_states = smu_get_uclk_dpm_states,
>>> .get_dpm_clock_table = smu_get_dpm_clock_table,
>>> .get_smu_prv_buf_details = smu_get_prv_buffer_details,
>>> + .is_smc_alive = smu_is_smc_alive,
>>> };
>>>
>>> int smu_wait_for_event(struct smu_context *smu, enum
>> smu_event_type
>>> event,
>>>
More information about the amd-gfx
mailing list