[PATCH v5] drm/amdgpu: add support for SMU debug option

Fri Dec 10 11:02:00 UTC 2021


On 12/10/2021 3:08 PM, Lang Yu wrote:
> SMU firmware expects the driver maintains error context
> and doesn't interact with SMU any more when SMU errors
> occurred. That will aid in debugging SMU firmware issues.
> 
> Add SMU debug option support for this request, it can be
> enabled or disabled via amdgpu_smu_debug debugfs file.
> Use a 32-bit mask to indicate corresponding debug mode.
> Currently, only one mode-SMU FW debug mode. When enabled,
> it brings hardware to a kind of halt state so that no one
> can touch it any more in the envent of SMU errors.
> 
> The dirver interacts with SMU via sending messages. And
> threre are three ways to sending messages to SMU in current
> implementation. Handle them respectively as following:
> 
> 1, smu_cmn_send_smc_msg_with_param() for normal timeout cases
> 
>    Halt on any error.
> 
> 2, smu_cmn_send_msg_without_waiting()/smu_cmn_wait_for_response()
> for longer timeout cases
> 
>    Halt on errors apart from ETIME. Otherwise this way won't work.
>    Let the user handle ETIME error in such a case.
> 
> 3, smu_cmn_send_msg_without_waiting() for no waiting cases
> 
>    Halt on errors apart from ETIME. Otherwise second way won't work.
> 
> == Command Guide ==
> 
> 1, enable SMU FW debug
> 
>   # echo 0x1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> 2, disable SMU FW debug
> 
>   # echo 0x0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> v5:
>   - Use bit mask to allow more debug features.(Evan)
>   - Use WRAN() instead of BUG().(Evan)
> 
> v4:
>   - Set to halt state instead of a simple hang.(Christian)
> 
> v3:
>   - Use debugfs_create_bool().(Christian)
>   - Put variable into smu_context struct.
>   - Don't resend command when timeout.
> 
> v2:
>   - Resend command when timeout.(Lijo)
>   - Use debugfs file instead of module parameter.
> 
> Signed-off-by: Lang Yu <lang.yu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  3 +++
>   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h     |  6 ++++++
>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 22 ++++++++++++++++++++-
>   3 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..9dfccb20fedd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1618,6 +1618,9 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   	if (!debugfs_initialized())
>   		return 0;
>   
> +	debugfs_create_x32("amdgpu_smu_debug", 0600, root,
> +			   &adev->smu.smu_debug_mask);
> +
>   	ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>   				  &fops_ib_preempt);
>   	if (IS_ERR(ent)) {
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> index f738f7dc20c9..b24be7c8e2ef 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> @@ -569,6 +569,12 @@ struct smu_context
>   	struct smu_user_dpm_profile user_dpm_profile;
>   
>   	struct stb_context stb_context;
> +
> +	#define DEBUG_FW_MASK		0x1
> +	/*
> +	 * 0 = disabled (default), otherwise enable corresponding debug mode
> +	 */

Please define this outside the structure and better rename this option 
as SMU_DEBUG_HALT_ON_ERROR

With that changed
	Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>

Thanks,
Lijo

> +	uint32_t smu_debug_mask;
>   };
>   
>   struct i2c_adapter;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 048ca1673863..0f807688ab52 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -272,6 +272,12 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>   	__smu_cmn_send_msg(smu, msg_index, param);
>   	res = 0;
>   Out:
> +	if (unlikely(smu->smu_debug_mask & DEBUG_FW_MASK) &&
> +	    res && (res != -ETIME)) {
> +		amdgpu_device_halt(smu->adev);
> +		WARN_ON(1);
> +	}
> +
>   	return res;
>   }
>   
> @@ -288,9 +294,18 @@ int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
>   int smu_cmn_wait_for_response(struct smu_context *smu)
>   {
>   	u32 reg;
> +	int res;
>   
>   	reg = __smu_cmn_poll_stat(smu);
> -	return __smu_cmn_reg2errno(smu, reg);
> +	res = __smu_cmn_reg2errno(smu, reg);
> +
> +	if (unlikely(smu->smu_debug_mask & DEBUG_FW_MASK) &&
> +	    res && (res != -ETIME)) {
> +		amdgpu_device_halt(smu->adev);
> +		WARN_ON(1);
> +	}
> +
> +	return res;
>   }
>   
>   /**
> @@ -357,6 +372,11 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   	if (read_arg)
>   		smu_cmn_read_arg(smu, read_arg);
>   Out:
> +	if (unlikely(smu->smu_debug_mask & DEBUG_FW_MASK) && res) {
> +		amdgpu_device_halt(smu->adev);
> +		WARN_ON(1);
> +	}
> +
>   	mutex_unlock(&smu->message_lock);
>   	return res;
>   }
>