[PATCH] drm/amdgpu: add support to SMU debug option

Lazar, Lijo lijo.lazar at amd.com
Wed Dec 1 09:47:20 UTC 2021



On 12/1/2021 2:54 PM, Lang Yu wrote:
> To maintain system error state when SMU errors occurred,
> which will aid in debugging SMU firmware issues, add SMU
> debug option support.
> 
> It can be enabled or disabled via amdgpu_smu_debug
> debugfs file. When enabled, it makes SMU errors fatal.
> It is disabled by default.
> 
> == Command Guide ==
> 
> 1, enable SMU debug option
> 
>   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> 2, disable SMU debug option
> 
>   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> v3:
>   - Use debugfs_create_bool().(Christian)
>   - Put variable into smu_context struct.
>   - Don't resend command when timeout.
> 
> v2:
>   - Resend command when timeout.(Lijo)
>   - Use debugfs file instead of module parameter.
> 
> Signed-off-by: Lang Yu <lang.yu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c        | 3 +++
>   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h            | 5 +++++
>   drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 ++
>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c             | 8 +++++++-
>   4 files changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..86cd888c7822 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1618,6 +1618,9 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   	if (!debugfs_initialized())
>   		return 0;
>   
> +	debugfs_create_bool("amdgpu_smu_debug", 0600, root,
> +				  &adev->smu.smu_debug_mode);
> +
>   	ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>   				  &fops_ib_preempt);
>   	if (IS_ERR(ent)) {
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> index f738f7dc20c9..50dbf5594a9d 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> @@ -569,6 +569,11 @@ struct smu_context
>   	struct smu_user_dpm_profile user_dpm_profile;
>   
>   	struct stb_context stb_context;
> +	/*
> +	 * When enabled, it makes SMU errors fatal.
> +	 * (0 = disabled (default), 1 = enabled)
> +	 */
> +	bool smu_debug_mode;
>   };
>   
>   struct i2c_adapter;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 6e781cee8bb6..d3797a2d6451 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -1919,6 +1919,8 @@ static int aldebaran_mode2_reset(struct smu_context *smu)
>   out:
>   	mutex_unlock(&smu->message_lock);
>   
> +	BUG_ON(unlikely(smu->smu_debug_mode) && ret);
> +
This hunk can be skipped while submitting. If this fails, GPU reset will 
fail and amdgpu won't continue.

>   	return ret;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 048ca1673863..9be005eb4241 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -349,15 +349,21 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
>   		goto Out;
>   	}
> +
>   	__smu_cmn_send_msg(smu, (uint16_t) index, param);
>   	reg = __smu_cmn_poll_stat(smu);
>   	res = __smu_cmn_reg2errno(smu, reg);
> -	if (res != 0)
> +	if (res != 0) {
>   		__smu_cmn_reg_print_error(smu, reg, index, param, msg);
> +		goto Out;

Next step is reading smu parameter register which is harmless as reading 
response register and it's not clear on read. This goto also may be skipped.

Thanks,
Lijo

> +	}
>   	if (read_arg)
>   		smu_cmn_read_arg(smu, read_arg);
>   Out:
>   	mutex_unlock(&smu->message_lock);
> +
> +	BUG_ON(unlikely(smu->smu_debug_mode) && res);
> +
>   	return res;
>   }
>   
> 


More information about the amd-gfx mailing list