[PATCH v6 1/2] drm/amdgpu: add debugfs for reset registers list

Christian König christian.koenig at amd.com
Wed Feb 16 10:11:38 UTC 2022


Am 16.02.22 um 10:49 schrieb Somalapuram Amaranath:
> List of register populated for dump collection during the GPU reset.
>
> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  5 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 +++++++++++++++++++++
>   2 files changed, 100 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b85b67a88a3d..57965316873b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1097,6 +1097,11 @@ struct amdgpu_device {
>   
>   	struct amdgpu_reset_control     *reset_cntl;
>   	uint32_t                        ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
> +
> +	/* reset dump register */
> +	uint32_t			*reset_dump_reg_list;
> +	int                             n_regs;
> +	struct mutex			reset_dump_mutex;

I think we should rather use the reset lock for this instead of 
introducing just another mutex.

>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..faf985c7cb93 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>   			amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
> +				char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char reg_offset[11];
> +	int i, r, len = 0;
> +
> +	if (*pos)
> +		return 0;
> +
> +	if (adev->n_regs == 0)
> +		return 0;
> +
> +	for (i = 0; i < adev->n_regs; i++) {
> +		sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
> +		r = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
> +
> +		if (r)
> +			return -EFAULT;
> +
> +		len += strlen(reg_offset);
> +	}

You need to hold the lock protecting adev->reset_dump_reg_list and 
adev->n_regs while accessing those.

(BTW: num_regs instead of n_regs would match more what we use elsewhere, 
but is not a must have).

> +
> +	r = copy_to_user(buf + len, "\n", 1);
> +
> +	if (r)
> +		return -EFAULT;
> +
> +	len++;
> +	*pos += len;
> +
> +	return len;
> +}
> +
> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
> +			const char __user *buf, size_t size, loff_t *pos)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	char *reg_offset, *reg, reg_temp[11];
> +	static int alloc_count;
> +	int ret, i = 0, len = 0;
> +
> +	do {
> +		reg_offset = reg_temp;
> +		memset(reg_offset,  0, 11);
> +		ret = copy_from_user(reg_offset, buf + len, min(11, ((int)size-len)));
> +
> +		if (ret)
> +			goto failed;
> +
> +		reg = strsep(&reg_offset, " ");
> +
> +		if (alloc_count <= i) {

> +			adev->reset_dump_reg_list =  krealloc_array(
> +							adev->reset_dump_reg_list, 1,
> +							sizeof(uint32_t), GFP_KERNEL);
> +			alloc_count++;
> +		}
> +
> +		ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]);

This here is modifying adev->reset_dump_reg_list as well and so must be 
protected by a lock as well.

The tricky part is that we can't allocate memory while holding this lock 
(because we need it during reset as well).

One solution for this is to read the register list into a local array 
first and when that's done swap the local array with the one in 
adev->reset_dump_reg_list while holding the lock.

Regards,
Christian.

> +
> +		if (ret)
> +			goto failed;
> +
> +		len += strlen(reg) + 1;
> +		i++;
> +
> +	} while (len < size);
> +
> +	adev->n_regs = i;
> +
> +	return size;
> +
> +failed:
> +	mutex_lock(&adev->reset_dump_mutex);
> +	kfree(adev->reset_dump_reg_list);
> +	adev->reset_dump_reg_list = NULL;
> +	alloc_count = 0;
> +	adev->n_regs = 0;
> +	mutex_unlock(&adev->reset_dump_mutex);
> +	return -EFAULT;
> +}
> +
> +
> +
> +static const struct file_operations amdgpu_reset_dump_register_list = {
> +	.owner = THIS_MODULE,
> +	.read = amdgpu_reset_dump_register_list_read,
> +	.write = amdgpu_reset_dump_register_list_write,
> +	.llseek = default_llseek
> +};
> +
>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   {
>   	struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> @@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   	if (!debugfs_initialized())
>   		return 0;
>   
> +	mutex_init(&adev->reset_dump_mutex);
>   	ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
>   				  &fops_ib_preempt);
>   	if (IS_ERR(ent)) {
> @@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   			    &amdgpu_debugfs_test_ib_fops);
>   	debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>   			    &amdgpu_debugfs_vm_info_fops);
> +	debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,
> +			    &amdgpu_reset_dump_register_list);
>   
>   	adev->debugfs_vbios_blob.data = adev->bios;
>   	adev->debugfs_vbios_blob.size = adev->bios_size;



More information about the amd-gfx mailing list