[PATCH v7 1/2] drm/amdgpu: add debugfs for reset registers list
Christian König
christian.koenig at amd.com
Thu Feb 17 15:38:20 UTC 2022
Am 17.02.22 um 16:32 schrieb Somalapuram, Amaranath:
>
> On 2/17/2022 8:36 PM, Christian König wrote:
>> Am 17.02.22 um 15:29 schrieb Somalapuram Amaranath:
>>> List of register populated for dump collection during the GPU reset.
>>>
>>> Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 112
>>> ++++++++++++++++++++
>>> 2 files changed, 116 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index b85b67a88a3d..6e35f2c4c869 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1097,6 +1097,10 @@ struct amdgpu_device {
>>> struct amdgpu_reset_control *reset_cntl;
>>> uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
>>> +
>>> + /* reset dump register */
>>> + uint32_t *reset_dump_reg_list;
>>> + int num_regs;
>>> };
>>> static inline struct amdgpu_device *drm_to_adev(struct
>>> drm_device *ddev)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 164d6a9e9fbb..ad807350d13e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1609,6 +1609,116 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>> DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>> amdgpu_debugfs_sclk_set, "%llu\n");
>>> +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>> + char __user *buf, size_t size, loff_t *pos)
>>> +{
>>> + struct amdgpu_device *adev = (struct amdgpu_device
>>> *)file_inode(f)->i_private;
>>> + char reg_offset[11];
>>> + int i, ret, len = 0;
>>> +
>>> + if (*pos)
>>> + return 0;
>>> +
>>> + ret = down_read_killable(&adev->reset_sem);
>>
>> Using the _killable() variant is a really good idea here.
>>
>>> +
>>> + if (ret)
>>> + return ret;
>>> +
>>> + if (adev->num_regs == 0)
>>> + return 0;
>>> +
>>> + for (i = 0; i < adev->num_regs; i++) {
>>> + sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]);
>>> + ret = copy_to_user(buf + len, reg_offset, strlen(reg_offset));
>>
>> Uff, I'm not 100% sure if we can do copy_to_user without dropping the
>> lock.
>>
> then I need to use kmalloc or krealloc_array.
Alternatively you should be able to safely drop and re-take the lock.
If the array is modified while somebody reads the list it's the fault of
userspace and they can keep the broken pieces.
Just make sure that dropping the lock is after the snprintf().
Regards,
Christian.
>
> Regards,
>
> S.Amarnath
>
>>> +
>>> + if (ret)
>>> + return -EFAULT;
>>
>> But returning here without dropping the lock is certainly incorrect.
>>
>>> +
>>> + len += strlen(reg_offset);
>>> + }
>>> +
>>> + up_read(&adev->reset_sem);
>>> +
>>> + ret = copy_to_user(buf + len, "\n", 1);
>>> +
>>> + if (ret)
>>> + return -EFAULT;
>>> +
>>> + len++;
>>> + *pos += len;
>>> +
>>> + return len;
>>> +}
>>> +
>>> +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
>>> + const char __user *buf, size_t size, loff_t *pos)
>>> +{
>>> + struct amdgpu_device *adev = (struct amdgpu_device
>>> *)file_inode(f)->i_private;
>>> + char *reg_offset, *reg, reg_temp[11];
>>> + uint32_t reg_list[128];
>>> + int ret, i = 0, len = 0;
>>> +
>>> + do {
>>> + reg_offset = reg_temp;
>>> + memset(reg_offset, 0, 11);
>>> + ret = copy_from_user(reg_offset, buf + len, min(11,
>>> ((int)size-len)));
>>> +
>>> + if (ret)
>>> + return -EFAULT;
>>> +
>>> + if (strncmp(reg_offset, "clear", 5) == 0) {
>>
>> Please completely drop the idea with the clear here, that should be
>> unnecessary.
>>
>>> +
>>> + ret = down_read_killable(&adev->reset_sem);
>>> +
>>> + if (ret)
>>> + return ret;
>>> +
>>> + kfree(adev->reset_dump_reg_list);
>>> + adev->reset_dump_reg_list = NULL;
>>> + adev->num_regs = 0;
>>> +
>>> + up_read(&adev->reset_sem);
>>> +
>>> + return size;
>>> + }
>>> +
>>> + reg = strsep(®_offset, " ");
>>> + ret = kstrtouint(reg, 16, ®_list[i]);
>>> +
>>> + if (ret)
>>> + return -EFAULT;
>>> +
>>> + len += strlen(reg) + 1;
>>> + i++;
>>> +
>>> + } while (len < size);
>>> +
>>> + adev->reset_dump_reg_list =
>>> krealloc_array(adev->reset_dump_reg_list,
>>> + i, sizeof(uint32_t), GFP_KERNEL);
>>
>> Well that still doesn't looks like what we need.
>>
>> Here is once more the roughly what the code should do:
>>
>> unsigned int i;
>> uint32_t *tmp;
>>
>> i = 0;
>> do {
>> tmp = krealloc_array(tmp, i, sizeof(uint32_t), GFP_KERNEL);
>> copy_from_user().....
>> i++;
>> } while (len < size);
>>
>> down_write_killable();
>> ...
>>
>> swap(adev->reset_dump_reg_list, tmp);
>> adev->num_regs = i;
>> up_write();
>>
>> Regards,
>> Christian.
>>
>>> +
>>> + ret = down_read_killable(&adev->reset_sem);
>>> +
>>> + if (ret)
>>> + return ret;
>>> +
>>> + adev->num_regs = i;
>>> + memcpy(adev->reset_dump_reg_list, reg_list,
>>> + sizeof(uint32_t) * adev->num_regs);
>>> +
>>> + up_read(&adev->reset_sem);
>>> +
>>> + return size;
>>> +}
>>> +
>>> +
>>> +
>>> +static const struct file_operations amdgpu_reset_dump_register_list
>>> = {
>>> + .owner = THIS_MODULE,
>>> + .read = amdgpu_reset_dump_register_list_read,
>>> + .write = amdgpu_reset_dump_register_list_write,
>>> + .llseek = default_llseek
>>> +};
>>> +
>>> int amdgpu_debugfs_init(struct amdgpu_device *adev)
>>> {
>>> struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
>>> @@ -1672,6 +1782,8 @@ int amdgpu_debugfs_init(struct amdgpu_device
>>> *adev)
>>> &amdgpu_debugfs_test_ib_fops);
>>> debugfs_create_file("amdgpu_vm_info", 0444, root, adev,
>>> &amdgpu_debugfs_vm_info_fops);
>>> + debugfs_create_file("amdgpu_reset_dump_register_list", 0644,
>>> root, adev,
>>> + &amdgpu_reset_dump_register_list);
>>> adev->debugfs_vbios_blob.data = adev->bios;
>>> adev->debugfs_vbios_blob.size = adev->bios_size;
>>
More information about the amd-gfx
mailing list