[PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level

Andrey Grodzovsky andrey.grodzovsky at amd.com
Mon Jul 25 21:07:22 UTC 2022


On 2022-07-25 13:37, Christian König wrote:
> Hi Victor,
>
> Am 25.07.22 um 12:45 schrieb Zhao, Victor:
>> [AMD Official Use Only - General]
>>
>> Hi @Grodzovsky, Andrey,
>>
>> Please help review the series, thanks a lot.
>>
>> Hi @Koenig, Christian,
>>
>> I thought a module parameter will be exposed to a common user, this 
>> control was added to help debug and test so I put it as a debugfs. I 
>> can make it module parameter if more appropriate.
>
> That's a really good argument. I leave the decision if we should use a 
> module parameter or debugfs file to Andrey.
>
> If you go with debugfs then using the debugfs_create_u32() or 
> debugfs_create_u64() function would be more appropriate I think. And 
> then don't make that global, but rather a per device flag.
>
> Regards,
> Christian.


Makes sense to me too.

Andrey


>
>>
>>
>> Thanks,
>> Victor
>>
>>
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig at amd.com>
>> Sent: Friday, July 22, 2022 4:20 PM
>> To: Zhao, Victor <Victor.Zhao at amd.com>; amd-gfx at lists.freedesktop.org
>> Cc: Deng, Emily <Emily.Deng at amd.com>; Deucher, Alexander 
>> <Alexander.Deucher at amd.com>
>> Subject: Re: [PATCH 2/5] drm/amdgpu: add debugfs amdgpu_reset_level
>>
>> Well NAK to the debugfs approach, stuff like that is usually a module 
>> parameter.
>>
>> Apart from that this series needs to be reviewed by Andrey.
>>
>> Regards,
>> Christian.
>>
>> Am 22.07.22 um 09:34 schrieb Victor Zhao:
>>> Introduce amdgpu_reset_level debugfs in order to help debug and test
>>> specific type of reset. Also helps blocking unwanted type of resets.
>>>
>>> By default, mode2 reset will not be enabled
>>>
>>> Signed-off-by: Victor Zhao <Victor.Zhao at amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  4 ++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 20 
>>> ++++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c     |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c   |  6 ++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    |  3 +++
>>>    5 files changed, 34 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 6cd1e0a6dffc..c661231a6a07 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -238,6 +238,7 @@ extern int amdgpu_si_support;
>>>    extern int amdgpu_cik_support;
>>>    #endif
>>>    extern int amdgpu_num_kcq;
>>> +extern uint amdgpu_reset_level_mask;
>>>       #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024)
>>>    extern int amdgpu_vcnfw_log;
>>> @@ -274,6 +275,9 @@ extern int amdgpu_vcnfw_log;
>>>    #define AMDGPU_RESET_VCE            (1 << 13)
>>>    #define AMDGPU_RESET_VCE1            (1 << 14)
>>>    +#define AMDGPU_RESET_LEVEL_SOFT_RECOVERY (1 << 0) #define
>>> +AMDGPU_RESET_LEVEL_MODE2 (1 << 1)
>>> +
>>>    /* max cursor sizes (in pixels) */
>>>    #define CIK_CURSOR_WIDTH 128
>>>    #define CIK_CURSOR_HEIGHT 128
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index e2eec985adb3..235c48e4ba4d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1661,12 +1661,29 @@ static int amdgpu_debugfs_sclk_set(void 
>>> *data, u64 val)
>>>        return ret;
>>>    }
>>>    +static int amdgpu_debugfs_reset_level_get(void *data, u64 *val) {
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)data;
>>> +    *val = amdgpu_reset_level_mask;
>>> +    return 0;
>>> +}
>>> +
>>> +static int amdgpu_debugfs_reset_level_set(void *data, u64 val) {
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)data;
>>> +    amdgpu_reset_level_mask = val;
>>> +    return 0;
>>> +}
>>> +
>>>    DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>>>                amdgpu_debugfs_ib_preempt, "%llu\n");
>>>       DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>>>                amdgpu_debugfs_sclk_set, "%llu\n");
>>>    +DEFINE_DEBUGFS_ATTRIBUTE(fops_reset_level, 
>>> amdgpu_debugfs_reset_level_get,
>>> +            amdgpu_debugfs_reset_level_set, "%llu\n");
>>> +
>>>    static ssize_t amdgpu_reset_dump_register_list_read(struct file *f,
>>>                    char __user *buf, size_t size, loff_t *pos)
>>>    {
>>> @@ -1785,6 +1802,9 @@ int amdgpu_debugfs_init(struct amdgpu_device 
>>> *adev)
>>>            return PTR_ERR(ent);
>>>        }
>>>    +    debugfs_create_file("amdgpu_reset_level", 0200, root, adev,
>>> +                  &fops_reset_level);
>>> +
>>>        /* Register debugfs entries for amdgpu_ttm */
>>>        amdgpu_ttm_debugfs_init(adev);
>>>        amdgpu_debugfs_pm_init(adev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index e8c6c3fe9374..fb8f3cb853a7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -198,6 +198,7 @@ struct amdgpu_watchdog_timer 
>>> amdgpu_watchdog_timer = {
>>>        .timeout_fatal_disable = false,
>>>        .period = 0x0, /* default to 0x0 (timeout disable) */
>>>    };
>>> +uint amdgpu_reset_level_mask = 0x1;
>>>       /**
>>>     * DOC: vramlimit (int)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> index 831fb222139c..f16ab1a54b70 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
>>> @@ -74,6 +74,9 @@ int amdgpu_reset_prepare_hwcontext(struct 
>>> amdgpu_device *adev,
>>>    {
>>>        struct amdgpu_reset_handler *reset_handler = NULL;
>>>    +    if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
>>> +        return -ENOSYS;
>>> +
>>>        if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>>>            return -ENOSYS;
>>>    @@ -93,6 +96,9 @@ int amdgpu_reset_perform_reset(struct 
>>> amdgpu_device *adev,
>>>        int ret;
>>>        struct amdgpu_reset_handler *reset_handler = NULL;
>>>    +    if (!(amdgpu_reset_level_mask & AMDGPU_RESET_LEVEL_MODE2))
>>> +        return -ENOSYS;
>>> +
>>>        if (test_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags))
>>>            return -ENOSYS;
>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> index d3558c34d406..1ffdc050a077 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
>>> @@ -405,6 +405,9 @@ bool amdgpu_ring_soft_recovery(struct 
>>> amdgpu_ring *ring, unsigned int vmid,
>>>    {
>>>        ktime_t deadline = ktime_add_us(ktime_get(), 10000);
>>>    +    if (!(amdgpu_reset_level_mask & 
>>> AMDGPU_RESET_LEVEL_SOFT_RECOVERY))
>>> +        return false;
>>> +
>>>        if (amdgpu_sriov_vf(ring->adev) || 
>>> !ring->funcs->soft_recovery || !fence)
>>>            return false;
>


More information about the amd-gfx mailing list