[PATCH] drm/xe: Allow to trigger GT resets using debugfs writes

John Harrison john.c.harrison at intel.com
Wed May 21 21:05:51 UTC 2025


On 5/20/2025 2:12 AM, Michal Wajdeczko wrote:
> On 20.05.2025 01:44, John Harrison wrote:
>> On 5/19/2025 1:09 PM, Michal Wajdeczko wrote:
>>> Today we allow to trigger GT resets by reading dedicated debugfs
>>> files "force_reset" and "force_reset_sync" that we are exposing
>>> using drm_info_list[] and drm_debugfs_create_files().
>>>
>>> To avoid triggering potentially disruptive actions during otherwise
>>> "safe" read operations, expose those two attributes using debugfs
>>> function where we can specify file permissions and provide custom
>>> "write" handler to trigger the GT resets also from there.
>> Would you like to extend this to the GuC log dump-via-dmesg trigger
>> entry as well? Dumping that during the IGT read debugfs test also causes
>> issues due to the huge amount of output it produces.
> yes, that was my plan as I was expecting more discussion there so I
> wanted to put that into a separate series
:)

>
> but since you already asked here, my question would be:
>
> do you prefer keeping separate write-only "guc_log_dmesg" file, or keep
> single read-write "guc_log" file that would print encoded log on read
> and dump it to dmesg on write?
I would keep the dmesg version as a totally separate thing. Its purpose 
is just for testing the dump-via-dmesg support for things like the 
CT_DEAD and other on-demand dumps from within the KMD. Whereas the 
regular 'guc_log' file is the official way to query the GuC log for 
debugging.

Longer term, the intent would be to drop the guc_log_dmesg entirely and 
replace it with a devcoredump_to_dmesg file instead. Given that the 
intent is to drop the fake coredump via dmesg that CT_DEAD currently 
uses and replace with a real core dump once I can get back to fixing up 
the core dump to not require a DRM job, support multiple GTs, etc.


>
> also do we want to keep 0444 mode for "guc_log" or change it to 0400
> mode (or 0600 if it would be dual-purpose)
It is debugfs so requires a debug enabled kernel in the first place. I 
assume that someone who is concerned about security and end user privacy 
in an actual product would not have debugfs enabled. But sure, we can 
drop it to 0400 if you think being public is an issue.

>
>>> This step would allow us to drop triggering GT resets during read
>>> operations, which we leave just to give users more time to switch.
>> I'm not following this sentence. Give users more time to switch what?
> switch to use "echo 1 > force_reset" instead of "cat force_reset"
>
>> Also, would it be more accurate to call the patch set something like
>> 'require a write to trigger GT resets via debugfs'. Saying 'allow'
>> implies you are just extending the existing interface rather than
>> removing the trigger-via-read ability.
> this patch extends the existing interface, the legacy way to trigger a
> reset by read is still there (due to IGT) - see force_reset_show()
>
> once the IGT will be updated, we would make those files write-only
> I didn't want to introduce any CI regression right now, hence two steps
Got you.

I would just go straight for the intended version in one patch and 
include a link to the corresponding IGT patch to update the tests. But a 
two step approach works, as long as it doesn't get forgotten and step 
two never happens!

John.

>
>> John.
>>
>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
>>> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>>> ---
>>>    drivers/gpu/drm/xe/xe_gt_debugfs.c | 96 +++++++++++++++++++++++-------
>>>    1 file changed, 76 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c b/drivers/gpu/drm/xe/
>>> xe_gt_debugfs.c
>>> index 119a55bb7580..848618acdca8 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
>>> +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
>>> @@ -122,24 +122,6 @@ static int powergate_info(struct xe_gt *gt,
>>> struct drm_printer *p)
>>>        return ret;
>>>    }
>>>    -static int force_reset(struct xe_gt *gt, struct drm_printer *p)
>>> -{
>>> -    xe_pm_runtime_get(gt_to_xe(gt));
>>> -    xe_gt_reset_async(gt);
>>> -    xe_pm_runtime_put(gt_to_xe(gt));
>>> -
>>> -    return 0;
>>> -}
>>> -
>>> -static int force_reset_sync(struct xe_gt *gt, struct drm_printer *p)
>>> -{
>>> -    xe_pm_runtime_get(gt_to_xe(gt));
>>> -    xe_gt_reset(gt);
>>> -    xe_pm_runtime_put(gt_to_xe(gt));
>>> -
>>> -    return 0;
>>> -}
>>> -
>>>    static int sa_info(struct xe_gt *gt, struct drm_printer *p)
>>>    {
>>>        struct xe_tile *tile = gt_to_tile(gt);
>>> @@ -306,8 +288,6 @@ static int hwconfig(struct xe_gt *gt, struct
>>> drm_printer *p)
>>>     * - without access to the PF specific data
>>>     */
>>>    static const struct drm_info_list vf_safe_debugfs_list[] = {
>>> -    {"force_reset", .show = xe_gt_debugfs_simple_show, .data =
>>> force_reset},
>>> -    {"force_reset_sync", .show = xe_gt_debugfs_simple_show, .data =
>>> force_reset_sync},
>>>        {"sa_info", .show = xe_gt_debugfs_simple_show, .data = sa_info},
>>>        {"topology", .show = xe_gt_debugfs_simple_show, .data = topology},
>>>        {"ggtt", .show = xe_gt_debugfs_simple_show, .data = ggtt},
>>> @@ -332,6 +312,78 @@ static const struct drm_info_list
>>> pf_only_debugfs_list[] = {
>>>        {"steering", .show = xe_gt_debugfs_simple_show, .data = steering},
>>>    };
>>>    +static ssize_t write_to_gt_call(const char __user *userbuf, size_t
>>> count, loff_t *ppos,
>>> +                void (*call)(struct xe_gt *), struct xe_gt *gt)
>>> +{
>>> +    bool yes;
>>> +    int ret;
>>> +
>>> +    if (*ppos)
>>> +        return -EINVAL;
>>> +    ret = kstrtobool_from_user(userbuf, count, &yes);
>>> +    if (ret < 0)
>>> +        return ret;
>>> +    if (yes)
>>> +        call(gt);
>>> +    return count;
>>> +}
>>> +
>>> +static void force_reset(struct xe_gt *gt)
>>> +{
>>> +    struct xe_device *xe = gt_to_xe(gt);
>>> +
>>> +    xe_pm_runtime_get(xe);
>>> +    xe_gt_reset_async(gt);
>>> +    xe_pm_runtime_put(xe);
>>> +}
>>> +
>>> +static ssize_t force_reset_write(struct file *file,
>>> +                 const char __user *userbuf,
>>> +                 size_t count, loff_t *ppos)
>>> +{
>>> +    struct seq_file *s = file->private_data;
>>> +    struct xe_gt *gt = s->private;
>>> +
>>> +    return write_to_gt_call(userbuf, count, ppos, force_reset, gt);
>>> +}
>>> +
>>> +static int force_reset_show(struct seq_file *s, void *unused)
>>> +{
>>> +    struct xe_gt *gt = s->private;
>>> +
>>> +    force_reset(gt); /* to be deprecated! */
>>> +    return 0;
>>> +}
>>> +DEFINE_SHOW_STORE_ATTRIBUTE(force_reset);
>>> +
>>> +static void force_reset_sync(struct xe_gt *gt)
>>> +{
>>> +    struct xe_device *xe = gt_to_xe(gt);
>>> +
>>> +    xe_pm_runtime_get(xe);
>>> +    xe_gt_reset(gt);
>>> +    xe_pm_runtime_put(xe);
>>> +}
>>> +
>>> +static ssize_t force_reset_sync_write(struct file *file,
>>> +                      const char __user *userbuf,
>>> +                      size_t count, loff_t *ppos)
>>> +{
>>> +    struct seq_file *s = file->private_data;
>>> +    struct xe_gt *gt = s->private;
>>> +
>>> +    return write_to_gt_call(userbuf, count, ppos, force_reset_sync, gt);
>>> +}
>>> +
>>> +static int force_reset_sync_show(struct seq_file *s, void *unused)
>>> +{
>>> +    struct xe_gt *gt = s->private;
>>> +
>>> +    force_reset_sync(gt); /* to be deprecated! */
>>> +    return 0;
>>> +}
>>> +DEFINE_SHOW_STORE_ATTRIBUTE(force_reset_sync);
>>> +
>>>    void xe_gt_debugfs_register(struct xe_gt *gt)
>>>    {
>>>        struct xe_device *xe = gt_to_xe(gt);
>>> @@ -355,6 +407,10 @@ void xe_gt_debugfs_register(struct xe_gt *gt)
>>>         */
>>>        root->d_inode->i_private = gt;
>>>    +    /* VF safe */
>>> +    debugfs_create_file("force_reset", 0600, root, gt,
>>> &force_reset_fops);
>>> +    debugfs_create_file("force_reset_sync", 0600, root, gt,
>>> &force_reset_sync_fops);
>>> +
>>>        drm_debugfs_create_files(vf_safe_debugfs_list,
>>>                     ARRAY_SIZE(vf_safe_debugfs_list),
>>>                     root, minor);



More information about the Intel-xe mailing list