[PATCH] drm/xe/debugfs: Add support for gt synchronous force reset

Jani Nikula jani.nikula at linux.intel.com
Tue Jan 2 16:05:49 UTC 2024


On Tue, 02 Jan 2024, Mika Kuoppala <mika.kuoppala at linux.intel.com> wrote:
> "Poosa, Karthik" <karthik.poosa at intel.com> writes:
>
>> Hi Anshuman,
>>
>> Please find my comments inline below.
>>
>> Thanks,
>>
>> Karthik.
>>
>> On 01-01-2024 20:07, Gupta, Anshuman wrote:
>>>
>>>> -----Original Message-----
>>>> From: Poosa, Karthik <karthik.poosa at intel.com>
>>>> Sent: Sunday, December 31, 2023 8:18 PM
>>>> To: intel-xe at lists.freedesktop.org
>>>> Cc: Gupta, Anshuman <anshuman.gupta at intel.com>; Nilawar, Badal
>>>> <badal.nilawar at intel.com>; Brost, Matthew <matthew.brost at intel.com>;
>>>> Vivi, Rodrigo <rodrigo.vivi at intel.com>; Poosa, Karthik
>>>> <karthik.poosa at intel.com>
>>>> Subject: [PATCH] drm/xe/debugfs: Add support for gt synchronous force reset
>>>>
>>>> This support added as igt test freq_reset_multiple fails sporadically in case
>>>> xe_guc_pc is not started.
>>>> A completion is added in force_reset for this.
>>>> Writing non-zero to force_reset debugfs entry does synchronous reset.
>>> We need IGT patch as well for this patch.
>>>> v2:
>>>> - Changed wait for completion to interruptible (Anshuman).
>>>> - Moved timeout to xe_gt.h (Anshuman).
>>>> - Created a debugfs for updating timeout (Rodrigo).
>>>>
>>>> v3:
>>>> - Update force_reset debugfs with write support.
>>>>    value 0 -> async reset, non-zero -> sync reset (Matthew Brost).
>>>>
>>>> Signed-off-by: Karthik Poosa <karthik.poosa at intel.com>
>>>> ---
>>>>   drivers/gpu/drm/xe/xe_gt.c         |  2 +
>>>>   drivers/gpu/drm/xe/xe_gt_debugfs.c | 72
>>>> ++++++++++++++++++++++++++++--  drivers/gpu/drm/xe/xe_gt_debugfs.h
>>>> |  3 ++
>>>>   drivers/gpu/drm/xe/xe_gt_types.h   |  8 ++++
>>>>   4 files changed, 82 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index
>>>> 3af2adec1295..c8d2c18b0c62 100644
>>>> --- a/drivers/gpu/drm/xe/xe_gt.c
>>>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>>>> @@ -65,6 +65,7 @@ struct xe_gt *xe_gt_alloc(struct xe_tile *tile)
>>>>
>>>>   	gt->tile = tile;
>>>>   	gt->ordered_wq = alloc_ordered_workqueue("gt-ordered-wq", 0);
>>>> +	init_completion(&gt->reset.done);
>>>>
>>>>   	return gt;
>>>>   }
>>>> @@ -633,6 +634,7 @@ static int gt_reset(struct xe_gt *gt)
>>>>   	xe_device_mem_access_put(gt_to_xe(gt));
>>>>   	XE_WARN_ON(err);
>>>>
>>>> +	complete(&gt->reset.done);
>>>>   	xe_gt_info(gt, "reset done\n");
>>>>
>>>>   	return 0;
>>>> diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c
>>>> b/drivers/gpu/drm/xe/xe_gt_debugfs.c
>>>> index c4b67cf09f8f..3518ab89ccfc 100644
>>>> --- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
>>>> +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
>>>> @@ -55,15 +55,71 @@ static int hw_engines(struct seq_file *m, void *data)
>>>>   	return 0;
>>>>   }
>>>>
>>>> -static int force_reset(struct seq_file *m, void *data)
>>>> +static int force_reset(struct xe_gt *gt, bool is_sync)
>>>>   {
>>>> -	struct xe_gt *gt = node_to_gt(m->private);
>>>> +	struct xe_device *xe = gt_to_xe(gt);
>>>> +	long ret;
>>>>
>>>>   	xe_gt_reset_async(gt);
>>>>
>>>> +	if (is_sync) {
>>>> +		drm_info(&xe->drm, "waiting for gt reset completion");
>>>> +		ret = wait_for_completion_interruptible_timeout(&gt-
>>>>> reset.done,
>>>> +							msecs_to_jiffies(gt-
>>>>> reset.timeout_ms));
>>>> +		if (ret <= 0) {
>>>> +			drm_err(&xe->drm, "gt reset timed out/interrputed,
>>>> ret %ld\n", ret);
>>>> +			return -ETIMEDOUT;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +ssize_t force_reset_write(struct file *f, const char __user *buf,
>>>> +size_t len, loff_t *offset) {
>>>> +	struct xe_gt *gt = file_inode(f)->i_private;
>>>> +	struct xe_device *xe = gt_to_xe(gt);
>>>> +	char arg_str[5];
>>>> +	size_t size;
>>>> +	unsigned long arg;
>>>> +	int ret;
>>>> +
>>>> +	size = min(sizeof(arg_str)-1, len);
>>>> +	arg = copy_from_user((void *)arg_str, buf, size);
>>>> +	if (arg < 0) {
>>>> +		drm_err(&xe->drm, "reading force_reset arg failed, ret %ld",
>>>> arg);
>>>> +		return arg;
>>>> +	}
>>>> +	arg_str[size] = '\0';
>>>> +
>>>> +	ret = kstrtoul(arg_str, 0, &arg);
>>>> +	if (ret) {
>>>> +		drm_err(&xe->drm, "force_reset arg parse failed, ret %d", ret);
>>>> +		return ret;
>>>> +	}
>>>> +
>>>> +	ret = force_reset(gt, (arg && 1));
>>> How about  !!arg
>> && will use a single instruction.
>>
>> but !! will happen with 2 instructions, so I think && would be better.
>
> This is not in a hotpath, more readable is better.

And most readable is just passing arg. The param is bool. Let the
compiler do its job.

> -Mika
>
>
>>
>>>   > +	if (ret != 0)  {
>>>> +		drm_err(&xe->drm, "force_reset failed, ret %d\n", ret);
>>>> +		return ret;
>>>> +	}
>>>> +	return len;
>>>> +}
>>>> +
>>>> +ssize_t force_reset_read(struct file *f, char __user *buf, size_t len,
>>>> +loff_t *offset) {
>>>> +	struct xe_gt *gt = file_inode(f)->i_private;
>>>> +
>>>> +	force_reset(gt, 0);
>>>>   	return 0;
>>>>   }
>>>>
>>>> +static const struct file_operations force_reset_fops = {
>>>> +	.owner = THIS_MODULE,
>>>> +	.write = force_reset_write,
>>>> +	.read = force_reset_read,
>>>> +};
>>>> +
>>>>   static int sa_info(struct seq_file *m, void *data)  {
>>>>   	struct xe_tile *tile = gt_to_tile(node_to_gt(m->private));
>>>> @@ -192,7 +248,6 @@ static int vecs_default_lrc(struct seq_file *m, void
>>>> *data)
>>>>
>>>>   static const struct drm_info_list debugfs_list[] = {
>>>>   	{"hw_engines", hw_engines, 0},
>>>> -	{"force_reset", force_reset, 0},
>>>>   	{"sa_info", sa_info, 0},
>>>>   	{"topology", topology, 0},
>>>>   	{"steering", steering, 0},
>>>> @@ -245,5 +300,16 @@ void xe_gt_debugfs_register(struct xe_gt *gt)
>>>>   				 ARRAY_SIZE(debugfs_list),
>>>>   				 root, minor);
>>>>
>>>> +	gt->reset.fr_dentry = debugfs_create_file("force_reset", 0600, root,
>>>> xe,
>>>> +			    &force_reset_fops);
>>> Please use DEFINE_SHOW_STORE_ATTRIBUTE here to create rw debugfs entry.
>>> Use force_reset_show and force_reset_restore function name.
>>>> +	if (!gt->reset.fr_dentry)
>>>> +		drm_warn(&xe->drm, "force_reset debugfs file creation
>>>> failed");
>>>> +	d_inode(gt->reset.fr_dentry)->i_private = gt;
>>>> +
>>>> +	debugfs_create_u32("gt_reset_timeout_ms", 0600, root,
>>>> +					&gt->reset.timeout_ms);
>>>> +	/* set a default timeout value */
>>>> +	gt->reset.timeout_ms = 1000;
>>> xe_gt_debugfs_register() should only do the job of debugfs registration.
>>> This is odd here.
>>
>> Shall I move this to xe_gt_alloc (xe_gt.c), where gt is allocated and 
>> initialized.
>>
>>>> +
>>>>   	xe_uc_debugfs_register(&gt->uc, root);  } diff --git
>>>> a/drivers/gpu/drm/xe/xe_gt_debugfs.h
>>>> b/drivers/gpu/drm/xe/xe_gt_debugfs.h
>>>> index 5a329f118a57..40adcf1822c6 100644
>>>> --- a/drivers/gpu/drm/xe/xe_gt_debugfs.h
>>>> +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.h
>>>> @@ -6,8 +6,11 @@
>>>>   #ifndef _XE_GT_DEBUGFS_H_
>>>>   #define _XE_GT_DEBUGFS_H_
>>>>
>>>> +#include <linux/fs.h>
>>>>   struct xe_gt;
>>>>
>>>>   void xe_gt_debugfs_register(struct xe_gt *gt);
>>>> +ssize_t force_reset_write(struct file *f, const char __user *buf,
>>>> +size_t len, loff_t *offset); ssize_t force_reset_read(struct file *f,
>>>> +char __user *buf, size_t len, loff_t *offset);
>>>>
>>>>   #endif
>>>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h
>>>> b/drivers/gpu/drm/xe/xe_gt_types.h
>>>> index f74684660475..d6134215094b 100644
>>>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
>>>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
>>>> @@ -148,6 +148,14 @@ struct xe_gt {
>>>>   		 * code to safely flush all code paths
>>>>   		 */
>>>>   		struct work_struct worker;
>>>> +		/** @done : completion for GT reset */
>>>> +		struct completion done;
>>>> +		/** @timeout_ms : gt synchronous reset timeout in ms */
>>>> +		u32 timeout_ms;
>>>> +		/** @dentry: dentry for force_reset */
>>>> +		struct dentry *fr_dentry;
>>>> +		/** @is_sync_reset : gt reset is synchronous */
>>>> +		//bool is_sync_reset;
>>> Remove thus unused var.
>>>
>>> Thanks,
>>> Anshuman Gupta.
>>>>   	} reset;
>>>>
>>>>   	/** @tlb_invalidation: TLB invalidation state */
>>>> --
>>>> 2.25.1

-- 
Jani Nikula, Intel


More information about the Intel-xe mailing list