[PATCH] drm/xe/debugfs: Add support for gt synchronous force reset
Poosa, Karthik
karthik.poosa at intel.com
Tue Jan 2 07:03:48 UTC 2024
Hi Anshuman,
Please find my comments inline below.
Thanks,
Karthik.
On 01-01-2024 20:07, Gupta, Anshuman wrote:
>
>> -----Original Message-----
>> From: Poosa, Karthik <karthik.poosa at intel.com>
>> Sent: Sunday, December 31, 2023 8:18 PM
>> To: intel-xe at lists.freedesktop.org
>> Cc: Gupta, Anshuman <anshuman.gupta at intel.com>; Nilawar, Badal
>> <badal.nilawar at intel.com>; Brost, Matthew <matthew.brost at intel.com>;
>> Vivi, Rodrigo <rodrigo.vivi at intel.com>; Poosa, Karthik
>> <karthik.poosa at intel.com>
>> Subject: [PATCH] drm/xe/debugfs: Add support for gt synchronous force reset
>>
>> This support added as igt test freq_reset_multiple fails sporadically in case
>> xe_guc_pc is not started.
>> A completion is added in force_reset for this.
>> Writing non-zero to force_reset debugfs entry does synchronous reset.
> We need IGT patch as well for this patch.
>> v2:
>> - Changed wait for completion to interruptible (Anshuman).
>> - Moved timeout to xe_gt.h (Anshuman).
>> - Created a debugfs for updating timeout (Rodrigo).
>>
>> v3:
>> - Update force_reset debugfs with write support.
>> value 0 -> async reset, non-zero -> sync reset (Matthew Brost).
>>
>> Signed-off-by: Karthik Poosa <karthik.poosa at intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_gt.c | 2 +
>> drivers/gpu/drm/xe/xe_gt_debugfs.c | 72
>> ++++++++++++++++++++++++++++-- drivers/gpu/drm/xe/xe_gt_debugfs.h
>> | 3 ++
>> drivers/gpu/drm/xe/xe_gt_types.h | 8 ++++
>> 4 files changed, 82 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index
>> 3af2adec1295..c8d2c18b0c62 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.c
>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>> @@ -65,6 +65,7 @@ struct xe_gt *xe_gt_alloc(struct xe_tile *tile)
>>
>> gt->tile = tile;
>> gt->ordered_wq = alloc_ordered_workqueue("gt-ordered-wq", 0);
>> + init_completion(>->reset.done);
>>
>> return gt;
>> }
>> @@ -633,6 +634,7 @@ static int gt_reset(struct xe_gt *gt)
>> xe_device_mem_access_put(gt_to_xe(gt));
>> XE_WARN_ON(err);
>>
>> + complete(>->reset.done);
>> xe_gt_info(gt, "reset done\n");
>>
>> return 0;
>> diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c
>> b/drivers/gpu/drm/xe/xe_gt_debugfs.c
>> index c4b67cf09f8f..3518ab89ccfc 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
>> +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
>> @@ -55,15 +55,71 @@ static int hw_engines(struct seq_file *m, void *data)
>> return 0;
>> }
>>
>> -static int force_reset(struct seq_file *m, void *data)
>> +static int force_reset(struct xe_gt *gt, bool is_sync)
>> {
>> - struct xe_gt *gt = node_to_gt(m->private);
>> + struct xe_device *xe = gt_to_xe(gt);
>> + long ret;
>>
>> xe_gt_reset_async(gt);
>>
>> + if (is_sync) {
>> + drm_info(&xe->drm, "waiting for gt reset completion");
>> + ret = wait_for_completion_interruptible_timeout(>-
>>> reset.done,
>> + msecs_to_jiffies(gt-
>>> reset.timeout_ms));
>> + if (ret <= 0) {
>> + drm_err(&xe->drm, "gt reset timed out/interrputed,
>> ret %ld\n", ret);
>> + return -ETIMEDOUT;
>> + }
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +ssize_t force_reset_write(struct file *f, const char __user *buf,
>> +size_t len, loff_t *offset) {
>> + struct xe_gt *gt = file_inode(f)->i_private;
>> + struct xe_device *xe = gt_to_xe(gt);
>> + char arg_str[5];
>> + size_t size;
>> + unsigned long arg;
>> + int ret;
>> +
>> + size = min(sizeof(arg_str)-1, len);
>> + arg = copy_from_user((void *)arg_str, buf, size);
>> + if (arg < 0) {
>> + drm_err(&xe->drm, "reading force_reset arg failed, ret %ld",
>> arg);
>> + return arg;
>> + }
>> + arg_str[size] = '\0';
>> +
>> + ret = kstrtoul(arg_str, 0, &arg);
>> + if (ret) {
>> + drm_err(&xe->drm, "force_reset arg parse failed, ret %d", ret);
>> + return ret;
>> + }
>> +
>> + ret = force_reset(gt, (arg && 1));
> How about !!arg
&& will use a single instruction.
but !! will happen with 2 instructions, so I think && would be better.
> > + if (ret != 0) {
>> + drm_err(&xe->drm, "force_reset failed, ret %d\n", ret);
>> + return ret;
>> + }
>> + return len;
>> +}
>> +
>> +ssize_t force_reset_read(struct file *f, char __user *buf, size_t len,
>> +loff_t *offset) {
>> + struct xe_gt *gt = file_inode(f)->i_private;
>> +
>> + force_reset(gt, 0);
>> return 0;
>> }
>>
>> +static const struct file_operations force_reset_fops = {
>> + .owner = THIS_MODULE,
>> + .write = force_reset_write,
>> + .read = force_reset_read,
>> +};
>> +
>> static int sa_info(struct seq_file *m, void *data) {
>> struct xe_tile *tile = gt_to_tile(node_to_gt(m->private));
>> @@ -192,7 +248,6 @@ static int vecs_default_lrc(struct seq_file *m, void
>> *data)
>>
>> static const struct drm_info_list debugfs_list[] = {
>> {"hw_engines", hw_engines, 0},
>> - {"force_reset", force_reset, 0},
>> {"sa_info", sa_info, 0},
>> {"topology", topology, 0},
>> {"steering", steering, 0},
>> @@ -245,5 +300,16 @@ void xe_gt_debugfs_register(struct xe_gt *gt)
>> ARRAY_SIZE(debugfs_list),
>> root, minor);
>>
>> + gt->reset.fr_dentry = debugfs_create_file("force_reset", 0600, root,
>> xe,
>> + &force_reset_fops);
> Please use DEFINE_SHOW_STORE_ATTRIBUTE here to create rw debugfs entry.
> Use force_reset_show and force_reset_restore function name.
>> + if (!gt->reset.fr_dentry)
>> + drm_warn(&xe->drm, "force_reset debugfs file creation
>> failed");
>> + d_inode(gt->reset.fr_dentry)->i_private = gt;
>> +
>> + debugfs_create_u32("gt_reset_timeout_ms", 0600, root,
>> + >->reset.timeout_ms);
>> + /* set a default timeout value */
>> + gt->reset.timeout_ms = 1000;
> xe_gt_debugfs_register() should only do the job of debugfs registration.
> This is odd here.
Shall I move this to xe_gt_alloc (xe_gt.c), where gt is allocated and
initialized.
>> +
>> xe_uc_debugfs_register(>->uc, root); } diff --git
>> a/drivers/gpu/drm/xe/xe_gt_debugfs.h
>> b/drivers/gpu/drm/xe/xe_gt_debugfs.h
>> index 5a329f118a57..40adcf1822c6 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_debugfs.h
>> +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.h
>> @@ -6,8 +6,11 @@
>> #ifndef _XE_GT_DEBUGFS_H_
>> #define _XE_GT_DEBUGFS_H_
>>
>> +#include <linux/fs.h>
>> struct xe_gt;
>>
>> void xe_gt_debugfs_register(struct xe_gt *gt);
>> +ssize_t force_reset_write(struct file *f, const char __user *buf,
>> +size_t len, loff_t *offset); ssize_t force_reset_read(struct file *f,
>> +char __user *buf, size_t len, loff_t *offset);
>>
>> #endif
>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h
>> b/drivers/gpu/drm/xe/xe_gt_types.h
>> index f74684660475..d6134215094b 100644
>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
>> @@ -148,6 +148,14 @@ struct xe_gt {
>> * code to safely flush all code paths
>> */
>> struct work_struct worker;
>> + /** @done : completion for GT reset */
>> + struct completion done;
>> + /** @timeout_ms : gt synchronous reset timeout in ms */
>> + u32 timeout_ms;
>> + /** @dentry: dentry for force_reset */
>> + struct dentry *fr_dentry;
>> + /** @is_sync_reset : gt reset is synchronous */
>> + //bool is_sync_reset;
> Remove thus unused var.
>
> Thanks,
> Anshuman Gupta.
>> } reset;
>>
>> /** @tlb_invalidation: TLB invalidation state */
>> --
>> 2.25.1
More information about the Intel-xe
mailing list