[PATCH v10 4/8] drm/xe/eustall: Add support to read() and poll() EU stall data

Harish Chegondi harish.chegondi at intel.com
Wed Feb 19 19:43:19 UTC 2025


Hi Ashutosh,

On Wed, Feb 19, 2025 at 10:15:52AM -0800, Dixit, Ashutosh wrote:

> On Tue, 18 Feb 2025 11:53:54 -0800, Harish Chegondi wrote:
> >
> > @@ -39,7 +40,9 @@ struct per_xecore_buf {
> >  };
> >
> >  struct xe_eu_stall_data_stream {
> > +	bool pollin;
> >	bool enabled;
> > +	wait_queue_head_t poll_wq;
> >	size_t data_record_size;
> >	size_t per_xecore_buf_size;
> >	unsigned int wait_num_reports;
> > @@ -47,7 +50,11 @@ struct xe_eu_stall_data_stream {
> >
> >	struct xe_gt *gt;
> >	struct xe_bo *bo;
> > +	/* Lock to protect xecore_buf */
> > +	struct mutex buf_lock;
> 
> Why do we need this new lock? I thought we would just be able to use
> gt->eu_stall->stream_lock? stream_lock is already taken for read(), so we
> just need to take it for eu_stall_data_buf_poll()?

I started off using gt->eu_stall->stream_lock, but while testing I saw
lockdep warnings in dmesg about a possible circular locking dependency
that could lead to a deadlock. I may spend more time later to investigate
and eliminate that circular dependency, but for now I decided to use a
new lock, which also eliminates the per-subslice lock. Here is the dmesg
log I saved from my testing for later investigation; a short sketch of
the two code paths involved follows the trace.

[17606.848776] ======================================================
[17606.848781] WARNING: possible circular locking dependency detected
[17606.848786] 6.13.0-upstream #3 Not tainted
[17606.848791] ------------------------------------------------------
[17606.848796] xe_eu_stall/21899 is trying to acquire lock:
[17606.848801] ffff88810daad948 ((wq_completion)xe_eu_stall){+.+.}-{0:0}, at: touch_wq_lockdep_map+0x21/0x80
[17606.848822]
               but task is already holding lock:
[17606.848827] ffff88810d0786a8 (&gt->eu_stall->stream_lock){+.+.}-{4:4}, at: xe_eu_stall_stream_close+0x27/0x70 [xe]
[17606.848903]
               which lock already depends on the new lock.

[17606.848909]
               the existing dependency chain (in reverse order) is:
[17606.848915]
               -> #2 (&gt->eu_stall->stream_lock){+.+.}-{4:4}:
[17606.848925]        __mutex_lock+0xb4/0xeb0
[17606.848934]        eu_stall_data_buf_poll+0x42/0x180 [xe]
[17606.848989]        eu_stall_data_buf_poll_work_fn+0x15/0x60 [xe]
[17606.849042]        process_one_work+0x207/0x640
[17606.849051]        worker_thread+0x18c/0x330
[17606.849058]        kthread+0xeb/0x120
[17606.849065]        ret_from_fork+0x2c/0x50
[17606.849073]        ret_from_fork_asm+0x1a/0x30
[17606.849081]
               -> #1 ((work_completion)(&(&stream->buf_poll_work)->work)){+.+.}-{0:0}:
[17606.849092]        process_one_work+0x1e3/0x640
[17606.849100]        worker_thread+0x18c/0x330
[17606.849107]        kthread+0xeb/0x120
[17606.849113]        ret_from_fork+0x2c/0x50
[17606.849120]        ret_from_fork_asm+0x1a/0x30
[17606.849126]
               -> #0 ((wq_completion)xe_eu_stall){+.+.}-{0:0}:
[17606.849134]        __lock_acquire+0x167c/0x27d0
[17606.849141]        lock_acquire+0xd5/0x300
[17606.849148]        touch_wq_lockdep_map+0x36/0x80
[17606.849155]        __flush_workqueue+0x7e/0x4a0
[17606.849163]        drain_workqueue+0x92/0x130
[17606.849170]        destroy_workqueue+0x55/0x380
[17606.849177]        xe_eu_stall_data_buf_destroy+0x11/0x50 [xe]
[17606.849220]        xe_eu_stall_stream_close+0x37/0x70 [xe]
[17606.849259]        __fput+0xed/0x2b0
[17606.849264]        __x64_sys_close+0x37/0x80
[17606.849271]        do_syscall_64+0x68/0x140
[17606.849276]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[17606.849286]
               other info that might help us debug this:

[17606.849294] Chain exists of:
                 (wq_completion)xe_eu_stall --> (work_completion)(&(&stream->buf_poll_work)->work) --> &gt->eu_stall->stream_lock

[17606.849312]  Possible unsafe locking scenario:

[17606.849318]        CPU0                    CPU1
[17606.849323]        ----                    ----
[17606.849328]   lock(&gt->eu_stall->stream_lock);
[17606.849334]                                lock((work_completion)(&(&stream->buf_poll_work)->work));
[17606.849344]                                lock(&gt->eu_stall->stream_lock);
[17606.849352]   lock((wq_completion)xe_eu_stall);
[17606.849359]
                *** DEADLOCK ***

[17606.849365] 1 lock held by xe_eu_stall/21899:
[17606.849371]  #0: ffff88810d0786a8 (&gt->eu_stall->stream_lock){+.+.}-{4:4}, at: xe_eu_stall_stream_close+0x27/0x70 [xe]
[17606.849430]
               stack backtrace:
[17606.849435] CPU: 3 UID: 0 PID: 21899 Comm: xe_eu_stall Not tainted 6.13.0-upstream #3
[17606.849445] Hardware name: Intel Corporation Lunar Lake Client Platform/LNL-M LP5 RVP1, BIOS LNLMFWI1.R00.3220.D89.2407012051 07/01/2024
[17606.849457] Call Trace:
[17606.849461]  <TASK>
[17606.849465]  dump_stack_lvl+0x82/0xd0
[17606.849473]  print_circular_bug+0x2d2/0x410
[17606.849482]  check_noncircular+0x15d/0x180
[17606.849492]  __lock_acquire+0x167c/0x27d0
[17606.849500]  lock_acquire+0xd5/0x300
[17606.849507]  ? touch_wq_lockdep_map+0x21/0x80
[17606.849515]  ? lockdep_init_map_type+0x4b/0x260
[17606.849522]  ? touch_wq_lockdep_map+0x21/0x80
[17606.849529]  touch_wq_lockdep_map+0x36/0x80
[17606.849536]  ? touch_wq_lockdep_map+0x21/0x80
[17606.849544]  __flush_workqueue+0x7e/0x4a0
[17606.849551]  ? find_held_lock+0x2b/0x80
[17606.849561]  drain_workqueue+0x92/0x130
[17606.849569]  destroy_workqueue+0x55/0x380
[17606.849577]  xe_eu_stall_data_buf_destroy+0x11/0x50 [xe]
[17606.849627]  xe_eu_stall_stream_close+0x37/0x70 [xe]
[17606.849678]  __fput+0xed/0x2b0
[17606.849683]  __x64_sys_close+0x37/0x80
[17606.849691]  do_syscall_64+0x68/0x140
[17606.849698]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[17606.849706] RIP: 0033:0x7fdc81b14f67
[17606.849712] Code: ff e8 0d 16 02 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 73 ba f7 ff
[17606.849728] RSP: 002b:00007fffd2bd7e58 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
[17606.849738] RAX: ffffffffffffffda RBX: 0000559f7fa08100 RCX: 00007fdc81b14f67
[17606.849746] RDX: 0000000000000000 RSI: 0000000000006901 RDI: 0000000000000004
[17606.849754] RBP: 00007fdc7d40bc90 R08: 0000000000000000 R09: 000000007fffffff
[17606.849762] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[17606.849770] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
[17606.849783]  </TASK>
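
To restate the chain in the trace above, here is a condensed sketch of
the two paths as they looked when the poll work took stream_lock
(abbreviated for illustration, not the literal driver code):

/* Path A: stream close; destroy_workqueue() runs under stream_lock. */
static void stream_close_sketch(struct xe_eu_stall_data_stream *stream)
{
	struct xe_gt *gt = stream->gt;

	mutex_lock(&gt->eu_stall->stream_lock);		/* lock #2 */
	/*
	 * Draining the workqueue acquires the (wq_completion)xe_eu_stall
	 * lockdep map (#0) and waits for the pending buf_poll_work (#1)
	 * to complete.
	 */
	destroy_workqueue(stream->buf_poll_wq);
	mutex_unlock(&gt->eu_stall->stream_lock);
}

/* Path B: the polling work item executing on that same workqueue. */
static void buf_poll_work_fn_sketch(struct work_struct *work)
{
	struct xe_eu_stall_data_stream *stream =
		container_of(work, struct xe_eu_stall_data_stream,
			     buf_poll_work.work);
	struct xe_gt *gt = stream->gt;

	/* The worker already holds the work_completion map (#1) here. */
	mutex_lock(&gt->eu_stall->stream_lock);	/* lock #2, taken under #1 */
	/* ... check the per-XeCore buffer write pointers ... */
	mutex_unlock(&gt->eu_stall->stream_lock);
}

If close() holds stream_lock while draining the workqueue, and the work
item needs stream_lock to finish, neither side can make progress, which
is the deadlock lockdep is warning about.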


> 
> > @@ -357,16 +580,26 @@ static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
> >			  max_wait_num_reports);
> >		return -EINVAL;
> >	}
> > +
> > +	init_waitqueue_head(&stream->poll_wq);
> > +	INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
> > +	mutex_init(&stream->buf_lock);
> > +	stream->buf_poll_wq = alloc_ordered_workqueue("xe_eu_stall", 0);
> > +	if (!stream->buf_poll_wq)
> > +		return -ENOMEM;
> >	stream->per_xecore_buf_size = per_xecore_buf_size;
> >	stream->sampling_rate_mult = props->sampling_rate_mult;
> >	stream->wait_num_reports = props->wait_num_reports;
> >	stream->data_record_size = xe_eu_stall_data_record_size(gt_to_xe(gt));
> >	stream->xecore_buf = kcalloc(last_xecore, sizeof(*stream->xecore_buf), GFP_KERNEL);
> > -	if (!stream->xecore_buf)
> > +	if (!stream->xecore_buf) {
> > +		destroy_workqueue(stream->buf_poll_wq);
> >		return -ENOMEM;
> > +	}
> >
> >	ret = xe_eu_stall_data_buf_alloc(stream, last_xecore);
> >	if (ret) {
> > +		destroy_workqueue(stream->buf_poll_wq);
> >		kfree(stream->xecore_buf);
> 
> OK, won't block on this, but error unwinding is cleaner with label's and
> goto's.
> 
> Also, if stream->buf_lock is needed, mutex_destroy is also needed for error
> unwinding and also during stream close.
Okay. But before we free the stream, does it really matter to call
mutex_destroy() on stream->buf_lock?
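
For reference, roughly what the label/goto unwinding could look like
against the snippet above (untested sketch; the label names and the
local ret are mine, and it includes the mutex_destroy() you mention):

	init_waitqueue_head(&stream->poll_wq);
	INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
	mutex_init(&stream->buf_lock);

	stream->buf_poll_wq = alloc_ordered_workqueue("xe_eu_stall", 0);
	if (!stream->buf_poll_wq) {
		ret = -ENOMEM;
		goto err_destroy_mutex;
	}

	/* per_xecore_buf_size, sampling_rate_mult, etc. as in the patch */

	stream->xecore_buf = kcalloc(last_xecore, sizeof(*stream->xecore_buf),
				     GFP_KERNEL);
	if (!stream->xecore_buf) {
		ret = -ENOMEM;
		goto err_destroy_wq;
	}

	ret = xe_eu_stall_data_buf_alloc(stream, last_xecore);
	if (ret)
		goto err_free_xecore_buf;

	/* ... rest of the function ... */

	return 0;

err_free_xecore_buf:
	kfree(stream->xecore_buf);
err_destroy_wq:
	destroy_workqueue(stream->buf_poll_wq);
err_destroy_mutex:
	mutex_destroy(&stream->buf_lock);
	return ret;
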
> 
> > diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
> > index d5281de04d54..1cc6bfc34ccb 100644
> > --- a/drivers/gpu/drm/xe/xe_trace.h
> > +++ b/drivers/gpu/drm/xe/xe_trace.h
> > @@ -427,6 +427,39 @@ DEFINE_EVENT(xe_pm_runtime, xe_pm_runtime_get_ioctl,
> >	     TP_ARGS(xe, caller)
> >  );
> >
> > +TRACE_EVENT(xe_eu_stall_data_read,
> > +	    TP_PROTO(u8 slice, u8 subslice,
> > +		     u32 read_ptr, u32 write_ptr,
> > +		     u32 read_offset, u32 write_offset,
> > +		     size_t total_size),
> > +	    TP_ARGS(slice, subslice, read_ptr, write_ptr,
> > +		    read_offset, write_offset, total_size),
> > +
> > +	    TP_STRUCT__entry(__field(u8, slice)
> > +			     __field(u8, subslice)
> > +			     __field(u32, read_ptr)
> > +			     __field(u32, write_ptr)
> > +			     __field(u32, read_offset)
> > +			     __field(u32, write_offset)
> > +			     __field(size_t, total_size)
> > +			     ),
> > +
> > +	    TP_fast_assign(__entry->slice = slice;
> > +			   __entry->subslice = subslice;
> > +			   __entry->read_ptr = read_ptr;
> > +			   __entry->write_ptr = write_ptr;
> > +			   __entry->read_offset = read_offset;
> > +			   __entry->write_offset = write_offset;
> 
> Keep it if we need it, but do we need both the read/write ptr's and
> offset's? Since offset's are the same as ptr's, but without the ms bit.
True, the offsets are just the pointers without the overflow bit. Maybe I
can remove the offsets but add the old read pointer (since the read
pointer changes during the read).
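
Something along these lines, if I drop the offsets (illustrative sketch
only; read_ptr_prev is just a placeholder name):

TRACE_EVENT(xe_eu_stall_data_read,
	    TP_PROTO(u8 slice, u8 subslice,
		     u32 read_ptr_prev, u32 read_ptr, u32 write_ptr,
		     size_t total_size),
	    TP_ARGS(slice, subslice, read_ptr_prev, read_ptr, write_ptr,
		    total_size),

	    TP_STRUCT__entry(__field(u8, slice)
			     __field(u8, subslice)
			     __field(u32, read_ptr_prev)
			     __field(u32, read_ptr)
			     __field(u32, write_ptr)
			     __field(size_t, total_size)
			     ),

	    TP_fast_assign(__entry->slice = slice;
			   __entry->subslice = subslice;
			   __entry->read_ptr_prev = read_ptr_prev;
			   __entry->read_ptr = read_ptr;
			   __entry->write_ptr = write_ptr;
			   __entry->total_size = total_size;
			   ),

	    TP_printk("slice:%u subslice:%u old readptr:0x%x readptr:0x%x writeptr:0x%x size:%zu",
		      __entry->slice, __entry->subslice,
		      __entry->read_ptr_prev, __entry->read_ptr,
		      __entry->write_ptr, __entry->total_size)
);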

Thank You
Harish.

> 
> > +			   __entry->total_size = total_size;
> > +			   ),
> > +
> > +	    TP_printk("slice:%u subslice:%u readptr:0x%x writeptr:0x%x read off:%u write off:%u size:%zu ",
> > +		      __entry->slice, __entry->subslice,
> > +		      __entry->read_ptr, __entry->write_ptr,
> > +		      __entry->read_offset, __entry->write_offset,
> > +		      __entry->total_size)
> > +);
> > +
> >  #endif
> >
> >  /* This part must be outside protection */
> > --
> > 2.48.1
> >

