[PATCH v10 4/8] drm/xe/eustall: Add support to read() and poll() EU stall data
Dixit, Ashutosh
ashutosh.dixit at intel.com
Thu Feb 20 07:02:45 UTC 2025
On Wed, 19 Feb 2025 11:43:19 -0800, Harish Chegondi wrote:
>
Hi Harish,
> On Wed, Feb 19, 2025 at 10:15:52AM -0800, Dixit, Ashutosh wrote:
> Hi Ashutosh,
>
> > On Tue, 18 Feb 2025 11:53:54 -0800, Harish Chegondi wrote:
> > >
> > > @@ -39,7 +40,9 @@ struct per_xecore_buf {
> > > };
> > >
> > > struct xe_eu_stall_data_stream {
> > > + bool pollin;
> > > bool enabled;
> > > + wait_queue_head_t poll_wq;
> > > size_t data_record_size;
> > > size_t per_xecore_buf_size;
> > > unsigned int wait_num_reports;
> > > @@ -47,7 +50,11 @@ struct xe_eu_stall_data_stream {
> > >
> > > struct xe_gt *gt;
> > > struct xe_bo *bo;
> > > + /* Lock to protect xecore_buf */
> > > + struct mutex buf_lock;
> >
> > Why do we need this new lock? I thought we would just be able to use
> > gt->eu_stall->stream_lock? stream_lock is already taken for read(), so we
> > just need to take it for eu_stall_data_buf_poll()?
>
> I started off with using the gt->eu_stall->stream_lock. But I have seen
> warnings in the dmesg log while testing, indicating a possible circular
> locking dependency that could lead to a deadlock. Maybe I can spend more
> time later to investigate further and eliminate the possible circular
> locking dependency. But for now, I decided to use a new lock to eliminate
> the per-subslice lock. Here is the dmesg log that I saved from my testing
> to investigate later.
>
> [17606.848776] ======================================================
> [17606.848781] WARNING: possible circular locking dependency detected
> [17606.848786] 6.13.0-upstream #3 Not tainted
> [17606.848791] ------------------------------------------------------
> [17606.848796] xe_eu_stall/21899 is trying to acquire lock:
> [17606.848801] ffff88810daad948 ((wq_completion)xe_eu_stall){+.+.}-{0:0}, at: touch_wq_lockdep_map+0x21/0x80
> [17606.848822]
> but task is already holding lock:
> [17606.848827] ffff88810d0786a8 (&gt->eu_stall->stream_lock){+.+.}-{4:4}, at: xe_eu_stall_stream_close+0x27/0x70 [xe]
> [17606.848903]
> which lock already depends on the new lock.
>
> [17606.848909]
> the existing dependency chain (in reverse order) is:
> [17606.848915]
> -> #2 (&gt->eu_stall->stream_lock){+.+.}-{4:4}:
> [17606.848925] __mutex_lock+0xb4/0xeb0
> [17606.848934] eu_stall_data_buf_poll+0x42/0x180 [xe]
> [17606.848989] eu_stall_data_buf_poll_work_fn+0x15/0x60 [xe]
> [17606.849042] process_one_work+0x207/0x640
> [17606.849051] worker_thread+0x18c/0x330
> [17606.849058] kthread+0xeb/0x120
> [17606.849065] ret_from_fork+0x2c/0x50
> [17606.849073] ret_from_fork_asm+0x1a/0x30
> [17606.849081]
> -> #1 ((work_completion)(&(&stream->buf_poll_work)->work)){+.+.}-{0:0}:
> [17606.849092] process_one_work+0x1e3/0x640
> [17606.849100] worker_thread+0x18c/0x330
> [17606.849107] kthread+0xeb/0x120
> [17606.849113] ret_from_fork+0x2c/0x50
> [17606.849120] ret_from_fork_asm+0x1a/0x30
> [17606.849126]
> -> #0 ((wq_completion)xe_eu_stall){+.+.}-{0:0}:
> [17606.849134] __lock_acquire+0x167c/0x27d0
> [17606.849141] lock_acquire+0xd5/0x300
> [17606.849148] touch_wq_lockdep_map+0x36/0x80
> [17606.849155] __flush_workqueue+0x7e/0x4a0
> [17606.849163] drain_workqueue+0x92/0x130
> [17606.849170] destroy_workqueue+0x55/0x380
> [17606.849177] xe_eu_stall_data_buf_destroy+0x11/0x50 [xe]
> [17606.849220] xe_eu_stall_stream_close+0x37/0x70 [xe]
> [17606.849259] __fput+0xed/0x2b0
> [17606.849264] __x64_sys_close+0x37/0x80
> [17606.849271] do_syscall_64+0x68/0x140
> [17606.849276] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [17606.849286]
> other info that might help us debug this:
>
> [17606.849294] Chain exists of:
> (wq_completion)xe_eu_stall --> (work_completion)(&(&stream->buf_poll_work)->work) --> &gt->eu_stall->stream_lock
>
> [17606.849312] Possible unsafe locking scenario:
>
> [17606.849318]        CPU0                    CPU1
> [17606.849323]        ----                    ----
> [17606.849328]   lock(&gt->eu_stall->stream_lock);
> [17606.849334]                                lock((work_completion)(&(&stream->buf_poll_work)->work));
> [17606.849344]                                lock(&gt->eu_stall->stream_lock);
> [17606.849352]   lock((wq_completion)xe_eu_stall);
> [17606.849359]
> *** DEADLOCK ***
>
> [17606.849365] 1 lock held by xe_eu_stall/21899:
> [17606.849371] #0: ffff88810d0786a8 (&gt->eu_stall->stream_lock){+.+.}-{4:4}, at: xe_eu_stall_stream_close+0x27/0x70 [xe]
> [17606.849430]
> stack backtrace:
> [17606.849435] CPU: 3 UID: 0 PID: 21899 Comm: xe_eu_stall Not tainted 6.13.0-upstream #3
> [17606.849445] Hardware name: Intel Corporation Lunar Lake Client Platform/LNL-M LP5 RVP1, BIOS LNLMFWI1.R00.3220.D89.2407012051 07/01/2024
> [17606.849457] Call Trace:
> [17606.849461] <TASK>
> [17606.849465] dump_stack_lvl+0x82/0xd0
> [17606.849473] print_circular_bug+0x2d2/0x410
> [17606.849482] check_noncircular+0x15d/0x180
> [17606.849492] __lock_acquire+0x167c/0x27d0
> [17606.849500] lock_acquire+0xd5/0x300
> [17606.849507] ? touch_wq_lockdep_map+0x21/0x80
> [17606.849515] ? lockdep_init_map_type+0x4b/0x260
> [17606.849522] ? touch_wq_lockdep_map+0x21/0x80
> [17606.849529] touch_wq_lockdep_map+0x36/0x80
> [17606.849536] ? touch_wq_lockdep_map+0x21/0x80
> [17606.849544] __flush_workqueue+0x7e/0x4a0
> [17606.849551] ? find_held_lock+0x2b/0x80
> [17606.849561] drain_workqueue+0x92/0x130
> [17606.849569] destroy_workqueue+0x55/0x380
> [17606.849577] xe_eu_stall_data_buf_destroy+0x11/0x50 [xe]
> [17606.849627] xe_eu_stall_stream_close+0x37/0x70 [xe]
> [17606.849678] __fput+0xed/0x2b0
> [17606.849683] __x64_sys_close+0x37/0x80
> [17606.849691] do_syscall_64+0x68/0x140
> [17606.849698] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [17606.849706] RIP: 0033:0x7fdc81b14f67
> [17606.849712] Code: ff e8 0d 16 02 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 73 ba f7 ff
> [17606.849728] RSP: 002b:00007fffd2bd7e58 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
> [17606.849738] RAX: ffffffffffffffda RBX: 0000559f7fa08100 RCX: 00007fdc81b14f67
> [17606.849746] RDX: 0000000000000000 RSI: 0000000000006901 RDI: 0000000000000004
> [17606.849754] RBP: 00007fdc7d40bc90 R08: 0000000000000000 R09: 000000007fffffff
> [17606.849762] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> [17606.849770] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
> [17606.849783] </TASK>
Could you try out this change in your patch (also just use
gt->eu_stall->stream_lock, not stream->buf_lock) and see if it resolves
this issue:
@@ -573,7 +573,6 @@ static void xe_eu_stall_stream_free(struct xe_eu_stall_data_stream *stream)
static void xe_eu_stall_data_buf_destroy(struct xe_eu_stall_data_stream *stream)
{
- destroy_workqueue(stream->buf_poll_wq);
xe_bo_unpin_map_no_vm(stream->bo);
kfree(stream->xecore_buf);
}
@@ -829,6 +828,8 @@ static int xe_eu_stall_stream_close(struct inode *inode, struct file *file)
xe_eu_stall_stream_free(stream);
mutex_unlock(&gt->eu_stall->stream_lock);
+ destroy_workqueue(stream->buf_poll_wq);
+
return 0;
}
Basically move destroy_workqueue outside gt->eu_stall->stream_lock.
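For illustration, the close path would then look roughly like below. This is
only a sketch: the exact body of xe_eu_stall_stream_close() is my assumption
based on the call trace above, and if xe_eu_stall_stream_free() also frees the
stream struct itself, the workqueue pointer needs to be stashed in a local
before the stream is freed, as done here.

static int xe_eu_stall_stream_close(struct inode *inode, struct file *file)
{
	struct xe_eu_stall_data_stream *stream = file->private_data;
	struct xe_gt *gt = stream->gt;
	struct workqueue_struct *buf_poll_wq = stream->buf_poll_wq;

	mutex_lock(&gt->eu_stall->stream_lock);
	xe_eu_stall_disable_locked(stream);
	xe_eu_stall_data_buf_destroy(stream);
	xe_eu_stall_stream_free(stream);
	mutex_unlock(&gt->eu_stall->stream_lock);

	/* Flushing and destroying the workqueue no longer nests inside stream_lock */
	destroy_workqueue(buf_poll_wq);

	return 0;
}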
I will look more into this. But from the above trace it looks like an issue
of lock ordering between two locks in different code paths. The two locks are
stream_lock and something we can call wq_lock (associated with the workqueue
or the work item).
This is the order in which the two locks are taken at these places in the
code (assuming we are only using stream_lock):
1. eu_stall_data_buf_poll_work_fn: wq_lock is taken first followed by
stream_lock inside eu_stall_data_buf_poll.
2. xe_eu_stall_disable_locked: stream_lock is taken first followed by
wq_lock when we do cancel_delayed_work_sync (if at all)
3. xe_eu_stall_stream_close: stream_lock is taken first followed by wq_lock
when we do destroy_workqueue.
So it looks like lockdep is complaining about the lock order reversal between
1. and 3. above. I am not sure whether the order reversal between 1. and 2. is
a problem (i.e. whether lockdep complains about that too). If it is, we could
try moving cancel_delayed_work_sync also outside stream_lock, roughly as
sketched below. But we haven't seen a trace for the second case yet.
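Purely illustrative, since the real disable code isn't quoted here (only
stream->enabled, buf_poll_work and stream_lock come from the patch), the
reordering for case 2. could look something like:

static void xe_eu_stall_disable(struct xe_eu_stall_data_stream *stream)
{
	struct xe_gt *gt = stream->gt;

	mutex_lock(&gt->eu_stall->stream_lock);
	stream->enabled = false;
	/* ... disable EU stall sampling in hardware ... */
	mutex_unlock(&gt->eu_stall->stream_lock);

	/* Wait for a running eu_stall_data_buf_poll_work_fn() outside stream_lock */
	cancel_delayed_work_sync(&stream->buf_poll_work);
}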
So anyway, the first thing to try is the patch above and see if it fixes
this issue.
Another idea would be to move buf_poll_wq into gt->eu_stall (rather than
the stream), so it does not have to be destroyed when the stream fd is
closed.
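Something like this, where the workqueue lives for the lifetime of the gt
rather than the stream (the struct name and the other members are my guesses,
only buf_poll_wq and stream_lock come from this thread):

struct xe_eu_stall_gt {
	/* Lock to protect the stream */
	struct mutex stream_lock;
	/* Currently open EU stall data stream, if any */
	struct xe_eu_stall_data_stream *stream;
	/*
	 * Ordered workqueue for the buffer poll work: created at gt init and
	 * destroyed at gt fini, so it is never torn down under stream_lock.
	 */
	struct workqueue_struct *buf_poll_wq;
};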
>
>
> >
> > > @@ -357,16 +580,26 @@ static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
> > > max_wait_num_reports);
> > > return -EINVAL;
> > > }
> > > +
> > > + init_waitqueue_head(&stream->poll_wq);
> > > + INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
> > > + mutex_init(&stream->buf_lock);
> > > + stream->buf_poll_wq = alloc_ordered_workqueue("xe_eu_stall", 0);
> > > + if (!stream->buf_poll_wq)
> > > + return -ENOMEM;
> > > stream->per_xecore_buf_size = per_xecore_buf_size;
> > > stream->sampling_rate_mult = props->sampling_rate_mult;
> > > stream->wait_num_reports = props->wait_num_reports;
> > > stream->data_record_size = xe_eu_stall_data_record_size(gt_to_xe(gt));
> > > stream->xecore_buf = kcalloc(last_xecore, sizeof(*stream->xecore_buf), GFP_KERNEL);
> > > - if (!stream->xecore_buf)
> > > + if (!stream->xecore_buf) {
> > > + destroy_workqueue(stream->buf_poll_wq);
> > > return -ENOMEM;
> > > + }
> > >
> > > ret = xe_eu_stall_data_buf_alloc(stream, last_xecore);
> > > if (ret) {
> > > + destroy_workqueue(stream->buf_poll_wq);
> > > kfree(stream->xecore_buf);
> >
> > OK, won't block on this, but error unwinding is cleaner with labels and
> > gotos.
> >
> > Also, if stream->buf_lock is needed, mutex_destroy is also needed for error
> > unwinding and also during stream close.
> Okay. But before we do a free(stream), does it really matter to call
> mutex_destroy on stream->buf_lock?
Yes, we'd need mutex_destroy if we introduce stream->buf_lock. First let's
try the approach above so we don't have to introduce stream->buf_lock.
If stream->buf_lock is needed and mutex_destroy is a problem, another idea
would be to introduce the lock in gt->eu_stall so it doesn't need to be
destroyed when the stream fd is closed.
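For reference, a rough sketch of the label/goto unwinding mentioned above for
the quoted xe_eu_stall_stream_init() hunk, including the mutex_destroy() that
would be needed if stream->buf_lock stays (label names and the surrounding
code are assumptions):

	init_waitqueue_head(&stream->poll_wq);
	INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
	mutex_init(&stream->buf_lock);

	stream->buf_poll_wq = alloc_ordered_workqueue("xe_eu_stall", 0);
	if (!stream->buf_poll_wq) {
		ret = -ENOMEM;
		goto err_destroy_mutex;
	}

	stream->xecore_buf = kcalloc(last_xecore, sizeof(*stream->xecore_buf), GFP_KERNEL);
	if (!stream->xecore_buf) {
		ret = -ENOMEM;
		goto err_destroy_wq;
	}

	ret = xe_eu_stall_data_buf_alloc(stream, last_xecore);
	if (ret)
		goto err_free_xecore_buf;

	return 0;

err_free_xecore_buf:
	kfree(stream->xecore_buf);
err_destroy_wq:
	destroy_workqueue(stream->buf_poll_wq);
err_destroy_mutex:
	mutex_destroy(&stream->buf_lock);
	return ret;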
> >
> > > diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
> > > index d5281de04d54..1cc6bfc34ccb 100644
> > > --- a/drivers/gpu/drm/xe/xe_trace.h
> > > +++ b/drivers/gpu/drm/xe/xe_trace.h
> > > @@ -427,6 +427,39 @@ DEFINE_EVENT(xe_pm_runtime, xe_pm_runtime_get_ioctl,
> > > TP_ARGS(xe, caller)
> > > );
> > >
> > > +TRACE_EVENT(xe_eu_stall_data_read,
> > > + TP_PROTO(u8 slice, u8 subslice,
> > > + u32 read_ptr, u32 write_ptr,
> > > + u32 read_offset, u32 write_offset,
> > > + size_t total_size),
> > > + TP_ARGS(slice, subslice, read_ptr, write_ptr,
> > > + read_offset, write_offset, total_size),
> > > +
> > > + TP_STRUCT__entry(__field(u8, slice)
> > > + __field(u8, subslice)
> > > + __field(u32, read_ptr)
> > > + __field(u32, write_ptr)
> > > + __field(u32, read_offset)
> > > + __field(u32, write_offset)
> > > + __field(size_t, total_size)
> > > + ),
> > > +
> > > + TP_fast_assign(__entry->slice = slice;
> > > + __entry->subslice = subslice;
> > > + __entry->read_ptr = read_ptr;
> > > + __entry->write_ptr = write_ptr;
> > > + __entry->read_offset = read_offset;
> > > + __entry->write_offset = write_offset;
> >
> > Keep it if we need it, but do we need both the read/write ptrs and
> > offsets? Since offsets are the same as ptrs, but without the ms bit.
> True, offsets are just pointers without the overflow bit. Maybe I can
> remove the offsets, but add the old read pointer (since the read pointer
> changes during the read).
As I said, you can keep this as is, or whatever works for you and is
convenient for debug.
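Just as an aside, TRACE_EVENT(xe_eu_stall_data_read, ...) generates a
trace_xe_eu_stall_data_read() helper with the TP_PROTO signature, so the read
path would call it roughly like below (the local variable names are of course
whatever the read function actually uses):

	trace_xe_eu_stall_data_read(slice, subslice, read_ptr, write_ptr,
				    read_offset, write_offset, total_size);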
> >
> > > + __entry->total_size = total_size;
> > > + ),
> > > +
> > > + TP_printk("slice:%u subslice:%u readptr:0x%x writeptr:0x%x read off:%u write off:%u size:%zu ",
> > > + __entry->slice, __entry->subslice,
> > > + __entry->read_ptr, __entry->write_ptr,
> > > + __entry->read_offset, __entry->write_offset,
> > > + __entry->total_size)
> > > +);
> > > +
> > > #endif
> > >
> > > /* This part must be outside protection */
> > > --
> > > 2.48.1
> > >
Thanks.
--
Ashutosh