[PATCH v10 4/8] drm/xe/eustall: Add support to read() and poll() EU stall data
Dixit, Ashutosh
ashutosh.dixit at intel.com
Thu Feb 20 20:02:26 UTC 2025
On Thu, 20 Feb 2025 11:52:34 -0800, Dixit, Ashutosh wrote:
>
> On Wed, 19 Feb 2025 23:02:45 -0800, Dixit, Ashutosh wrote:
> >
> > On Wed, 19 Feb 2025 11:43:19 -0800, Harish Chegondi wrote:
> > >
> >
> > Hi Harish,
> >
> > > On Wed, Feb 19, 2025 at 10:15:52AM -0800, Dixit, Ashutosh wrote:
> > > Hi Ashutosh,
> > >
> > > > On Tue, 18 Feb 2025 11:53:54 -0800, Harish Chegondi wrote:
> > > > >
> > > > > @@ -39,7 +40,9 @@ struct per_xecore_buf {
> > > > > };
> > > > >
> > > > > struct xe_eu_stall_data_stream {
> > > > > + bool pollin;
> > > > > bool enabled;
> > > > > + wait_queue_head_t poll_wq;
> > > > > size_t data_record_size;
> > > > > size_t per_xecore_buf_size;
> > > > > unsigned int wait_num_reports;
> > > > > @@ -47,7 +50,11 @@ struct xe_eu_stall_data_stream {
> > > > >
> > > > > struct xe_gt *gt;
> > > > > struct xe_bo *bo;
> > > > > + /* Lock to protect xecore_buf */
> > > > > + struct mutex buf_lock;
> > > >
> > > > Why do we need this new lock? I thought we would just be able to use
> > > > gt->eu_stall->stream_lock? stream_lock is already taken for read(), so we
> > > > just need to take it for eu_stall_data_buf_poll()?
> > >
> > > I started off using gt->eu_stall->stream_lock, but while testing I saw
> > > warnings in the dmesg log indicating a possible circular locking
> > > dependency that could lead to a deadlock. Maybe I can spend more time
> > > later to investigate further and eliminate the circular dependency, but
> > > for now I decided to use a new lock in place of the per-subslice lock.
> > > Here is the dmesg log that I saved from my testing to investigate later:
> > >
> > > [17606.848776] ======================================================
> > > [17606.848781] WARNING: possible circular locking dependency detected
> > > [17606.848786] 6.13.0-upstream #3 Not tainted
> > > [17606.848791] ------------------------------------------------------
> > > [17606.848796] xe_eu_stall/21899 is trying to acquire lock:
> > > [17606.848801] ffff88810daad948 ((wq_completion)xe_eu_stall){+.+.}-{0:0}, at: touch_wq_lockdep_map+0x21/0x80
> > > [17606.848822]
> > > but task is already holding lock:
> > > [17606.848827] ffff88810d0786a8 (&gt->eu_stall->stream_lock){+.+.}-{4:4}, at: xe_eu_stall_stream_close+0x27/0x70 [xe]
> > > [17606.848903]
> > > which lock already depends on the new lock.
> > >
> > > [17606.848909]
> > > the existing dependency chain (in reverse order) is:
> > > [17606.848915]
> > > -> #2 (&gt->eu_stall->stream_lock){+.+.}-{4:4}:
> > > [17606.848925] __mutex_lock+0xb4/0xeb0
> > > [17606.848934] eu_stall_data_buf_poll+0x42/0x180 [xe]
> > > [17606.848989] eu_stall_data_buf_poll_work_fn+0x15/0x60 [xe]
> > > [17606.849042] process_one_work+0x207/0x640
> > > [17606.849051] worker_thread+0x18c/0x330
> > > [17606.849058] kthread+0xeb/0x120
> > > [17606.849065] ret_from_fork+0x2c/0x50
> > > [17606.849073] ret_from_fork_asm+0x1a/0x30
> > > [17606.849081]
> > > -> #1 ((work_completion)(&(&stream->buf_poll_work)->work)){+.+.}-{0:0}:
> > > [17606.849092] process_one_work+0x1e3/0x640
> > > [17606.849100] worker_thread+0x18c/0x330
> > > [17606.849107] kthread+0xeb/0x120
> > > [17606.849113] ret_from_fork+0x2c/0x50
> > > [17606.849120] ret_from_fork_asm+0x1a/0x30
> > > [17606.849126]
> > > -> #0 ((wq_completion)xe_eu_stall){+.+.}-{0:0}:
> > > [17606.849134] __lock_acquire+0x167c/0x27d0
> > > [17606.849141] lock_acquire+0xd5/0x300
> > > [17606.849148] touch_wq_lockdep_map+0x36/0x80
> > > [17606.849155] __flush_workqueue+0x7e/0x4a0
> > > [17606.849163] drain_workqueue+0x92/0x130
> > > [17606.849170] destroy_workqueue+0x55/0x380
> > > [17606.849177] xe_eu_stall_data_buf_destroy+0x11/0x50 [xe]
> > > [17606.849220] xe_eu_stall_stream_close+0x37/0x70 [xe]
> > > [17606.849259] __fput+0xed/0x2b0
> > > [17606.849264] __x64_sys_close+0x37/0x80
> > > [17606.849271] do_syscall_64+0x68/0x140
> > > [17606.849276] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > [17606.849286]
> > > other info that might help us debug this:
> > >
> > > [17606.849294] Chain exists of:
> > > (wq_completion)xe_eu_stall --> (work_completion)(&(&stream->buf_poll_work)->work) --> &gt->eu_stall->stream_lock
> > >
> > > [17606.849312] Possible unsafe locking scenario:
> > >
> > > [17606.849318]        CPU0                    CPU1
> > > [17606.849323]        ----                    ----
> > > [17606.849328]   lock(&gt->eu_stall->stream_lock);
> > > [17606.849334]                                lock((work_completion)(&(&stream->buf_poll_work)->work));
> > > [17606.849344]                                lock(&gt->eu_stall->stream_lock);
> > > [17606.849352]   lock((wq_completion)xe_eu_stall);
> > > [17606.849359]
> > > *** DEADLOCK ***
> > >
> > > [17606.849365] 1 lock held by xe_eu_stall/21899:
> > > [17606.849371] #0: ffff88810d0786a8 (&gt->eu_stall->stream_lock){+.+.}-{4:4}, at: xe_eu_stall_stream_close+0x27/0x70 [xe]
> > > [17606.849430]
> > > stack backtrace:
> > > [17606.849435] CPU: 3 UID: 0 PID: 21899 Comm: xe_eu_stall Not tainted 6.13.0-upstream #3
> > > [17606.849445] Hardware name: Intel Corporation Lunar Lake Client Platform/LNL-M LP5 RVP1, BIOS LNLMFWI1.R00.3220.D89.2407012051 07/01/2024
> > > [17606.849457] Call Trace:
> > > [17606.849461] <TASK>
> > > [17606.849465] dump_stack_lvl+0x82/0xd0
> > > [17606.849473] print_circular_bug+0x2d2/0x410
> > > [17606.849482] check_noncircular+0x15d/0x180
> > > [17606.849492] __lock_acquire+0x167c/0x27d0
> > > [17606.849500] lock_acquire+0xd5/0x300
> > > [17606.849507] ? touch_wq_lockdep_map+0x21/0x80
> > > [17606.849515] ? lockdep_init_map_type+0x4b/0x260
> > > [17606.849522] ? touch_wq_lockdep_map+0x21/0x80
> > > [17606.849529] touch_wq_lockdep_map+0x36/0x80
> > > [17606.849536] ? touch_wq_lockdep_map+0x21/0x80
> > > [17606.849544] __flush_workqueue+0x7e/0x4a0
> > > [17606.849551] ? find_held_lock+0x2b/0x80
> > > [17606.849561] drain_workqueue+0x92/0x130
> > > [17606.849569] destroy_workqueue+0x55/0x380
> > > [17606.849577] xe_eu_stall_data_buf_destroy+0x11/0x50 [xe]
> > > [17606.849627] xe_eu_stall_stream_close+0x37/0x70 [xe]
> > > [17606.849678] __fput+0xed/0x2b0
> > > [17606.849683] __x64_sys_close+0x37/0x80
> > > [17606.849691] do_syscall_64+0x68/0x140
> > > [17606.849698] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > > [17606.849706] RIP: 0033:0x7fdc81b14f67
> > > [17606.849712] Code: ff e8 0d 16 02 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 73 ba f7 ff
> > > [17606.849728] RSP: 002b:00007fffd2bd7e58 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
> > > [17606.849738] RAX: ffffffffffffffda RBX: 0000559f7fa08100 RCX: 00007fdc81b14f67
> > > [17606.849746] RDX: 0000000000000000 RSI: 0000000000006901 RDI: 0000000000000004
> > > [17606.849754] RBP: 00007fdc7d40bc90 R08: 0000000000000000 R09: 000000007fffffff
> > > [17606.849762] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> > > [17606.849770] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
> > > [17606.849783] </TASK>
> >
> > Could you try out this change in your patch (also just use
> > gt->eu_stall->stream_lock, not stream->buf_lock) and see if it resolves
> > this issue:
> >
> > @@ -573,7 +573,6 @@ static void xe_eu_stall_stream_free(struct xe_eu_stall_data_stream *stream)
> >
> > static void xe_eu_stall_data_buf_destroy(struct xe_eu_stall_data_stream *stream)
> > {
> > - destroy_workqueue(stream->buf_poll_wq);
> > xe_bo_unpin_map_no_vm(stream->bo);
> > kfree(stream->xecore_buf);
> > }
> > @@ -829,6 +828,8 @@ static int xe_eu_stall_stream_close(struct inode *inode, struct file *file)
> > xe_eu_stall_stream_free(stream);
> > mutex_unlock(&gt->eu_stall->stream_lock);
> >
> > + destroy_workqueue(stream->buf_poll_wq);
> > +
> > return 0;
> > }
> >
> > Basically move destroy_workqueue outside gt->eu_stall->stream_lock.
> >
> > I will look more into this. But from the above trace it looks like an
> > issue of lock ordering between two locks taken in different code paths.
> > The two locks are stream_lock and what we can call wq_lock (the lockdep
> > map associated with the workqueue or the work item).
> >
> > So this is what we see about the order of these two locks in these
> > instances in the code (assuming we are only using stream_lock):
> >
> > 1. eu_stall_data_buf_poll_work_fn: wq_lock is taken first followed by
> > stream_lock inside eu_stall_data_buf_poll.
> >
> > 2. xe_eu_stall_disable_locked: stream_lock is taken first followed by
> > wq_lock when we do cancel_delayed_work_sync (if at all)
> >
> > 3. xe_eu_stall_stream_close: stream_lock is taken first followed by wq_lock
> > when we do destroy_workqueue.
> >
> > So it looks like lockdep is complaining about the lock-order reversal
> > between 1. and 3. above. I am not sure whether the reversal between 1. and
> > 2. is also a problem (i.e. whether lockdep complains about it). If it is,
> > we could try moving cancel_delayed_work_sync() outside stream_lock as
> > well, but we haven't seen a trace for that second case yet.
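> >
> > To make the reversal between 1. and 3. concrete, here is a minimal
> > sketch (not the actual driver code; details elided) of the two paths
> > lockdep ties together:
> >
> > /* Path 1: worker context. The workqueue's lockdep map (wq_lock) is
> >  * effectively held while the work runs; stream_lock is taken inside.
> >  */
> > static void eu_stall_data_buf_poll_work_fn(struct work_struct *work)
> > {
> >         struct xe_eu_stall_data_stream *stream =
> >                 container_of(work, struct xe_eu_stall_data_stream,
> >                              buf_poll_work.work);
> >
> >         mutex_lock(&stream->gt->eu_stall->stream_lock);
> >         /* ... scan per-XeCore buffers, wake poll() waiters ... */
> >         mutex_unlock(&stream->gt->eu_stall->stream_lock);
> > }
> >
> > /* Path 3: close(). stream_lock is taken first; destroy_workqueue()
> >  * then drains the workqueue, i.e. waits for the work above, which
> >  * itself wants stream_lock -- the A-B/B-A reversal lockdep reports.
> >  */
> > static int xe_eu_stall_stream_close(struct inode *inode, struct file *file)
> > {
> >         struct xe_eu_stall_data_stream *stream = file->private_data;
> >
> >         mutex_lock(&stream->gt->eu_stall->stream_lock);
> >         destroy_workqueue(stream->buf_poll_wq); /* waits under stream_lock */
> >         mutex_unlock(&stream->gt->eu_stall->stream_lock);
> >
> >         return 0;
> > }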
> >
> > So anyway, the first thing to try is the patch above and see if it fixes
> > this issue.
> >
> > Another idea would be to move buf_poll_wq into gt->eu_stall (rather than
> > the stream), so it does not have to be destroyed when the stream fd is
> > closed.
>
> So I did a quick POC to see whether my suggestion above for getting rid of
> the circular locking dependency works, and it basically does, with the
> patch below (we have to ensure the stream doesn't get freed early, and I
> haven't deleted buf_lock, but you will get the idea). I reproduced the
> issue with your kernel patches and IGT and verified that the patch below
> fixes it. So buf_lock is not needed.
>
> With the patch below, the circular locking dependency is gone. Let's go
> with something like this, if you agree:
>
> diff --git a/drivers/gpu/drm/xe/xe_eu_stall.c b/drivers/gpu/drm/xe/xe_eu_stall.c
> index b94585bdf91c6..a3fc424b36665 100644
> --- a/drivers/gpu/drm/xe/xe_eu_stall.c
> +++ b/drivers/gpu/drm/xe/xe_eu_stall.c
> @@ -365,7 +365,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
> u16 group, instance;
> unsigned int xecore;
>
> - mutex_lock(&stream->buf_lock);
> + mutex_lock(&gt->eu_stall->stream_lock);
> for_each_dss_steering(xecore, gt, group, instance) {
> xecore_buf = &stream->xecore_buf[xecore];
> read_ptr = xecore_buf->read;
> @@ -383,7 +383,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
> set_bit(xecore, stream->data_drop.mask);
> xecore_buf->write = write_ptr;
> }
> - mutex_unlock(&stream->buf_lock);
> + mutex_unlock(&gt->eu_stall->stream_lock);
>
> return min_data_present;
> }
> @@ -503,14 +503,12 @@ static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *st
> stream->data_drop.reported_to_user = false;
> }
>
> - mutex_lock(&stream->buf_lock);
> for_each_dss_steering(xecore, gt, group, instance) {
> ret = xe_eu_stall_data_buf_read(stream, buf, count, &total_size,
> gt, group, instance, xecore);
> if (ret || count == total_size)
> break;
> }
> - mutex_unlock(&stream->buf_lock);
> return total_size ?: (ret ?: -EAGAIN);
> }
>
> @@ -573,7 +571,6 @@ static void xe_eu_stall_stream_free(struct xe_eu_stall_data_stream *stream)
>
> static void xe_eu_stall_data_buf_destroy(struct xe_eu_stall_data_stream *stream)
> {
> - destroy_workqueue(stream->buf_poll_wq);
> xe_bo_unpin_map_no_vm(stream->bo);
> kfree(stream->xecore_buf);
> }
> @@ -826,9 +823,12 @@ static int xe_eu_stall_stream_close(struct inode *inode, struct file *file)
> mutex_lock(&gt->eu_stall->stream_lock);
> xe_eu_stall_disable_locked(stream);
> xe_eu_stall_data_buf_destroy(stream);
> - xe_eu_stall_stream_free(stream);
> + gt->eu_stall->stream = NULL;
> mutex_unlock(&gt->eu_stall->stream_lock);
>
> + destroy_workqueue(stream->buf_poll_wq);
> + kfree(stream);
> +
> return 0;
> }
Maybe, as suggested above, move buf_poll_wq into gt->eu_stall (rather than
the stream), so that it does not have to be destroyed when the stream fd is
closed. That way we can just call xe_eu_stall_stream_free() from
xe_eu_stall_stream_close(). We would then allocate the workqueue in
xe_eu_stall_init() and destroy it in xe_eu_stall_fini(), which should
minimize the changes. A rough sketch of that idea is below.
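
Untested sketch, assuming the xe_eu_stall_init()/xe_eu_stall_fini() entry
points from the series, with error paths and unrelated init elided:

int xe_eu_stall_init(struct xe_gt *gt)
{
        /* Workqueue now lives in gt->eu_stall and outlives any stream. */
        gt->eu_stall->buf_poll_wq = alloc_ordered_workqueue("xe_eu_stall", 0);
        if (!gt->eu_stall->buf_poll_wq)
                return -ENOMEM;

        return 0;
}

void xe_eu_stall_fini(struct xe_gt *gt)
{
        destroy_workqueue(gt->eu_stall->buf_poll_wq);
}

static int xe_eu_stall_stream_close(struct inode *inode, struct file *file)
{
        struct xe_eu_stall_data_stream *stream = file->private_data;
        struct xe_gt *gt = stream->gt;

        mutex_lock(&gt->eu_stall->stream_lock);
        /* Only our own work item is cancelled (inside disable_locked);
         * no workqueue drain happens under stream_lock anymore.
         */
        xe_eu_stall_disable_locked(stream);
        xe_eu_stall_data_buf_destroy(stream);
        xe_eu_stall_stream_free(stream);
        mutex_unlock(&gt->eu_stall->stream_lock);

        return 0;
}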