[PATCH v10 4/8] drm/xe/eustall: Add support to read() and poll() EU stall data

Dixit, Ashutosh ashutosh.dixit at intel.com
Thu Feb 20 19:52:34 UTC 2025


On Wed, 19 Feb 2025 23:02:45 -0800, Dixit, Ashutosh wrote:
>
> On Wed, 19 Feb 2025 11:43:19 -0800, Harish Chegondi wrote:
> >
>
> Hi Harish,
>
> > On Wed, Feb 19, 2025 at 10:15:52AM -0800, Dixit, Ashutosh wrote:
> > Hi Ashutosh,
> >
> > > On Tue, 18 Feb 2025 11:53:54 -0800, Harish Chegondi wrote:
> > > >
> > > > @@ -39,7 +40,9 @@ struct per_xecore_buf {
> > > >  };
> > > >
> > > >  struct xe_eu_stall_data_stream {
> > > > +	bool pollin;
> > > >	bool enabled;
> > > > +	wait_queue_head_t poll_wq;
> > > >	size_t data_record_size;
> > > >	size_t per_xecore_buf_size;
> > > >	unsigned int wait_num_reports;
> > > > @@ -47,7 +50,11 @@ struct xe_eu_stall_data_stream {
> > > >
> > > >	struct xe_gt *gt;
> > > >	struct xe_bo *bo;
> > > > +	/* Lock to protect xecore_buf */
> > > > +	struct mutex buf_lock;
> > >
> > > Why do we need this new lock? I thought we would just be able to use
> > > gt->eu_stall->stream_lock? stream_lock is already taken for read(), so we
> > > just need to take it for eu_stall_data_buf_poll()?
> >
> > I started off with using the gt->eu_stall->stream_lock. But I have seen
> > warnings in the dmesg log while testing indicating possible circular
> > locking dependency leading to a deadlock. Maybe I can spend more time
> > later to investigate further and eliminate the possible circular locking
> > dependency. But for now, I decided to use a new lock to eliminate the
> > per subslice lock. Here is the dmesg log that I saved from my testing to
> > investigate later.
> >
> > [17606.848776] ======================================================
> > [17606.848781] WARNING: possible circular locking dependency detected
> > [17606.848786] 6.13.0-upstream #3 Not tainted
> > [17606.848791] ------------------------------------------------------
> > [17606.848796] xe_eu_stall/21899 is trying to acquire lock:
> > [17606.848801] ffff88810daad948 ((wq_completion)xe_eu_stall){+.+.}-{0:0}, at: touch_wq_lockdep_map+0x21/0x80
> > [17606.848822]
> >                but task is already holding lock:
> > [17606.848827] ffff88810d0786a8 (&gt->eu_stall->stream_lock){+.+.}-{4:4}, at: xe_eu_stall_stream_close+0x27/0x70 [xe]
> > [17606.848903]
> >                which lock already depends on the new lock.
> >
> > [17606.848909]
> >                the existing dependency chain (in reverse order) is:
> > [17606.848915]
> >                -> #2 (&gt->eu_stall->stream_lock){+.+.}-{4:4}:
> > [17606.848925]        __mutex_lock+0xb4/0xeb0
> > [17606.848934]        eu_stall_data_buf_poll+0x42/0x180 [xe]
> > [17606.848989]        eu_stall_data_buf_poll_work_fn+0x15/0x60 [xe]
> > [17606.849042]        process_one_work+0x207/0x640
> > [17606.849051]        worker_thread+0x18c/0x330
> > [17606.849058]        kthread+0xeb/0x120
> > [17606.849065]        ret_from_fork+0x2c/0x50
> > [17606.849073]        ret_from_fork_asm+0x1a/0x30
> > [17606.849081]
> >                -> #1 ((work_completion)(&(&stream->buf_poll_work)->work)){+.+.}-{0:0}:
> > [17606.849092]        process_one_work+0x1e3/0x640
> > [17606.849100]        worker_thread+0x18c/0x330
> > [17606.849107]        kthread+0xeb/0x120
> > [17606.849113]        ret_from_fork+0x2c/0x50
> > [17606.849120]        ret_from_fork_asm+0x1a/0x30
> > [17606.849126]
> >                -> #0 ((wq_completion)xe_eu_stall){+.+.}-{0:0}:
> > [17606.849134]        __lock_acquire+0x167c/0x27d0
> > [17606.849141]        lock_acquire+0xd5/0x300
> > [17606.849148]        touch_wq_lockdep_map+0x36/0x80
> > [17606.849155]        __flush_workqueue+0x7e/0x4a0
> > [17606.849163]        drain_workqueue+0x92/0x130
> > [17606.849170]        destroy_workqueue+0x55/0x380
> > [17606.849177]        xe_eu_stall_data_buf_destroy+0x11/0x50 [xe]
> > [17606.849220]        xe_eu_stall_stream_close+0x37/0x70 [xe]
> > [17606.849259]        __fput+0xed/0x2b0
> > [17606.849264]        __x64_sys_close+0x37/0x80
> > [17606.849271]        do_syscall_64+0x68/0x140
> > [17606.849276]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > [17606.849286]
> >                other info that might help us debug this:
> >
> > [17606.849294] Chain exists of:
> >                  (wq_completion)xe_eu_stall --> (work_completion)(&(&stream->buf_poll_work)->work) --> &gt->eu_stall->stream_lock
> >
> > [17606.849312]  Possible unsafe locking scenario:
> >
> > [17606.849318]        CPU0                    CPU1
> > [17606.849323]        ----                    ----
> > [17606.849328]   lock(&gt->eu_stall->stream_lock);
> > [17606.849334]                                lock((work_completion)(&(&stream->buf_poll_work)->work));
> > [17606.849344]                                lock(&gt->eu_stall->stream_lock);
> > [17606.849352]   lock((wq_completion)xe_eu_stall);
> > [17606.849359]
> >                 *** DEADLOCK ***
> >
> > [17606.849365] 1 lock held by xe_eu_stall/21899:
> > [17606.849371]  #0: ffff88810d0786a8 (&gt->eu_stall->stream_lock){+.+.}-{4:4}, at: xe_eu_stall_stream_close+0x27/0x70 [xe]
> > [17606.849430]
> >                stack backtrace:
> > [17606.849435] CPU: 3 UID: 0 PID: 21899 Comm: xe_eu_stall Not tainted 6.13.0-upstream #3
> > [17606.849445] Hardware name: Intel Corporation Lunar Lake Client Platform/LNL-M LP5 RVP1, BIOS LNLMFWI1.R00.3220.D89.2407012051 07/01/2024
> > [17606.849457] Call Trace:
> > [17606.849461]  <TASK>
> > [17606.849465]  dump_stack_lvl+0x82/0xd0
> > [17606.849473]  print_circular_bug+0x2d2/0x410
> > [17606.849482]  check_noncircular+0x15d/0x180
> > [17606.849492]  __lock_acquire+0x167c/0x27d0
> > [17606.849500]  lock_acquire+0xd5/0x300
> > [17606.849507]  ? touch_wq_lockdep_map+0x21/0x80
> > [17606.849515]  ? lockdep_init_map_type+0x4b/0x260
> > [17606.849522]  ? touch_wq_lockdep_map+0x21/0x80
> > [17606.849529]  touch_wq_lockdep_map+0x36/0x80
> > [17606.849536]  ? touch_wq_lockdep_map+0x21/0x80
> > [17606.849544]  __flush_workqueue+0x7e/0x4a0
> > [17606.849551]  ? find_held_lock+0x2b/0x80
> > [17606.849561]  drain_workqueue+0x92/0x130
> > [17606.849569]  destroy_workqueue+0x55/0x380
> > [17606.849577]  xe_eu_stall_data_buf_destroy+0x11/0x50 [xe]
> > [17606.849627]  xe_eu_stall_stream_close+0x37/0x70 [xe]
> > [17606.849678]  __fput+0xed/0x2b0
> > [17606.849683]  __x64_sys_close+0x37/0x80
> > [17606.849691]  do_syscall_64+0x68/0x140
> > [17606.849698]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > [17606.849706] RIP: 0033:0x7fdc81b14f67
> > [17606.849712] Code: ff e8 0d 16 02 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 73 ba f7 ff
> > [17606.849728] RSP: 002b:00007fffd2bd7e58 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
> > [17606.849738] RAX: ffffffffffffffda RBX: 0000559f7fa08100 RCX: 00007fdc81b14f67
> > [17606.849746] RDX: 0000000000000000 RSI: 0000000000006901 RDI: 0000000000000004
> > [17606.849754] RBP: 00007fdc7d40bc90 R08: 0000000000000000 R09: 000000007fffffff
> > [17606.849762] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> > [17606.849770] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
> > [17606.849783]  </TASK>
>
> Could you try out this change in your patch (also just use
> gt->eu_stall->stream_lock, not stream->buf_lock) and see if it resolves
> this issue:
>
>	@@ -573,7 +573,6 @@ static void xe_eu_stall_stream_free(struct xe_eu_stall_data_stream *stream)
>
>	 static void xe_eu_stall_data_buf_destroy(struct xe_eu_stall_data_stream *stream)
>	 {
>	-       destroy_workqueue(stream->buf_poll_wq);
>		xe_bo_unpin_map_no_vm(stream->bo);
>		kfree(stream->xecore_buf);
>	 }
>	@@ -829,6 +828,8 @@ static int xe_eu_stall_stream_close(struct inode *inode, struct file *file)
>		xe_eu_stall_stream_free(stream);
>		mutex_unlock(&gt->eu_stall->stream_lock);
>
>	+       destroy_workqueue(stream->buf_poll_wq);
>	+
>		return 0;
>	 }
>
> Basically move destroy_workqueue outside gt->eu_stall->stream_lock.
>
> I will look more into this. But from the above trace it looks like the
> issue of lock order between two locks in different code paths. The two
> locks are stream_lock and something we'll call wq_lock (associated with the
> workqueue or the work item).
>
> So this is what we see about the order of these two locks in these
> instances in the code (assuming we are only using stream_lock):
>
> 1. eu_stall_data_buf_poll_work_fn: wq_lock is taken first followed by
>    stream_lock inside eu_stall_data_buf_poll.
>
> 2. xe_eu_stall_disable_locked: stream_lock is taken first followed by
>    wq_lock when we do cancel_delayed_work_sync (if at all)
>
> 3. xe_eu_stall_stream_close: stream_lock is taken first followed by wq_lock
>    when we do destroy_workqueue.
>
> So looks like lockdep is complaining about the lock order reversal between
> 1. and 3. above. I am not sure if the order reversal between 1. and 2. is a
> problem or not (if lockdep complains about this or not). If it is, we could
> try moving cancel_delayed_work_sync also outside stream_lock. But we
> haven't seen a trace for the second case yet.
>
> So anyway, the first thing to try is the patch above and see if it fixes
> this issue.
>
> Another idea would be to move buf_poll_wq into gt->eu_stall (rather than
> the stream), so it does not have to be destroyed when the stream fd is
> closed.

So I decided to do a quick POC to see if my suggestion above to get rid of
the circular locking dependency worked. And basically it does, with the
patch below (we have to ensure stream doesn't get freed and I haven't
deleted buf_lock), but you will get the idea. I reproduced the issue with
your kernel patches and IGT and made sure the patch below fixes it. So
buf_lock is not needed.

With the patch below, the circular locking dependency is gone. Let's
go with something like this, if you agree:

diff --git a/drivers/gpu/drm/xe/xe_eu_stall.c b/drivers/gpu/drm/xe/xe_eu_stall.c
index b94585bdf91c6..a3fc424b36665 100644
--- a/drivers/gpu/drm/xe/xe_eu_stall.c
+++ b/drivers/gpu/drm/xe/xe_eu_stall.c
@@ -365,7 +365,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
 	u16 group, instance;
 	unsigned int xecore;
 
-	mutex_lock(&stream->buf_lock);
+	mutex_lock(&gt->eu_stall->stream_lock);
 	for_each_dss_steering(xecore, gt, group, instance) {
 		xecore_buf = &stream->xecore_buf[xecore];
 		read_ptr = xecore_buf->read;
@@ -383,7 +383,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
 			set_bit(xecore, stream->data_drop.mask);
 		xecore_buf->write = write_ptr;
 	}
-	mutex_unlock(&stream->buf_lock);
+	mutex_unlock(&gt->eu_stall->stream_lock);
 
 	return min_data_present;
 }
@@ -503,14 +503,12 @@ static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *st
 		stream->data_drop.reported_to_user = false;
 	}
 
-	mutex_lock(&stream->buf_lock);
 	for_each_dss_steering(xecore, gt, group, instance) {
 		ret = xe_eu_stall_data_buf_read(stream, buf, count, &total_size,
 						gt, group, instance, xecore);
 		if (ret || count == total_size)
 			break;
 	}
-	mutex_unlock(&stream->buf_lock);
 	return total_size ?: (ret ?: -EAGAIN);
 }
 
@@ -573,7 +571,6 @@ static void xe_eu_stall_stream_free(struct xe_eu_stall_data_stream *stream)
 
 static void xe_eu_stall_data_buf_destroy(struct xe_eu_stall_data_stream *stream)
 {
-	destroy_workqueue(stream->buf_poll_wq);
 	xe_bo_unpin_map_no_vm(stream->bo);
 	kfree(stream->xecore_buf);
 }
@@ -826,9 +823,12 @@ static int xe_eu_stall_stream_close(struct inode *inode, struct file *file)
 	mutex_lock(&gt->eu_stall->stream_lock);
 	xe_eu_stall_disable_locked(stream);
 	xe_eu_stall_data_buf_destroy(stream);
-	xe_eu_stall_stream_free(stream);
+	gt->eu_stall->stream = NULL;
 	mutex_unlock(&gt->eu_stall->stream_lock);
 
+	destroy_workqueue(stream->buf_poll_wq);
+	kfree(stream);
+
 	return 0;
 }


More information about the Intel-xe mailing list