[PATCH 1/1] drm/xe/eustall: Resolve a possible circular locking dependency
Dixit, Ashutosh
ashutosh.dixit at intel.com
Tue Apr 22 00:46:09 UTC 2025
On Thu, 17 Apr 2025 17:07:17 -0700, Harish Chegondi wrote:
>
Hi Harish,
> Use a separate lock in the polling function eu_stall_data_buf_poll()
> instead of eu_stall->stream_lock. This would prevent a possible
> circular locking dependency leading to a deadlock as described below.
I was trying to avoid adding the extra lock. The result can be seen here:
https://patchwork.freedesktop.org/series/148026/
But the result is not pretty and has its own complications, so I don't want
to pursue it. Let us look at the approach in this patch instead.
So previously we had the following lock ordering in the different code paths:
1. eu_stall_data_buf_poll_work_fn() / queue_delayed_work():
   workqueue_lock -> stream_lock
2. Read:
   stream_lock
3. Enable/disable:
   stream_lock -> workqueue_lock -> stream_lock
So we had a lock order inversion between 1 and 3, and also a possible
deadlock within 3 itself; the sketch below shows the problematic pattern.
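Roughly, the old scheme looked like this (a simplified sketch only, with
made-up names such as struct stream, wq and POLL_PERIOD standing in for the
real xe types and constants, not the actual driver code):

/*
 * Old scheme: the poll work takes stream_lock, while the disable path
 * holds stream_lock across cancel_delayed_work_sync(), which must wait
 * for that same work item to finish -> inversion and possible deadlock.
 */
static void buf_poll_work_fn(struct work_struct *work)
{
	struct stream *s = container_of(work, struct stream,
					buf_poll_work.work);

	mutex_lock(&s->gt->eu_stall->stream_lock);	/* work -> stream_lock */
	/* ... scan the per-XeCore buffers ... */
	mutex_unlock(&s->gt->eu_stall->stream_lock);

	/* re-arm the poll */
	queue_delayed_work(s->gt->eu_stall->wq, &s->buf_poll_work,
			   POLL_PERIOD);
}

static int stream_disable_locked(struct stream *s)
{
	/* called with stream_lock already held by the ioctl */
	cancel_delayed_work_sync(&s->buf_poll_work);	/* stream_lock -> work */
	return 0;
}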
After this patch we have (see the sketch after the list):
1. eu_stall_data_buf_poll_work_fn() / queue_delayed_work():
   workqueue_lock -> xecore_buf_lock
2. Read:
   stream_lock -> xecore_buf_lock
3. Enable/disable:
   stream_lock -> workqueue_lock -> xecore_buf_lock
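With the new lock, the same paths become (again just a sketch with the same
made-up names):

/*
 * New scheme: the work function only takes the per-stream
 * xecore_buf_lock, so the disable path can keep holding stream_lock
 * across cancel_delayed_work_sync() without waiting on a work item
 * that itself needs stream_lock. The read path nests the two locks in
 * a single consistent order: stream_lock -> xecore_buf_lock.
 */
static void buf_poll_work_fn(struct work_struct *work)
{
	struct stream *s = container_of(work, struct stream,
					buf_poll_work.work);

	mutex_lock(&s->xecore_buf_lock);	/* work -> xecore_buf_lock */
	/* ... scan the per-XeCore buffers ... */
	mutex_unlock(&s->xecore_buf_lock);
}

static ssize_t stream_read_locked(struct stream *s, char __user *buf,
				  size_t count)
{
	ssize_t ret = 0;

	/* stream_lock is already held by the caller */
	mutex_lock(&s->xecore_buf_lock);	/* stream_lock -> xecore_buf_lock */
	/* ... copy per-XeCore data to userspace ... */
	mutex_unlock(&s->xecore_buf_lock);
	return ret;
}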
This fixes both the lock order inversion and the deadlock. The patch looks
good too, so this is:
Reviewed-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
> This would also require additional locking with the new lock in
> the read function.
>
> <4> [787.192986] ======================================================
> <4> [787.192988] WARNING: possible circular locking dependency detected
> <4> [787.192991] 6.14.0-rc7-xe+ #1 Tainted: G U
> <4> [787.192993] ------------------------------------------------------
> <4> [787.192994] xe_eu_stall/20093 is trying to acquire lock:
> <4> [787.192996] ffff88819847e2c0 ((work_completion)
> (&(&stream->buf_poll_work)->work)), at: __flush_work+0x1f8/0x5e0
> <4> [787.193005] but task is already holding lock:
> <4> [787.193007] ffff88814ce83ba8 (&gt->eu_stall->stream_lock){+.+.}-{3:3},
> at: xe_eu_stall_stream_ioctl+0x41/0x6a0 [xe]
> <4> [787.193090] which lock already depends on the new lock.
> <4> [787.193093] the existing dependency chain (in reverse order) is:
> <4> [787.193095]
> -> #1 (&gt->eu_stall->stream_lock){+.+.}-{3:3}:
> <4> [787.193099] __mutex_lock+0xb4/0xe40
> <4> [787.193104] mutex_lock_nested+0x1b/0x30
> <4> [787.193106] eu_stall_data_buf_poll_work_fn+0x44/0x1d0 [xe]
> <4> [787.193155] process_one_work+0x21c/0x740
> <4> [787.193159] worker_thread+0x1db/0x3c0
> <4> [787.193161] kthread+0x10d/0x270
> <4> [787.193164] ret_from_fork+0x44/0x70
> <4> [787.193168] ret_from_fork_asm+0x1a/0x30
> <4> [787.193172]
> -> #0 ((work_completion)(&(&stream->buf_poll_work)->work)){+.+.}-{0:0}:
> <4> [787.193176] __lock_acquire+0x1637/0x2810
> <4> [787.193180] lock_acquire+0xc9/0x300
> <4> [787.193183] __flush_work+0x219/0x5e0
> <4> [787.193186] cancel_delayed_work_sync+0x87/0x90
> <4> [787.193189] xe_eu_stall_disable_locked+0x9a/0x260 [xe]
> <4> [787.193237] xe_eu_stall_stream_ioctl+0x5b/0x6a0 [xe]
> <4> [787.193285] __x64_sys_ioctl+0xa4/0xe0
> <4> [787.193289] x64_sys_call+0x131e/0x2650
> <4> [787.193292] do_syscall_64+0x91/0x180
> <4> [787.193295] entry_SYSCALL_64_after_hwframe+0x76/0x7e
> <4> [787.193299]
> other info that might help us debug this:
> <4> [787.193302] Possible unsafe locking scenario:
> <4> [787.193304] CPU0 CPU1
> <4> [787.193305] ---- ----
> <4> [787.193306] lock(&gt->eu_stall->stream_lock);
> <4> [787.193308] lock((work_completion)
> (&(&stream->buf_poll_work)->work));
> <4> [787.193311] lock(&gt->eu_stall->stream_lock);
> <4> [787.193313] lock((work_completion)
> (&(&stream->buf_poll_work)->work));
> <4> [787.193315]
> *** DEADLOCK ***
>
> Fixes: 760edec939685 ("drm/xe/eustall: Add support to read() and poll() EU stall data")
> Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4598
> Signed-off-by: Harish Chegondi <harish.chegondi at intel.com>
> ---
> drivers/gpu/drm/xe/xe_eu_stall.c | 11 +++++++++--
> 1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_eu_stall.c b/drivers/gpu/drm/xe/xe_eu_stall.c
> index f2bb9168967c..78f28f3c5e5c 100644
> --- a/drivers/gpu/drm/xe/xe_eu_stall.c
> +++ b/drivers/gpu/drm/xe/xe_eu_stall.c
> @@ -52,6 +52,8 @@ struct xe_eu_stall_data_stream {
>
> struct xe_gt *gt;
> struct xe_bo *bo;
> + /* Lock to protect data buffer pointers */
> + struct mutex xecore_buf_lock;
> struct per_xecore_buf *xecore_buf;
> struct {
> bool reported_to_user;
> @@ -378,7 +380,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
> u16 group, instance;
> unsigned int xecore;
>
> - mutex_lock(&gt->eu_stall->stream_lock);
> + mutex_lock(&stream->xecore_buf_lock);
> for_each_dss_steering(xecore, gt, group, instance) {
> xecore_buf = &stream->xecore_buf[xecore];
> read_ptr = xecore_buf->read;
> @@ -396,7 +398,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
> set_bit(xecore, stream->data_drop.mask);
> xecore_buf->write = write_ptr;
> }
> - mutex_unlock(&gt->eu_stall->stream_lock);
> + mutex_unlock(&stream->xecore_buf_lock);
>
> return min_data_present;
> }
> @@ -511,11 +513,13 @@ static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *st
> unsigned int xecore;
> int ret = 0;
>
> + mutex_lock(&stream->xecore_buf_lock);
> if (bitmap_weight(stream->data_drop.mask, XE_MAX_DSS_FUSE_BITS)) {
> if (!stream->data_drop.reported_to_user) {
> stream->data_drop.reported_to_user = true;
> xe_gt_dbg(gt, "EU stall data dropped in XeCores: %*pb\n",
> XE_MAX_DSS_FUSE_BITS, stream->data_drop.mask);
> + mutex_unlock(&stream->xecore_buf_lock);
> return -EIO;
> }
> stream->data_drop.reported_to_user = false;
> @@ -527,6 +531,7 @@ static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *st
> if (ret || count == total_size)
> break;
> }
> + mutex_unlock(&stream->xecore_buf_lock);
> return total_size ?: (ret ?: -EAGAIN);
> }
>
> @@ -583,6 +588,7 @@ static void xe_eu_stall_stream_free(struct xe_eu_stall_data_stream *stream)
> {
> struct xe_gt *gt = stream->gt;
>
> + mutex_destroy(&stream->xecore_buf_lock);
> gt->eu_stall->stream = NULL;
> kfree(stream);
> }
> @@ -718,6 +724,7 @@ static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
> }
>
> init_waitqueue_head(&stream->poll_wq);
> + mutex_init(&stream->xecore_buf_lock);
> INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
> stream->per_xecore_buf_size = per_xecore_buf_size;
> stream->sampling_rate_mult = props->sampling_rate_mult;
> --
> 2.48.1
>