[PATCH 1/1] drm/xe/eustall: Resolve a possible circular locking dependency

Dixit, Ashutosh ashutosh.dixit at intel.com
Tue Apr 22 00:46:09 UTC 2025


On Thu, 17 Apr 2025 17:07:17 -0700, Harish Chegondi wrote:
>

Hi Harish,

> Use a separate lock in the polling function eu_stall_data_buf_poll()
> instead of eu_stall->stream_lock. This would prevent a possible
> circular locking dependency leading to a deadlock as described below.

I was trying to avoid adding the extra lock. The result can be seen here:

	https://patchwork.freedesktop.org/series/148026/

But the result is not pretty and has its own complications, so I don't want
to pursue it. So let us look at the approach in this patch.

So previously we have the following lock order, in different code paths:

	1. eu_stall_data_buf_poll_work_fn queue_delayed_work():
	   workqueue_lock -> stream_lock

	2. Read:
	   stream_lock

	3. Enable/disable
	   stream_lock -> workqueue_lock -> stream_lock

So we have lock order inversion between 1 and 3 and also possible deadlock
in 3 itself.

After this patch we have:

	1. eu_stall_data_buf_poll_work_fn queue_delayed_work():
	   workqueue_lock -> xecore_buf_lock

	2. Read:
	   stream_lock -> xecore_buf_lock

	3. Enable/disable:
	   stream_lock -> workqueue_lock -> xecore_buf_lock

Which fixes both the lock order inversion and the deadlock. And the patch
looks good too, so this is:

Reviewed-by: Ashutosh Dixit <ashutosh.dixit at intel.com>

> Using the new lock also requires additional locking in
> the read function.
>
> <4> [787.192986] ======================================================
> <4> [787.192988] WARNING: possible circular locking dependency detected
> <4> [787.192991] 6.14.0-rc7-xe+ #1 Tainted: G     U
> <4> [787.192993] ------------------------------------------------------
> <4> [787.192994] xe_eu_stall/20093 is trying to acquire lock:
> <4> [787.192996] ffff88819847e2c0 ((work_completion)
> (&(&stream->buf_poll_work)->work)), at: __flush_work+0x1f8/0x5e0
> <4> [787.193005] but task is already holding lock:
> <4> [787.193007] ffff88814ce83ba8 (&gt->eu_stall->stream_lock){+.+.}-{3:3},
> at: xe_eu_stall_stream_ioctl+0x41/0x6a0 [xe]
> <4> [787.193090] which lock already depends on the new lock.
> <4> [787.193093] the existing dependency chain (in reverse order) is:
> <4> [787.193095]
> -> #1 (&gt->eu_stall->stream_lock){+.+.}-{3:3}:
> <4> [787.193099]        __mutex_lock+0xb4/0xe40
> <4> [787.193104]        mutex_lock_nested+0x1b/0x30
> <4> [787.193106]        eu_stall_data_buf_poll_work_fn+0x44/0x1d0 [xe]
> <4> [787.193155]        process_one_work+0x21c/0x740
> <4> [787.193159]        worker_thread+0x1db/0x3c0
> <4> [787.193161]        kthread+0x10d/0x270
> <4> [787.193164]        ret_from_fork+0x44/0x70
> <4> [787.193168]        ret_from_fork_asm+0x1a/0x30
> <4> [787.193172]
> -> #0 ((work_completion)(&(&stream->buf_poll_work)->work)){+.+.}-{0:0}:
> <4> [787.193176]        __lock_acquire+0x1637/0x2810
> <4> [787.193180]        lock_acquire+0xc9/0x300
> <4> [787.193183]        __flush_work+0x219/0x5e0
> <4> [787.193186]        cancel_delayed_work_sync+0x87/0x90
> <4> [787.193189]        xe_eu_stall_disable_locked+0x9a/0x260 [xe]
> <4> [787.193237]        xe_eu_stall_stream_ioctl+0x5b/0x6a0 [xe]
> <4> [787.193285]        __x64_sys_ioctl+0xa4/0xe0
> <4> [787.193289]        x64_sys_call+0x131e/0x2650
> <4> [787.193292]        do_syscall_64+0x91/0x180
> <4> [787.193295]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
> <4> [787.193299]
> other info that might help us debug this:
> <4> [787.193302]  Possible unsafe locking scenario:
> <4> [787.193304]        CPU0                    CPU1
> <4> [787.193305]        ----                    ----
> <4> [787.193306]   lock(&gt->eu_stall->stream_lock);
> <4> [787.193308]                        lock((work_completion)
>					(&(&stream->buf_poll_work)->work));
> <4> [787.193311]                        lock(&gt->eu_stall->stream_lock);
> <4> [787.193313]   lock((work_completion)
>			(&(&stream->buf_poll_work)->work));
> <4> [787.193315]
>  *** DEADLOCK ***
>
> Fixes: 760edec939685 ("drm/xe/eustall: Add support to read() and poll() EU stall data")
> Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4598
> Signed-off-by: Harish Chegondi <harish.chegondi at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_eu_stall.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_eu_stall.c b/drivers/gpu/drm/xe/xe_eu_stall.c
> index f2bb9168967c..78f28f3c5e5c 100644
> --- a/drivers/gpu/drm/xe/xe_eu_stall.c
> +++ b/drivers/gpu/drm/xe/xe_eu_stall.c
> @@ -52,6 +52,8 @@ struct xe_eu_stall_data_stream {
>
>	struct xe_gt *gt;
>	struct xe_bo *bo;
> +	/* Lock to protect data buffer pointers */
> +	struct mutex xecore_buf_lock;
>	struct per_xecore_buf *xecore_buf;
>	struct {
>		bool reported_to_user;
> @@ -378,7 +380,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
>	u16 group, instance;
>	unsigned int xecore;
>
> -	mutex_lock(&gt->eu_stall->stream_lock);
> +	mutex_lock(&stream->xecore_buf_lock);
>	for_each_dss_steering(xecore, gt, group, instance) {
>		xecore_buf = &stream->xecore_buf[xecore];
>		read_ptr = xecore_buf->read;
> @@ -396,7 +398,7 @@ static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
>			set_bit(xecore, stream->data_drop.mask);
>		xecore_buf->write = write_ptr;
>	}
> -	mutex_unlock(&gt->eu_stall->stream_lock);
> +	mutex_unlock(&stream->xecore_buf_lock);
>
>	return min_data_present;
>  }
> @@ -511,11 +513,13 @@ static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *st
>	unsigned int xecore;
>	int ret = 0;
>
> +	mutex_lock(&stream->xecore_buf_lock);
>	if (bitmap_weight(stream->data_drop.mask, XE_MAX_DSS_FUSE_BITS)) {
>		if (!stream->data_drop.reported_to_user) {
>			stream->data_drop.reported_to_user = true;
>			xe_gt_dbg(gt, "EU stall data dropped in XeCores: %*pb\n",
>				  XE_MAX_DSS_FUSE_BITS, stream->data_drop.mask);
> +			mutex_unlock(&stream->xecore_buf_lock);
>			return -EIO;
>		}
>		stream->data_drop.reported_to_user = false;
> @@ -527,6 +531,7 @@ static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *st
>		if (ret || count == total_size)
>			break;
>	}
> +	mutex_unlock(&stream->xecore_buf_lock);
>	return total_size ?: (ret ?: -EAGAIN);
>  }
>
> @@ -583,6 +588,7 @@ static void xe_eu_stall_stream_free(struct xe_eu_stall_data_stream *stream)
>  {
>	struct xe_gt *gt = stream->gt;
>
> +	mutex_destroy(&stream->xecore_buf_lock);
>	gt->eu_stall->stream = NULL;
>	kfree(stream);
>  }
> @@ -718,6 +724,7 @@ static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
>	}
>
>	init_waitqueue_head(&stream->poll_wq);
> +	mutex_init(&stream->xecore_buf_lock);
>	INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
>	stream->per_xecore_buf_size = per_xecore_buf_size;
>	stream->sampling_rate_mult = props->sampling_rate_mult;
> --
> 2.48.1
>


More information about the Intel-xe mailing list