[PATCH] drm/nouveau: use dedicated wq for fence uevents work

Thu Feb 22 14:44:29 UTC 2024

Using the kernel global workqueue to signal fences can lead to
unexpected deadlocks. Some other work (e.g. from a different driver)
could directly or indirectly depend on this fence to be signaled.
However, if the WQ_MAX_ACTIVE limit is reached by waiters, this can
prevent the work signaling the fence from running.

While this seems fairly unlikely, it's potentially exploitable.

Fixes: 39126abc5e20 ("nouveau: offload fence uevents work to workqueue")
Signed-off-by: Danilo Krummrich <dakr at redhat.com>
---
 drivers/gpu/drm/nouveau/nouveau_drm.c   | 13 +++++++++++--
 drivers/gpu/drm/nouveau/nouveau_drv.h   |  3 +++
 drivers/gpu/drm/nouveau/nouveau_fence.c |  3 ++-
 drivers/gpu/drm/nouveau/nouveau_fence.h |  2 ++
 4 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 6f6c31a9937b..6be202081077 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -598,9 +598,15 @@ nouveau_drm_device_init(struct drm_device *dev)
 		goto fail_alloc;
 	}
 
+	drm->fence_wq = alloc_workqueue("nouveau_fence_wq", 0, WQ_MAX_ACTIVE);
+	if (!drm->fence_wq) {
+		ret = -ENOMEM;
+		goto fail_sched_wq;
+	}
+
 	ret = nouveau_cli_init(drm, "DRM-master", &drm->master);
 	if (ret)
-		goto fail_wq;
+		goto fail_fence_wq;
 
 	ret = nouveau_cli_init(drm, "DRM", &drm->client);
 	if (ret)
@@ -670,7 +676,9 @@ nouveau_drm_device_init(struct drm_device *dev)
 	nouveau_cli_fini(&drm->client);
 fail_master:
 	nouveau_cli_fini(&drm->master);
-fail_wq:
+fail_fence_wq:
+	destroy_workqueue(drm->fence_wq);
+fail_sched_wq:
 	destroy_workqueue(drm->sched_wq);
 fail_alloc:
 	nvif_parent_dtor(&drm->parent);
@@ -725,6 +733,7 @@ nouveau_drm_device_fini(struct drm_device *dev)
 
 	nouveau_cli_fini(&drm->client);
 	nouveau_cli_fini(&drm->master);
+	destroy_workqueue(drm->fence_wq);
 	destroy_workqueue(drm->sched_wq);
 	nvif_parent_dtor(&drm->parent);
 	mutex_destroy(&drm->clients_lock);
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 8a6d94c8b163..b43619a213a4 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -261,6 +261,9 @@ struct nouveau_drm {
 	/* Workqueue used for channel schedulers. */
 	struct workqueue_struct *sched_wq;
 
+	/* Workqueue used to signal fences. */
+	struct workqueue_struct *fence_wq;
+
 	/* context for accelerated drm-internal operations */
 	struct nouveau_channel *cechan;
 	struct nouveau_channel *channel;
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 93f08f9479d8..c3ea3cd933cd 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -174,7 +174,7 @@ static int
 nouveau_fence_wait_uevent_handler(struct nvif_event *event, void *repv, u32 repc)
 {
 	struct nouveau_fence_chan *fctx = container_of(event, typeof(*fctx), event);
-	schedule_work(&fctx->uevent_work);
+	queue_work(fctx->wq, &fctx->uevent_work);
 	return NVIF_EVENT_KEEP;
 }
 
@@ -194,6 +194,7 @@ nouveau_fence_context_new(struct nouveau_channel *chan, struct nouveau_fence_cha
 	INIT_LIST_HEAD(&fctx->pending);
 	spin_lock_init(&fctx->lock);
 	fctx->context = chan->drm->runl[chan->runlist].context_base + chan->chid;
+	fctx->wq = chan->drm->fence_wq;
 
 	if (chan == chan->drm->cechan)
 		strcpy(fctx->name, "copy engine channel");
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h
index 8bc065acfe35..bc13110bdfa4 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.h
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
@@ -44,7 +44,9 @@ struct nouveau_fence_chan {
 	u32 context;
 	char name[32];
 
+	struct workqueue_struct *wq;
 	struct work_struct uevent_work;
+
 	struct nvif_event event;
 	int notify_ref, dead, killed;
 };

base-commit: 1f4c6f11a557642505e5f403e0dfabbaff9c529a
-- 
2.43.0