[RFC] locking/lockdep: Don't bunch up all flush_workqueue callers into a single completion cross-release context

Tvrtko Ursulin tursulin at ursulin.net
Fri Oct 13 10:24:04 UTC 2017


From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

It looks like all completions created by flush_workqueue map
into a single lock class, which creates lockdep false positives.

Make flush_workqueue a static inline wrapper which sets up the
on-stack flusher and hands it to a new __flush_workqueue, so that
separate lock classes get created.
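
For reference, the single class appears to come from the way the
completion's lockdep map is initialised: with completion cross-release
tracking enabled, init_completion() (and therefore
COMPLETION_INITIALIZER_ONSTACK()) defines one static lock_class_key at
its expansion site, so every flusher completion initialised from the
single site inside flush_workqueue() shares that key. Roughly (quoted
from memory, so treat it as a sketch rather than the exact tree
contents):

  #define init_completion(x)                                          \
  do {                                                                \
          static struct lock_class_key __key;                        \
          lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \
                                     "(complete)" #x, &__key, 0);    \
          __init_completion(x);                                       \
  } while (0)

With the on-stack flusher initialised from an inline wrapper in the
header instead, those expansions are no longer all funnelled through
the one spot in kernel/workqueue.c, so the flush completions stop
sharing a single class.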

Example of a false positive:

 [   20.805315] ======================================================
 [   20.805316] WARNING: possible circular locking dependency detected
 [   20.805319] 4.14.0-rc4-CI-CI_DRM_3225+ #1 Tainted: G     U
 [   20.805320] ------------------------------------------------------
 [   20.805322] kworker/6:1H/1438 is trying to acquire lock:
 [   20.805324]  (&mm->mmap_sem){++++}, at: [<ffffffffa01c8e01>] __i915_gem_userptr_get_pages_worker+0x141/0x240 [i915]
 [   20.805355] but now in release context of a crosslock acquired at the following:
 [   20.805357]  ((complete)&this_flusher.done){+.+.}, at: [<ffffffff8190b06d>] wait_for_completion+0x1d/0x20
 [   20.805363] which lock already depends on the new lock.
 [   20.805365]  the existing dependency chain (in reverse order) is:
 [   20.805367] -> #1 ((complete)&this_flusher.done){+.+.}:
 [   20.805372]        __lock_acquire+0x1420/0x15e0
 [   20.805374]        lock_acquire+0xb0/0x200
 [   20.805376]        wait_for_common+0x58/0x210
 [   20.805378]        wait_for_completion+0x1d/0x20
 [   20.805381]        flush_workqueue+0x1af/0x540
 [   20.805400]        i915_gem_userptr_mn_invalidate_range_start+0x13c/0x150 [i915]
 [   20.805404]        __mmu_notifier_invalidate_range_start+0x76/0xc0
 [   20.805406]        unmap_vmas+0x7d/0xa0
 [   20.805408]        unmap_region+0xae/0x110
 [   20.805410]        do_munmap+0x276/0x3f0
 [   20.805411]        vm_munmap+0x67/0x90
 [   20.805413]        SyS_munmap+0xe/0x20
 [   20.805415]        entry_SYSCALL_64_fastpath+0x1c/0xb1
 [   20.805416] -> #0 (&mm->mmap_sem){++++}:
 [   20.805419]        down_read+0x3e/0x70
 [   20.805435]        __i915_gem_userptr_get_pages_worker+0x141/0x240 [i915]
 [   20.805438]        process_one_work+0x233/0x660
 [   20.805440]        worker_thread+0x4e/0x3b0
 [   20.805441]        kthread+0x152/0x190
 [   20.805442] other info that might help us debug this:
 [   20.805445]  Possible unsafe locking scenario by crosslock:
 [   20.805447]        CPU0                    CPU1
 [   20.805448]        ----                    ----
 [   20.805449]   lock(&mm->mmap_sem);
 [   20.805451]   lock((complete)&this_flusher.done);
 [   20.805453]                                lock(&mm->mmap_sem);
 [   20.805455]                                unlock((complete)&this_flusher.done);
 [   20.805457] *** DEADLOCK ***
 [   20.805460] 2 locks held by kworker/6:1H/1438:
 [   20.805461]  #0:  (&(&pool->lock)->rlock){-.-.}, at: [<ffffffff8109c94c>] process_one_work+0x2dc/0x660
 [   20.805465]  #1:  (&x->wait#10){....}, at: [<ffffffff810cd69d>] complete+0x1d/0x60
 [   20.805469] stack backtrace:
 [   20.805472] CPU: 6 PID: 1438 Comm: kworker/6:1H Tainted: G     U          4.14.0-rc4-CI-CI_DRM_3225+ #1
 [   20.805474] Hardware name: MSI MS-7924/Z97M-G43(MS-7924), BIOS V1.12 02/15/2016
 [   20.805480] Call Trace:
 [   20.805483]  dump_stack+0x68/0x9f
 [   20.805486]  print_circular_bug+0x235/0x3c0
 [   20.805488]  ? HARDIRQ_verbose+0x10/0x10
 [   20.805490]  check_prev_add+0x430/0x840
 [   20.805492]  ? ret_from_fork+0x27/0x40
 [   20.805494]  lock_commit_crosslock+0x3ee/0x660
 [   20.805496]  ? lock_commit_crosslock+0x3ee/0x660
 [   20.805498]  complete+0x29/0x60
 [   20.805500]  pwq_dec_nr_in_flight+0x9c/0xa0
 [   20.805502]  ? _raw_spin_lock_irq+0x40/0x50
 [   20.805504]  process_one_work+0x335/0x660
 [   20.805506]  worker_thread+0x4e/0x3b0
 [   20.805508]  kthread+0x152/0x190
 [   20.805509]  ? process_one_work+0x660/0x660
 [   20.805511]  ? kthread_create_on_node+0x40/0x40
 [   20.805513]  ret_from_fork+0x27/0x40

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: linux-kernel at vger.kernel.org
Cc: Tejun Heo <tj at kernel.org>
Cc: Linus Torvalds <torvalds at linux-foundation.org>
Cc: Peter Zijlstra <peterz at infradead.org>
Cc: Thomas Gleixner <tglx at linutronix.de>
Cc: boqun.feng at gmail.com
Cc: byungchul.park at lge.com
Cc: david at fromorbit.com
Cc: johannes at sipsolutions.net
Cc: oleg at redhat.com
---
Borrowed the Cc list from a recent fix in the same area.
Apologies if it is too wide.

Another pair of eyes to check that my thinking here (sketched
below) is correct would also be appreciated.
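
To spell out the thinking I would like checked, here is a schematic of
the two paths lockdep ends up connecting. The function names in the
comments are taken from the splat above; the helpers, variable names
and the two-workqueue split are illustrative only, not actual i915 or
workqueue code:

  /*
   * Path 1, recorded at munmap time: mmap_sem is held, then we wait
   * for a flush of workqueue A.  This records the dependency
   * mmap_sem -> (complete)&this_flusher.done.
   */
  static void path1_invalidate_under_mmap_sem(struct mm_struct *mm,
                                              struct workqueue_struct *wq_a)
  {
          down_write(&mm->mmap_sem);
          /* i915_gem_userptr_mn_invalidate_range_start() */
          flush_workqueue(wq_a);          /* wait_for_completion() inside */
          up_write(&mm->mmap_sem);
  }

  /*
   * Path 2, where the warning fires: a worker takes mmap_sem for its
   * work item and, once the item is done, the pool completes the
   * flusher of a flush pending on workqueue B.  Cross-release then adds
   * the edge (complete)&this_flusher.done -> mmap_sem and closes the
   * cycle, because both completions share the one lock class, even
   * though wq_a and wq_b need not be related at all.
   */
  static void path2_worker_completes_other_flush(struct mm_struct *mm,
                                                 struct completion *wq_b_flusher_done)
  {
          down_read(&mm->mmap_sem);       /* __i915_gem_userptr_get_pages_worker() */
          up_read(&mm->mmap_sem);
          complete(wq_b_flusher_done);    /* pwq_dec_nr_in_flight() */
  }

If the workqueue flushed under mmap_sem and the workqueue whose flusher
the mmap_sem-taking worker ends up completing are never the same one,
then the reported cycle only exists because all flusher completions
share the single lock class, which is what this patch tries to split.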
---
 include/linux/workqueue.h | 26 +++++++++++++++++++++++++-
 kernel/workqueue.c        | 41 ++++++++++++++---------------------------
 2 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1c49431f3121..ca63d4b09110 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -12,6 +12,7 @@
 #include <linux/threads.h>
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
+#include <linux/completion.h>
 
 struct workqueue_struct;
 
@@ -449,7 +450,30 @@ extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			struct delayed_work *dwork, unsigned long delay);
 
-extern void flush_workqueue(struct workqueue_struct *wq);
+/*
+ * Structure used to wait for workqueue flush.
+ */
+struct wq_flusher {
+	struct list_head	list;		/* WQ: list of flushers */
+	int			flush_color;	/* WQ: flush color waiting for */
+	struct completion	done;		/* flush completion */
+};
+
+extern void __flush_workqueue(struct workqueue_struct *wq,
+			      struct wq_flusher *flusher);
+
+/* Static inline wrapper so completion gets separate lockdep keys. */
+static inline void flush_workqueue(struct workqueue_struct *wq)
+{
+	struct wq_flusher this_flusher = {
+		.list = LIST_HEAD_INIT(this_flusher.list),
+		.flush_color = -1,
+		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+	};
+
+	__flush_workqueue(wq, &this_flusher);
+}
+
 extern void drain_workqueue(struct workqueue_struct *wq);
 
 extern int schedule_on_each_cpu(work_func_t func);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 64d0edf428f8..c2e8f61f6b95 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,15 +221,6 @@ struct pool_workqueue {
 	struct rcu_head		rcu;
 } __aligned(1 << WORK_STRUCT_FLAG_BITS);
 
-/*
- * Structure used to wait for workqueue flush.
- */
-struct wq_flusher {
-	struct list_head	list;		/* WQ: list of flushers */
-	int			flush_color;	/* WQ: flush color waiting for */
-	struct completion	done;		/* flush completion */
-};
-
 struct wq_device;
 
 /*
@@ -2606,13 +2597,9 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
  * This function sleeps until all work items which were queued on entry
  * have finished execution, but it is not livelocked by new incoming ones.
  */
-void flush_workqueue(struct workqueue_struct *wq)
+void __flush_workqueue(struct workqueue_struct *wq,
+		       struct wq_flusher *this_flusher)
 {
-	struct wq_flusher this_flusher = {
-		.list = LIST_HEAD_INIT(this_flusher.list),
-		.flush_color = -1,
-		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
-	};
 	int next_color;
 
 	if (WARN_ON(!wq_online))
@@ -2635,14 +2622,14 @@ void flush_workqueue(struct workqueue_struct *wq)
 		 * by one.
 		 */
 		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
-		this_flusher.flush_color = wq->work_color;
+		this_flusher->flush_color = wq->work_color;
 		wq->work_color = next_color;
 
 		if (!wq->first_flusher) {
 			/* no flush in progress, become the first flusher */
-			WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
+			WARN_ON_ONCE(wq->flush_color != this_flusher->flush_color);
 
-			wq->first_flusher = &this_flusher;
+			wq->first_flusher = this_flusher;
 
 			if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
 						       wq->work_color)) {
@@ -2653,8 +2640,8 @@ void flush_workqueue(struct workqueue_struct *wq)
 			}
 		} else {
 			/* wait in queue */
-			WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
-			list_add_tail(&this_flusher.list, &wq->flusher_queue);
+			WARN_ON_ONCE(wq->flush_color == this_flusher->flush_color);
+			list_add_tail(&this_flusher->list, &wq->flusher_queue);
 			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
 		}
 	} else {
@@ -2663,14 +2650,14 @@ void flush_workqueue(struct workqueue_struct *wq)
 		 * The next flush completion will assign us
 		 * flush_color and transfer to flusher_queue.
 		 */
-		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
+		list_add_tail(&this_flusher->list, &wq->flusher_overflow);
 	}
 
 	check_flush_dependency(wq, NULL);
 
 	mutex_unlock(&wq->mutex);
 
-	wait_for_completion(&this_flusher.done);
+	wait_for_completion(&this_flusher->done);
 
 	/*
 	 * Wake-up-and-cascade phase
@@ -2678,19 +2665,19 @@ void flush_workqueue(struct workqueue_struct *wq)
 	 * First flushers are responsible for cascading flushes and
 	 * handling overflow.  Non-first flushers can simply return.
 	 */
-	if (wq->first_flusher != &this_flusher)
+	if (wq->first_flusher != this_flusher)
 		return;
 
 	mutex_lock(&wq->mutex);
 
 	/* we might have raced, check again with mutex held */
-	if (wq->first_flusher != &this_flusher)
+	if (wq->first_flusher != this_flusher)
 		goto out_unlock;
 
 	wq->first_flusher = NULL;
 
-	WARN_ON_ONCE(!list_empty(&this_flusher.list));
-	WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
+	WARN_ON_ONCE(!list_empty(&this_flusher->list));
+	WARN_ON_ONCE(wq->flush_color != this_flusher->flush_color);
 
 	while (true) {
 		struct wq_flusher *next, *tmp;
@@ -2755,7 +2742,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 out_unlock:
 	mutex_unlock(&wq->mutex);
 }
-EXPORT_SYMBOL(flush_workqueue);
+EXPORT_SYMBOL(__flush_workqueue);
 
 /**
  * drain_workqueue - drain a workqueue
-- 
2.9.5


