[PATCH] kernel/locking/semaphore: use wake_q in up()

Thu May 9 20:06:33 UTC 2019

console_trylock, called from within printk, can be called from pretty
much anywhere. Including try_to_wake_up. Note that this isn't common,
usually the box is in pretty bad shape at that point already. But it
really doesn't help when lockdep then jumps in and spams the logs,
potentially obscuring the real backtrace we're really interested in.
One case I've seen (slightly simplified backtrace):

 Call Trace:
  <IRQ>
  console_trylock+0xe/0x60
  vprintk_emit+0xf1/0x320
  printk+0x4d/0x69
  __warn_printk+0x46/0x90
  native_smp_send_reschedule+0x2f/0x40
  check_preempt_curr+0x81/0xa0
  ttwu_do_wakeup+0x14/0x220
  try_to_wake_up+0x218/0x5f0
  pollwake+0x6f/0x90
  credit_entropy_bits+0x204/0x310
  add_interrupt_randomness+0x18f/0x210
  handle_irq+0x67/0x160
  do_IRQ+0x5e/0x130
  common_interrupt+0xf/0xf
  </IRQ>

This alone isn't a problem, but the spinlock in the semaphore is also
still held while waking up waiters (up() -> __up() -> try_to_wake_up()
callchain), which then closes the runqueue vs. semaphore.lock loop,
and upsets lockdep, which issues a circular locking splat to dmesg.
Worse it upsets developers, since we don't want to spam dmesg with
clutter when the machine is dying already.

Fix this specific locking recursion by moving the wake_up_process out
from under the semaphore.lock spinlock, using wake_q as recommended by
Peter Zijlstra.

As Petr Mladek points out this doesn't fix all the locking recursions
in this area. If we actually recurse in the above callchain:

      + try_to_wake_up()    # takes p->pi_lock
        + ttwu_remote()     # takes rq lock
          + ttwu_do_wakeup()
            + check_preempt_curr()
              + native_smp_send_reschedule()
                + __warn_printk()
                  + printk()
                    + vprintk_emit()
                      + console_trylock() # success
                      + console_unlock()
                        + up_console_sem()
                          + up() # wait list is not empty
                            + __up()
                              + wake_up_process()
                                + try_to_wake_up()

Then there's any number of scheduler related locks that will deadlock.
Given that the kernel is dying already (the printk() in
native_smp_send_reschedule() happens because we run on an offlined
CPU) I think there's limited value in trying to fix this:

- We haven't seen the actual deadlock in our CI, only lockdep
  complaining about the possibility.

- The real issue is that the lockdep splat hides useful dmesg
  information we capture in e.g. pstore or on screen about the real
  cause of why the kernel is dying.

- The console_unlock in the above callchain should have managed to get
  all the dmesg up to that point out already. Dying later on is
  somewhat ok - I've only seen this lockdep splat in pstore when the
  machine died anyway.

Also cc'ing John Ogness since perhaps his printk rework fixes this all
properly.

v2: Ditch attempt to fix console_trylock.

v3: Add a comment explaining why the task we're waking won't
disappear (Chris), and improve commit message to address review
questions.

v4: Use wake_q (Peter Z).

Signed-off-by: Daniel Vetter <daniel.vetter at intel.com>
Cc: Peter Zijlstra <peterz at infradead.org>
Cc: Ingo Molnar <mingo at redhat.com>
Cc: Will Deacon <will.deacon at arm.com>
Cc: Petr Mladek <pmladek at suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky at gmail.com>
Cc: Steven Rostedt <rostedt at goodmis.org>
Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
Cc: John Ogness <john.ogness at linutronix.de>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: linux-kernel at vger.kernel.org
Signed-off-by: Daniel Vetter <daniel.vetter at ffwll.ch>
---
 kernel/locking/semaphore.c | 42 +++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 561acdd39960..7a6f33715688 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -33,12 +33,12 @@
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
 #include <linux/ftrace.h>
+#include <linux/sched/wake_q.h>
 
 static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
 static noinline int __down_killable(struct semaphore *sem);
 static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
 
 /**
  * down - acquire the semaphore
@@ -169,6 +169,14 @@ int down_timeout(struct semaphore *sem, long timeout)
 }
 EXPORT_SYMBOL(down_timeout);
 
+/* Functions for the contended case */
+
+struct semaphore_waiter {
+	struct list_head list;
+	struct task_struct *task;
+	bool up;
+};
+
 /**
  * up - release the semaphore
  * @sem: the semaphore to release
@@ -179,24 +187,25 @@ EXPORT_SYMBOL(down_timeout);
 void up(struct semaphore *sem)
 {
 	unsigned long flags;
+	struct semaphore_waiter *waiter;
+	DEFINE_WAKE_Q(wake_q);
 
 	raw_spin_lock_irqsave(&sem->lock, flags);
-	if (likely(list_empty(&sem->wait_list)))
+	if (likely(list_empty(&sem->wait_list))) {
 		sem->count++;
-	else
-		__up(sem);
+	} else {
+		waiter =  list_first_entry(&sem->wait_list,
+					   struct semaphore_waiter, list);
+		list_del(&waiter->list);
+		waiter->up = true;
+		wake_q_add(&wake_q, waiter->task);
+	}
 	raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+	wake_up_q(&wake_q);
 }
 EXPORT_SYMBOL(up);
 
-/* Functions for the contended case */
-
-struct semaphore_waiter {
-	struct list_head list;
-	struct task_struct *task;
-	bool up;
-};
-
 /*
  * Because this function is inlined, the 'state' parameter will be
  * constant, and thus optimised away by the compiler.  Likewise the
@@ -252,12 +261,3 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
 {
 	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
 }
-
-static noinline void __sched __up(struct semaphore *sem)
-{
-	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
-						struct semaphore_waiter, list);
-	list_del(&waiter->list);
-	waiter->up = true;
-	wake_up_process(waiter->task);
-}
-- 
2.20.1