From b4abf91047cf054f203dcfac97e1038388826937 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Jan 2016 11:25:38 +0100
Subject: rtmutex: Make wait_lock irq safe

Sasha reported a lockdep splat about a potential deadlock between RCU boosting
rtmutex and the posix timer it_lock.

CPU0					CPU1

rtmutex_lock(&rcu->rt_mutex)
  spin_lock(&rcu->rt_mutex.wait_lock)
					local_irq_disable()
					spin_lock(&timer->it_lock)
					spin_lock(&rcu->mutex.wait_lock)
--> Interrupt
    spin_lock(&timer->it_lock)

This is caused by the following code sequence on CPU1

     rcu_read_lock()
     x = lookup();
     if (x)
     	spin_lock_irqsave(&x->it_lock);
     rcu_read_unlock();
     return x;

We could fix that in the posix timer code by keeping rcu read locked across
the spinlocked and irq disabled section, but the above sequence is common and
there is no reason not to support it.

Taking rt_mutex.wait_lock irq safe prevents the deadlock.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/locking/rtmutex.c | 135 +++++++++++++++++++++++++----------------------
 1 file changed, 72 insertions(+), 63 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8251e75dd9c0..3e746607abe5 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
  * 2) Drop lock->wait_lock
  * 3) Try to unlock the lock with cmpxchg
  */
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+					unsigned long flags)
 	__releases(lock->wait_lock)
 {
 	struct task_struct *owner = rt_mutex_owner(lock);
 
 	clear_rt_mutex_waiters(lock);
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 	/*
 	 * If a new waiter comes in between the unlock and the cmpxchg
 	 * we have two situations:
@@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 /*
  * Simple slow path only version: lock->owner is protected by lock->wait_lock.
  */
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+					unsigned long flags)
 	__releases(lock->wait_lock)
 {
 	lock->owner = NULL;
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 	return true;
 }
 #endif
@@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	int ret = 0, depth = 0;
 	struct rt_mutex *lock;
 	bool detect_deadlock;
-	unsigned long flags;
 	bool requeue = true;
 
 	detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
@@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	/*
 	 * [1] Task cannot go away as we did a get_task() before !
 	 */
-	raw_spin_lock_irqsave(&task->pi_lock, flags);
+	raw_spin_lock_irq(&task->pi_lock);
 
 	/*
 	 * [2] Get the waiter on which @task is blocked on.
@@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	 * operations.
 	 */
 	if (!raw_spin_trylock(&lock->wait_lock)) {
-		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+		raw_spin_unlock_irq(&task->pi_lock);
 		cpu_relax();
 		goto retry;
 	}
@@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		/*
 		 * No requeue[7] here. Just release @task [8]
 		 */
-		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+		raw_spin_unlock(&task->pi_lock);
 		put_task_struct(task);
 
 		/*
@@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		 * If there is no owner of the lock, end of chain.
 		 */
 		if (!rt_mutex_owner(lock)) {
-			raw_spin_unlock(&lock->wait_lock);
+			raw_spin_unlock_irq(&lock->wait_lock);
 			return 0;
 		}
 
 		/* [10] Grab the next task, i.e. owner of @lock */
 		task = rt_mutex_owner(lock);
 		get_task_struct(task);
-		raw_spin_lock_irqsave(&task->pi_lock, flags);
+		raw_spin_lock(&task->pi_lock);
 
 		/*
 		 * No requeue [11] here. We just do deadlock detection.
@@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		top_waiter = rt_mutex_top_waiter(lock);
 
 		/* [13] Drop locks */
-		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-		raw_spin_unlock(&lock->wait_lock);
+		raw_spin_unlock(&task->pi_lock);
+		raw_spin_unlock_irq(&lock->wait_lock);
 
 		/* If owner is not blocked, end of chain. */
 		if (!next_lock)
@@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	rt_mutex_enqueue(lock, waiter);
 
 	/* [8] Release the task */
-	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+	raw_spin_unlock(&task->pi_lock);
 	put_task_struct(task);
 
 	/*
@@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		 */
 		if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
 			wake_up_process(rt_mutex_top_waiter(lock)->task);
-		raw_spin_unlock(&lock->wait_lock);
+		raw_spin_unlock_irq(&lock->wait_lock);
 		return 0;
 	}
 
 	/* [10] Grab the next task, i.e. the owner of @lock */
 	task = rt_mutex_owner(lock);
 	get_task_struct(task);
-	raw_spin_lock_irqsave(&task->pi_lock, flags);
+	raw_spin_lock(&task->pi_lock);
 
 	/* [11] requeue the pi waiters if necessary */
 	if (waiter == rt_mutex_top_waiter(lock)) {
@@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	top_waiter = rt_mutex_top_waiter(lock);
 
 	/* [13] Drop the locks */
-	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock(&task->pi_lock);
+	raw_spin_unlock_irq(&lock->wait_lock);
 
 	/*
 	 * Make the actual exit decisions [12], based on the stored
@@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	goto again;
 
  out_unlock_pi:
-	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+	raw_spin_unlock_irq(&task->pi_lock);
  out_put_task:
 	put_task_struct(task);
 
@@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 /*
  * Try to take an rt-mutex
  *
- * Must be called with lock->wait_lock held.
+ * Must be called with lock->wait_lock held and interrupts disabled
  *
  * @lock:   The lock to be acquired.
  * @task:   The task which wants to acquire the lock
@@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 				struct rt_mutex_waiter *waiter)
 {
-	unsigned long flags;
-
 	/*
 	 * Before testing whether we can acquire @lock, we set the
 	 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 	 * case, but conditionals are more expensive than a redundant
 	 * store.
 	 */
-	raw_spin_lock_irqsave(&task->pi_lock, flags);
+	raw_spin_lock(&task->pi_lock);
 	task->pi_blocked_on = NULL;
 	/*
 	 * Finish the lock acquisition. @task is the new owner. If
@@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 	 */
 	if (rt_mutex_has_waiters(lock))
 		rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
-	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+	raw_spin_unlock(&task->pi_lock);
 
 takeit:
 	/* We got the lock. */
@@ -883,7 +882,7 @@ takeit:
  *
  * Prepare waiter and propagate pi chain
  *
- * This must be called with lock->wait_lock held.
+ * This must be called with lock->wait_lock held and interrupts disabled
  */
 static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 				   struct rt_mutex_waiter *waiter,
@@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	struct rt_mutex_waiter *top_waiter = waiter;
 	struct rt_mutex *next_lock;
 	int chain_walk = 0, res;
-	unsigned long flags;
 
 	/*
 	 * Early deadlock detection. We really don't want the task to
@@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	if (owner == task)
 		return -EDEADLK;
 
-	raw_spin_lock_irqsave(&task->pi_lock, flags);
+	raw_spin_lock(&task->pi_lock);
 	__rt_mutex_adjust_prio(task);
 	waiter->task = task;
 	waiter->lock = lock;
@@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 
 	task->pi_blocked_on = waiter;
 
-	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+	raw_spin_unlock(&task->pi_lock);
 
 	if (!owner)
 		return 0;
 
-	raw_spin_lock_irqsave(&owner->pi_lock, flags);
+	raw_spin_lock(&owner->pi_lock);
 	if (waiter == rt_mutex_top_waiter(lock)) {
 		rt_mutex_dequeue_pi(owner, top_waiter);
 		rt_mutex_enqueue_pi(owner, waiter);
@@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	/* Store the lock on which owner is blocked or NULL */
 	next_lock = task_blocked_on_lock(owner);
 
-	raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+	raw_spin_unlock(&owner->pi_lock);
 	/*
 	 * Even if full deadlock detection is on, if the owner is not
 	 * blocked itself, we can avoid finding this out in the chain
@@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	 */
 	get_task_struct(owner);
 
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock_irq(&lock->wait_lock);
 
 	res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
 					 next_lock, waiter, task);
 
-	raw_spin_lock(&lock->wait_lock);
+	raw_spin_lock_irq(&lock->wait_lock);
 
 	return res;
 }
@@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
  * Remove the top waiter from the current tasks pi waiter tree and
  * queue it up.
  *
- * Called with lock->wait_lock held.
+ * Called with lock->wait_lock held and interrupts disabled.
  */
 static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
 				    struct rt_mutex *lock)
 {
 	struct rt_mutex_waiter *waiter;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&current->pi_lock, flags);
+	raw_spin_lock(&current->pi_lock);
 
 	waiter = rt_mutex_top_waiter(lock);
 
@@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
 	 */
 	lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
 
-	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+	raw_spin_unlock(&current->pi_lock);
 
 	wake_q_add(wake_q, waiter->task);
 }
@@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
 /*
  * Remove a waiter from a lock and give up
  *
- * Must be called with lock->wait_lock held and
+ * Must be called with lock->wait_lock held and interrupts disabled. I must
  * have just failed to try_to_take_rt_mutex().
  */
 static void remove_waiter(struct rt_mutex *lock,
@@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock,
 	bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
 	struct task_struct *owner = rt_mutex_owner(lock);
 	struct rt_mutex *next_lock;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&current->pi_lock, flags);
+	raw_spin_lock(&current->pi_lock);
 	rt_mutex_dequeue(lock, waiter);
 	current->pi_blocked_on = NULL;
-	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+	raw_spin_unlock(&current->pi_lock);
 
 	/*
 	 * Only update priority if the waiter was the highest priority
@@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock,
 	if (!owner || !is_top_waiter)
 		return;
 
-	raw_spin_lock_irqsave(&owner->pi_lock, flags);
+	raw_spin_lock(&owner->pi_lock);
 
 	rt_mutex_dequeue_pi(owner, waiter);
 
@@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock,
 	/* Store the lock on which owner is blocked or NULL */
 	next_lock = task_blocked_on_lock(owner);
 
-	raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+	raw_spin_unlock(&owner->pi_lock);
 
 	/*
 	 * Don't walk the chain, if the owner task is not blocked
@@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock,
 	/* gets dropped in rt_mutex_adjust_prio_chain()! */
 	get_task_struct(owner);
 
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock_irq(&lock->wait_lock);
 
 	rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
 				   next_lock, NULL, current);
 
-	raw_spin_lock(&lock->wait_lock);
+	raw_spin_lock_irq(&lock->wait_lock);
 }
 
 /*
@@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task)
  * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
  * @lock:		 the rt_mutex to take
  * @state:		 the state the task should block in (TASK_INTERRUPTIBLE
- * 			 or TASK_UNINTERRUPTIBLE)
+ *			 or TASK_UNINTERRUPTIBLE)
  * @timeout:		 the pre-initialized and started timer, or NULL for none
  * @waiter:		 the pre-initialized rt_mutex_waiter
  *
- * lock->wait_lock must be held by the caller.
+ * Must be called with lock->wait_lock held and interrupts disabled
  */
 static int __sched
 __rt_mutex_slowlock(struct rt_mutex *lock, int state,
@@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
 				break;
 		}
 
-		raw_spin_unlock(&lock->wait_lock);
+		raw_spin_unlock_irq(&lock->wait_lock);
 
 		debug_rt_mutex_print_deadlock(waiter);
 
 		schedule();
 
-		raw_spin_lock(&lock->wait_lock);
+		raw_spin_lock_irq(&lock->wait_lock);
 		set_current_state(state);
 	}
 
@@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 		  enum rtmutex_chainwalk chwalk)
 {
 	struct rt_mutex_waiter waiter;
+	unsigned long flags;
 	int ret = 0;
 
 	debug_rt_mutex_init_waiter(&waiter);
 	RB_CLEAR_NODE(&waiter.pi_tree_entry);
 	RB_CLEAR_NODE(&waiter.tree_entry);
 
-	raw_spin_lock(&lock->wait_lock);
+	/*
+	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
+	 * be called in early boot if the cmpxchg() fast path is disabled
+	 * (debug, no architecture support). In this case we will acquire the
+	 * rtmutex with lock->wait_lock held. But we cannot unconditionally
+	 * enable interrupts in that early boot case. So we need to use the
+	 * irqsave/restore variants.
+	 */
+	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 
 	/* Try to acquire the lock again: */
 	if (try_to_take_rt_mutex(lock, current, NULL)) {
-		raw_spin_unlock(&lock->wait_lock);
+		raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 		return 0;
 	}
 
@@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	 */
 	fixup_rt_mutex_waiters(lock);
 
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	/* Remove pending timer: */
 	if (unlikely(timeout))
@@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
  */
 static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 {
+	unsigned long flags;
 	int ret;
 
 	/*
@@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 		return 0;
 
 	/*
-	 * The mutex has currently no owner. Lock the wait lock and
-	 * try to acquire the lock.
+	 * The mutex has currently no owner. Lock the wait lock and try to
+	 * acquire the lock. We use irqsave here to support early boot calls.
 	 */
-	raw_spin_lock(&lock->wait_lock);
+	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 
 	ret = try_to_take_rt_mutex(lock, current, NULL);
 
@@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 	 */
 	fixup_rt_mutex_waiters(lock);
 
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	return ret;
 }
@@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
 					struct wake_q_head *wake_q)
 {
-	raw_spin_lock(&lock->wait_lock);
+	unsigned long flags;
+
+	/* irqsave required to support early boot calls */
+	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 
 	debug_rt_mutex_unlock(lock);
 
@@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
 	 */
 	while (!rt_mutex_has_waiters(lock)) {
 		/* Drops lock->wait_lock ! */
-		if (unlock_rt_mutex_safe(lock) == true)
+		if (unlock_rt_mutex_safe(lock, flags) == true)
 			return false;
 		/* Relock the rtmutex and try again */
-		raw_spin_lock(&lock->wait_lock);
+		raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	}
 
 	/*
@@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
 	 */
 	mark_wakeup_next_waiter(wake_q, lock);
 
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	/* check PI boosting */
 	return true;
@@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 {
 	int ret;
 
-	raw_spin_lock(&lock->wait_lock);
+	raw_spin_lock_irq(&lock->wait_lock);
 
 	if (try_to_take_rt_mutex(lock, task, NULL)) {
-		raw_spin_unlock(&lock->wait_lock);
+		raw_spin_unlock_irq(&lock->wait_lock);
 		return 1;
 	}
 
@@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 	if (unlikely(ret))
 		remove_waiter(lock, waiter);
 
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock_irq(&lock->wait_lock);
 
 	debug_rt_mutex_print_deadlock(waiter);
 
@@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 {
 	int ret;
 
-	raw_spin_lock(&lock->wait_lock);
+	raw_spin_lock_irq(&lock->wait_lock);
 
 	set_current_state(TASK_INTERRUPTIBLE);
 
@@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 	 */
 	fixup_rt_mutex_waiters(lock);
 
-	raw_spin_unlock(&lock->wait_lock);
+	raw_spin_unlock_irq(&lock->wait_lock);
 
 	return ret;
 }
-- 
cgit 


From 8a5fd56431fe1682e870bd6ab0c276e74befbeb9 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Thu, 4 Feb 2016 14:40:40 +0100
Subject: locking/lockdep: Fix stack trace caching logic

check_prev_add() caches saved stack trace in static trace variable
to avoid duplicate save_trace() calls in dependencies involving trylocks.
But that caching logic contains a bug. We may not save trace on first
iteration due to early return from check_prev_add(). Then on the
second iteration when we actually need the trace we don't save it
because we think that we've already saved it.

Let check_prev_add() itself control when stack is saved.

There is another bug. Trace variable is protected by graph lock.
But we can temporary release graph lock during printing.

Fix this by invalidating cached stack trace when we release graph lock.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: glider@google.com
Cc: kcc@google.com
Cc: peter@hurleysoftware.com
Cc: sasha.levin@oracle.com
Link: http://lkml.kernel.org/r/1454593240-121647-1-git-send-email-dvyukov@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 60ace56618f6..c7710e4092ef 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
  */
 static int
 check_prev_add(struct task_struct *curr, struct held_lock *prev,
-	       struct held_lock *next, int distance, int trylock_loop)
+	       struct held_lock *next, int distance, int *stack_saved)
 {
 	struct lock_list *entry;
 	int ret;
@@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		}
 	}
 
-	if (!trylock_loop && !save_trace(&trace))
-		return 0;
+	if (!*stack_saved) {
+		if (!save_trace(&trace))
+			return 0;
+		*stack_saved = 1;
+	}
 
 	/*
 	 * Ok, all validations passed, add the new lock
@@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 * Debugging printouts:
 	 */
 	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+		/* We drop graph lock, so another thread can overwrite trace. */
+		*stack_saved = 0;
 		graph_unlock();
 		printk("\n new dependency: ");
 		print_lock_name(hlock_class(prev));
@@ -1929,7 +1934,7 @@ static int
 check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
 	int depth = curr->lockdep_depth;
-	int trylock_loop = 0;
+	int stack_saved = 0;
 	struct held_lock *hlock;
 
 	/*
@@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 */
 		if (hlock->read != 2 && hlock->check) {
 			if (!check_prev_add(curr, hlock, next,
-						distance, trylock_loop))
+						distance, &stack_saved))
 				return 0;
 			/*
 			 * Stop after the first non-trylock entry,
@@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		if (curr->held_locks[depth].irq_context !=
 				curr->held_locks[depth-1].irq_context)
 			break;
-		trylock_loop = 1;
 	}
 	return 1;
 out_bug:
-- 
cgit 


From a63f38cc4ccfa076f87fc3d0c276ee62e710f953 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 3 Feb 2016 13:44:12 -0800
Subject: locking/lockdep: Convert hash tables to hlists

Mike said:

: CONFIG_UBSAN_ALIGNMENT breaks x86-64 kernel with lockdep enabled, i.e.
: kernel with CONFIG_UBSAN_ALIGNMENT=y fails to load without even any error
: message.
:
: The problem is that ubsan callbacks use spinlocks and might be called
: before lockdep is initialized.  Particularly this line in the
: reserve_ebda_region function causes problem:
:
: lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
:
: If i put lockdep_init() before reserve_ebda_region call in
: x86_64_start_reservations kernel loads well.

Fix this ordering issue permanently: change lockdep so that it uses hlists
for the hash tables.  Unlike a list_head, an hlist_head is in its
initialized state when it is all-zeroes, so lockdep is ready for operation
immediately upon boot - lockdep_init() need not have run.

The patch will also save some memory.

Probably lockdep_init() and lockdep_initialized can be done away with now.

Suggested-by: Mike Krinkin <krinkin.m.u@gmail.com>
Reported-by: Mike Krinkin <krinkin.m.u@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Cc: mm-commits@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h  |  4 ++--
 kernel/locking/lockdep.c | 42 +++++++++++++++++++-----------------------
 2 files changed, 21 insertions(+), 25 deletions(-)

(limited to 'kernel/locking')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index c57e424d914b..4dca42fd32f5 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -66,7 +66,7 @@ struct lock_class {
 	/*
 	 * class-hash:
 	 */
-	struct list_head		hash_entry;
+	struct hlist_node		hash_entry;
 
 	/*
 	 * global list of all lock-classes:
@@ -199,7 +199,7 @@ struct lock_chain {
 	u8				irq_context;
 	u8				depth;
 	u16				base;
-	struct list_head		entry;
+	struct hlist_node		entry;
 	u64				chain_key;
 };
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c7710e4092ef..716547fdb873 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes);
 #define __classhashfn(key)	hash_long((unsigned long)key, CLASSHASH_BITS)
 #define classhashentry(key)	(classhash_table + __classhashfn((key)))
 
-static struct list_head classhash_table[CLASSHASH_SIZE];
+static struct hlist_head classhash_table[CLASSHASH_SIZE];
 
 /*
  * We put the lock dependency chains into a hash-table as well, to cache
@@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE];
 #define __chainhashfn(chain)	hash_long(chain, CHAINHASH_BITS)
 #define chainhashentry(chain)	(chainhash_table + __chainhashfn((chain)))
 
-static struct list_head chainhash_table[CHAINHASH_SIZE];
+static struct hlist_head chainhash_table[CHAINHASH_SIZE];
 
 /*
  * The hash key of the lock dependency chains is a hash itself too:
@@ -666,7 +666,7 @@ static inline struct lock_class *
 look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 {
 	struct lockdep_subclass_key *key;
-	struct list_head *hash_head;
+	struct hlist_head *hash_head;
 	struct lock_class *class;
 
 #ifdef CONFIG_DEBUG_LOCKDEP
@@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return NULL;
 
-	list_for_each_entry_rcu(class, hash_head, hash_entry) {
+	hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key) {
 			/*
 			 * Huh! same key, different name? Did someone trample
@@ -742,7 +742,7 @@ static inline struct lock_class *
 register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 {
 	struct lockdep_subclass_key *key;
-	struct list_head *hash_head;
+	struct hlist_head *hash_head;
 	struct lock_class *class;
 
 	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 * We have to do the hash-walk again, to avoid races
 	 * with another CPU:
 	 */
-	list_for_each_entry_rcu(class, hash_head, hash_entry) {
+	hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
 		if (class->key == key)
 			goto out_unlock_set;
 	}
@@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 * We use RCU's safe list-add method to make
 	 * parallel walking of the hash-list safe:
 	 */
-	list_add_tail_rcu(&class->hash_entry, hash_head);
+	hlist_add_head_rcu(&class->hash_entry, hash_head);
 	/*
 	 * Add it to the global list of classes:
 	 */
@@ -2021,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 				     u64 chain_key)
 {
 	struct lock_class *class = hlock_class(hlock);
-	struct list_head *hash_head = chainhashentry(chain_key);
+	struct hlist_head *hash_head = chainhashentry(chain_key);
 	struct lock_chain *chain;
 	struct held_lock *hlock_curr;
 	int i, j;
@@ -2037,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	 * We can walk it lock-free, because entries only get added
 	 * to the hash:
 	 */
-	list_for_each_entry_rcu(chain, hash_head, entry) {
+	hlist_for_each_entry_rcu(chain, hash_head, entry) {
 		if (chain->chain_key == chain_key) {
 cache_hit:
 			debug_atomic_inc(chain_lookup_hits);
@@ -2061,7 +2061,7 @@ cache_hit:
 	/*
 	 * We have to walk the chain again locked - to avoid duplicates:
 	 */
-	list_for_each_entry(chain, hash_head, entry) {
+	hlist_for_each_entry(chain, hash_head, entry) {
 		if (chain->chain_key == chain_key) {
 			graph_unlock();
 			goto cache_hit;
@@ -2095,7 +2095,7 @@ cache_hit:
 		}
 		chain_hlocks[chain->base + j] = class - lock_classes;
 	}
-	list_add_tail_rcu(&chain->entry, hash_head);
+	hlist_add_head_rcu(&chain->entry, hash_head);
 	debug_atomic_inc(chain_lookup_misses);
 	inc_chains();
 
@@ -3879,7 +3879,7 @@ void lockdep_reset(void)
 	nr_process_chains = 0;
 	debug_locks = 1;
 	for (i = 0; i < CHAINHASH_SIZE; i++)
-		INIT_LIST_HEAD(chainhash_table + i);
+		INIT_HLIST_HEAD(chainhash_table + i);
 	raw_local_irq_restore(flags);
 }
 
@@ -3898,7 +3898,7 @@ static void zap_class(struct lock_class *class)
 	/*
 	 * Unhash the class and remove it from the all_lock_classes list:
 	 */
-	list_del_rcu(&class->hash_entry);
+	hlist_del_rcu(&class->hash_entry);
 	list_del_rcu(&class->lock_entry);
 
 	RCU_INIT_POINTER(class->key, NULL);
@@ -3921,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size)
 void lockdep_free_key_range(void *start, unsigned long size)
 {
 	struct lock_class *class;
-	struct list_head *head;
+	struct hlist_head *head;
 	unsigned long flags;
 	int i;
 	int locked;
@@ -3934,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 	 */
 	for (i = 0; i < CLASSHASH_SIZE; i++) {
 		head = classhash_table + i;
-		if (list_empty(head))
-			continue;
-		list_for_each_entry_rcu(class, head, hash_entry) {
+		hlist_for_each_entry_rcu(class, head, hash_entry) {
 			if (within(class->key, start, size))
 				zap_class(class);
 			else if (within(class->name, start, size))
@@ -3966,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
 void lockdep_reset_lock(struct lockdep_map *lock)
 {
 	struct lock_class *class;
-	struct list_head *head;
+	struct hlist_head *head;
 	unsigned long flags;
 	int i, j;
 	int locked;
@@ -3991,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 	locked = graph_lock();
 	for (i = 0; i < CLASSHASH_SIZE; i++) {
 		head = classhash_table + i;
-		if (list_empty(head))
-			continue;
-		list_for_each_entry_rcu(class, head, hash_entry) {
+		hlist_for_each_entry_rcu(class, head, hash_entry) {
 			int match = 0;
 
 			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
@@ -4031,10 +4027,10 @@ void lockdep_init(void)
 		return;
 
 	for (i = 0; i < CLASSHASH_SIZE; i++)
-		INIT_LIST_HEAD(classhash_table + i);
+		INIT_HLIST_HEAD(classhash_table + i);
 
 	for (i = 0; i < CHAINHASH_SIZE; i++)
-		INIT_LIST_HEAD(chainhash_table + i);
+		INIT_HLIST_HEAD(chainhash_table + i);
 
 	lockdep_initialized = 1;
 }
-- 
cgit 


From 06bea3dbfe6a4c333c4333362c46bdf4d9e43504 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Thu, 4 Feb 2016 11:29:36 -0800
Subject: locking/lockdep: Eliminate lockdep_init()

Lockdep is initialized at compile time now.  Get rid of lockdep_init().

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Krinkin <krinkin.m.u@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Cc: mm-commits@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/c6x/kernel/setup.c                       |  2 -
 arch/microblaze/kernel/setup.c                |  2 -
 arch/powerpc/kernel/setup_32.c                |  2 -
 arch/powerpc/kernel/setup_64.c                |  3 --
 arch/s390/kernel/early.c                      |  1 -
 arch/sparc/kernel/head_64.S                   |  8 ----
 arch/x86/lguest/boot.c                        |  6 ---
 include/linux/lockdep.h                       |  2 -
 init/main.c                                   |  5 ---
 kernel/locking/lockdep.c                      | 59 ---------------------------
 tools/lib/lockdep/common.c                    |  5 ---
 tools/lib/lockdep/include/liblockdep/common.h |  1 -
 tools/lib/lockdep/preload.c                   |  2 -
 13 files changed, 98 deletions(-)

(limited to 'kernel/locking')

diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c
index 72e17f7ebd6f..786e36e2f61d 100644
--- a/arch/c6x/kernel/setup.c
+++ b/arch/c6x/kernel/setup.c
@@ -281,8 +281,6 @@ notrace void __init machine_init(unsigned long dt_ptr)
 	 */
 	set_ist(_vectors_start);
 
-	lockdep_init();
-
 	/*
 	 * dtb is passed in from bootloader.
 	 * fdt is linked in blob.
diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c
index 89a2a9394927..f31ebb5dc26c 100644
--- a/arch/microblaze/kernel/setup.c
+++ b/arch/microblaze/kernel/setup.c
@@ -130,8 +130,6 @@ void __init machine_early_init(const char *cmdline, unsigned int ram,
 	memset(__bss_start, 0, __bss_stop-__bss_start);
 	memset(_ssbss, 0, _esbss-_ssbss);
 
-	lockdep_init();
-
 /* initialize device tree for usage in early_printk */
 	early_init_devtree(_fdt_start);
 
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index ad8c9db61237..d544fa311757 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -114,8 +114,6 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */
 
 notrace void __init machine_init(u64 dt_ptr)
 {
-	lockdep_init();
-
 	/* Enable early debugging if any specified (see udbg.h) */
 	udbg_early_init();
 
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 5c03a6a9b054..f98be8383a39 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -255,9 +255,6 @@ void __init early_setup(unsigned long dt_ptr)
 	setup_paca(&boot_paca);
 	fixup_boot_paca();
 
-	/* Initialize lockdep early or else spinlocks will blow */
-	lockdep_init();
-
 	/* -------- printk is now safe to use ------- */
 
 	/* Enable early debugging if any specified (see udbg.h) */
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index c55576bbaa1f..a0684de5a93b 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -448,7 +448,6 @@ void __init startup_init(void)
 	rescue_initrd();
 	clear_bss_section();
 	init_kernel_storage_key();
-	lockdep_init();
 	lockdep_off();
 	setup_lowcore_early();
 	setup_facility_list();
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index f2d30cab5b3f..cd1f592cd347 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -696,14 +696,6 @@ tlb_fixup_done:
 	call	__bzero
 	 sub	%o1, %o0, %o1
 
-#ifdef CONFIG_LOCKDEP
-	/* We have this call this super early, as even prom_init can grab
-	 * spinlocks and thus call into the lockdep code.
-	 */
-	call	lockdep_init
-	 nop
-#endif
-
 	call	prom_init
 	 mov	%l7, %o0			! OpenPROM cif handler
 
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4ba229ac3f4f..f56cc418c87d 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1520,12 +1520,6 @@ __init void lguest_init(void)
 	 */
 	reserve_top_address(lguest_data.reserve_mem);
 
-	/*
-	 * If we don't initialize the lock dependency checker now, it crashes
-	 * atomic_notifier_chain_register, then paravirt_disable_iospace.
-	 */
-	lockdep_init();
-
 	/* Hook in our special panic hypercall code. */
 	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
 
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 4dca42fd32f5..d026b190c530 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -261,7 +261,6 @@ struct held_lock {
 /*
  * Initialization, self-test and debugging-output methods:
  */
-extern void lockdep_init(void);
 extern void lockdep_info(void);
 extern void lockdep_reset(void);
 extern void lockdep_reset_lock(struct lockdep_map *lock);
@@ -392,7 +391,6 @@ static inline void lockdep_on(void)
 # define lockdep_set_current_reclaim_state(g)	do { } while (0)
 # define lockdep_clear_current_reclaim_state()	do { } while (0)
 # define lockdep_trace_alloc(g)			do { } while (0)
-# define lockdep_init()				do { } while (0)
 # define lockdep_info()				do { } while (0)
 # define lockdep_init_map(lock, name, key, sub) \
 		do { (void)(name); (void)(key); } while (0)
diff --git a/init/main.c b/init/main.c
index 58c9e374704b..b3008bcfb1dc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -499,11 +499,6 @@ asmlinkage __visible void __init start_kernel(void)
 	char *command_line;
 	char *after_dashes;
 
-	/*
-	 * Need to run as early as possible, to initialize the
-	 * lockdep hash:
-	 */
-	lockdep_init();
 	set_task_stack_end_magic(&init_task);
 	smp_setup_processor_id();
 	debug_objects_early_init();
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 716547fdb873..3261214323fa 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void)
 	return ret;
 }
 
-static int lockdep_initialized;
-
 unsigned long nr_list_entries;
 static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
 
@@ -433,19 +431,6 @@ unsigned int nr_process_chains;
 unsigned int max_lockdep_depth;
 
 #ifdef CONFIG_DEBUG_LOCKDEP
-/*
- * We cannot printk in early bootup code. Not even early_printk()
- * might work. So we mark any initialization errors and printk
- * about it later on, in lockdep_info().
- */
-static int lockdep_init_error;
-static const char *lock_init_error;
-static unsigned long lockdep_init_trace_data[20];
-static struct stack_trace lockdep_init_trace = {
-	.max_entries = ARRAY_SIZE(lockdep_init_trace_data),
-	.entries = lockdep_init_trace_data,
-};
-
 /*
  * Various lockdep statistics:
  */
@@ -669,20 +654,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	struct hlist_head *hash_head;
 	struct lock_class *class;
 
-#ifdef CONFIG_DEBUG_LOCKDEP
-	/*
-	 * If the architecture calls into lockdep before initializing
-	 * the hashes then we'll warn about it later. (we cannot printk
-	 * right now)
-	 */
-	if (unlikely(!lockdep_initialized)) {
-		lockdep_init();
-		lockdep_init_error = 1;
-		lock_init_error = lock->name;
-		save_stack_trace(&lockdep_init_trace);
-	}
-#endif
-
 	if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
 		debug_locks_off();
 		printk(KERN_ERR
@@ -4013,28 +3984,6 @@ out_restore:
 	raw_local_irq_restore(flags);
 }
 
-void lockdep_init(void)
-{
-	int i;
-
-	/*
-	 * Some architectures have their own start_kernel()
-	 * code which calls lockdep_init(), while we also
-	 * call lockdep_init() from the start_kernel() itself,
-	 * and we want to initialize the hashes only once:
-	 */
-	if (lockdep_initialized)
-		return;
-
-	for (i = 0; i < CLASSHASH_SIZE; i++)
-		INIT_HLIST_HEAD(classhash_table + i);
-
-	for (i = 0; i < CHAINHASH_SIZE; i++)
-		INIT_HLIST_HEAD(chainhash_table + i);
-
-	lockdep_initialized = 1;
-}
-
 void __init lockdep_info(void)
 {
 	printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
@@ -4061,14 +4010,6 @@ void __init lockdep_info(void)
 
 	printk(" per task-struct memory footprint: %lu bytes\n",
 		sizeof(struct held_lock) * MAX_LOCK_DEPTH);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
-	if (lockdep_init_error) {
-		printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
-		printk("Call stack leading to lockdep invocation was:\n");
-		print_stack_trace(&lockdep_init_trace, 0);
-	}
-#endif
 }
 
 static void
diff --git a/tools/lib/lockdep/common.c b/tools/lib/lockdep/common.c
index 9be663340f0a..d1c89cc06f5f 100644
--- a/tools/lib/lockdep/common.c
+++ b/tools/lib/lockdep/common.c
@@ -11,11 +11,6 @@ static __thread struct task_struct current_obj;
 bool debug_locks = true;
 bool debug_locks_silent;
 
-__attribute__((constructor)) static void liblockdep_init(void)
-{
-	lockdep_init();
-}
-
 __attribute__((destructor)) static void liblockdep_exit(void)
 {
 	debug_check_no_locks_held();
diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h
index a60c14b9662a..6e66277ec437 100644
--- a/tools/lib/lockdep/include/liblockdep/common.h
+++ b/tools/lib/lockdep/include/liblockdep/common.h
@@ -44,7 +44,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 void lock_release(struct lockdep_map *lock, int nested,
 			unsigned long ip);
 extern void debug_check_no_locks_freed(const void *from, unsigned long len);
-extern void lockdep_init(void);
 
 #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
 	{ .name = (_name), .key = (void *)(_key), }
diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c
index 21cdf869a01b..52844847569c 100644
--- a/tools/lib/lockdep/preload.c
+++ b/tools/lib/lockdep/preload.c
@@ -439,7 +439,5 @@ __attribute__((constructor)) static void init_preload(void)
 	ll_pthread_rwlock_unlock = dlsym(RTLD_NEXT, "pthread_rwlock_unlock");
 #endif
 
-	lockdep_init();
-
 	__init_state = done;
 }
-- 
cgit 


From 920c720aa5aa3900a7f1689228fdfc2580a91e7e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 1 Feb 2016 15:11:28 +0100
Subject: locking/mcs: Fix mcs_spin_lock() ordering

Similar to commit b4b29f94856a ("locking/osq: Fix ordering of node
initialisation in osq_lock") the use of xchg_acquire() is
fundamentally broken with MCS like constructs.

Furthermore, it turns out we rely on the global transitivity of this
operation because the unlock path observes the pointer with a
READ_ONCE(), not an smp_load_acquire().

This is non-critical because the MCS code isn't actually used and
mostly serves as documentation, a stepping stone to the more complex
things we've build on top of the idea.

Reported-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Fixes: 3552a07a9c4a ("locking/mcs: Use acquire/release semantics")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/mcs_spinlock.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 5b9102a47ea5..c835270f0c2f 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 	node->locked = 0;
 	node->next   = NULL;
 
-	prev = xchg_acquire(lock, node);
+	/*
+	 * We rely on the full barrier with global transitivity implied by the
+	 * below xchg() to order the initialization stores above against any
+	 * observation of @node. And to provide the ACQUIRE ordering associated
+	 * with a LOCK primitive.
+	 */
+	prev = xchg(lock, node);
 	if (likely(prev == NULL)) {
 		/*
 		 * Lock acquired, don't need to set node->locked to 1. Threads
-- 
cgit 


From eaff0e7003cca6c2748b67ead2d4b1a8ad858fc7 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Thu, 10 Dec 2015 15:17:46 -0500
Subject: locking/pvqspinlock: Move lock stealing count tracking code into
 pv_queued_spin_steal_lock()

This patch moves the lock stealing count tracking code into
pv_queued_spin_steal_lock() instead of via a jacket function simplifying
the code.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1449778666-13593-3-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_paravirt.h | 16 +++++++++-------
 kernel/locking/qspinlock_stat.h     | 13 -------------
 2 files changed, 9 insertions(+), 20 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 87bb235c3448..78f04a232d3a 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -54,6 +54,11 @@ struct pv_node {
 	u8			state;
 };
 
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
 /*
  * By replacing the regular queued_spin_trylock() with the function below,
  * it will be called once when a lock waiter enter the PV slowpath before
@@ -65,9 +70,11 @@ struct pv_node {
 static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
+	int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+		   (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
 
-	return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-		(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+	qstat_inc(qstat_pv_lock_stealing, ret);
+	return ret;
 }
 
 /*
@@ -137,11 +144,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 }
 #endif /* _Q_PENDING_BITS == 8 */
 
-/*
- * Include queued spinlock statistics code
- */
-#include "qspinlock_stat.h"
-
 /*
  * Lock and MCS node addresses hash table for fast lookup
  *
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 640dcecdd1df..869988d46124 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -279,19 +279,6 @@ static inline void __pv_wait(u8 *ptr, u8 val)
 #define pv_kick(c)	__pv_kick(c)
 #define pv_wait(p, v)	__pv_wait(p, v)
 
-/*
- * PV unfair trylock count tracking function
- */
-static inline int qstat_spin_steal_lock(struct qspinlock *lock)
-{
-	int ret = pv_queued_spin_steal_lock(lock);
-
-	qstat_inc(qstat_pv_lock_stealing, ret);
-	return ret;
-}
-#undef  queued_spin_trylock
-#define queued_spin_trylock(l)	qstat_spin_steal_lock(l)
-
 #else /* CONFIG_QUEUED_LOCK_STAT */
 
 static inline void qstat_inc(enum qlock_stats stat, bool cond)	{ }
-- 
cgit 


From cb037fdad6772df2d49fe61c97d7c0d8265bc918 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Thu, 10 Dec 2015 15:17:44 -0500
Subject: locking/qspinlock: Use smp_cond_acquire() in pending code

The newly introduced smp_cond_acquire() was used to replace the
slowpath lock acquisition loop. Similarly, the new function can also
be applied to the pending bit locking loop. This patch uses the new
function in that loop.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1449778666-13593-1-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 393d1874b9e0..ce2f75e32ae1 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -358,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	 * sequentiality; this is because not all clear_pending_set_locked()
 	 * implementations imply full barriers.
 	 */
-	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
-		cpu_relax();
+	smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
 
 	/*
 	 * take ownership and clear the pending bit.
@@ -435,7 +434,7 @@ queue:
 	 *
 	 * The PV pv_wait_head_or_lock function, if active, will acquire
 	 * the lock and return a non-zero value. So we have to skip the
-	 * smp_load_acquire() call. As the next PV queue head hasn't been
+	 * smp_cond_acquire() call. As the next PV queue head hasn't been
 	 * designated yet, there is no way for the locked value to become
 	 * _Q_SLOW_VAL. So both the set_locked() and the
 	 * atomic_cmpxchg_relaxed() calls will be safe.
@@ -466,7 +465,7 @@ locked:
 			break;
 		}
 		/*
-		 * The smp_load_acquire() call above has provided the necessary
+		 * The smp_cond_acquire() call above has provided the necessary
 		 * acquire semantics required for locking. At most two
 		 * iterations of this loop may be ran.
 		 */
-- 
cgit 


From 32d62510f949d3c8e83b9b3b844a84446611661b Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Thu, 10 Dec 2015 15:17:45 -0500
Subject: locking/pvqspinlock: Enable slowpath locking count tracking

This patch enables the tracking of the number of slowpath locking
operations performed. This can be used to compare against the number
of lock stealing operations to see what percentage of locks are stolen
versus acquired via the regular slowpath.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1449778666-13593-2-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_paravirt.h | 5 +++++
 kernel/locking/qspinlock_stat.h     | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'kernel/locking')

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 78f04a232d3a..21ede57f68b3 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -400,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 	if (READ_ONCE(pn->state) == vcpu_hashed)
 		lp = (struct qspinlock **)1;
 
+	/*
+	 * Tracking # of slowpath locking operations
+	 */
+	qstat_inc(qstat_pv_lock_slowpath, true);
+
 	for (;; waitcnt++) {
 		/*
 		 * Set correct vCPU state to be used by queue node wait-early
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 869988d46124..eb2a2c9bc3fc 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -22,6 +22,7 @@
  *   pv_kick_wake	- # of vCPU kicks used for computing pv_latency_wake
  *   pv_latency_kick	- average latency (ns) of vCPU kick operation
  *   pv_latency_wake	- average latency (ns) from vCPU kick to wakeup
+ *   pv_lock_slowpath	- # of locking operations via the slowpath
  *   pv_lock_stealing	- # of lock stealing operations
  *   pv_spurious_wakeup	- # of spurious wakeups
  *   pv_wait_again	- # of vCPU wait's that happened after a vCPU kick
@@ -45,6 +46,7 @@ enum qlock_stats {
 	qstat_pv_kick_wake,
 	qstat_pv_latency_kick,
 	qstat_pv_latency_wake,
+	qstat_pv_lock_slowpath,
 	qstat_pv_lock_stealing,
 	qstat_pv_spurious_wakeup,
 	qstat_pv_wait_again,
@@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = {
 	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
 	[qstat_pv_latency_kick]	   = "pv_latency_kick",
 	[qstat_pv_latency_wake]    = "pv_latency_wake",
+	[qstat_pv_lock_slowpath]   = "pv_lock_slowpath",
 	[qstat_pv_lock_stealing]   = "pv_lock_stealing",
 	[qstat_pv_wait_again]      = "pv_wait_again",
 	[qstat_pv_wait_early]      = "pv_wait_early",
-- 
cgit 


From 1329ce6fbbe4536592dfcfc8d64d61bfeb598fe6 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Sun, 24 Jan 2016 18:23:43 -0800
Subject: locking/mutex: Allow next waiter lockless wakeup

Make use of wake-queues and enable the wakeup to occur after releasing the
wait_lock. This is similar to what we do with rtmutex top waiter,
slightly shortening the critical region and allow other waiters to
acquire the wait_lock sooner. In low contention cases it can also help
the recently woken waiter to find the wait_lock available (fastpath)
when it continues execution.

Reviewed-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ding Tianhong <dingtianhong@huawei.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Waiman Long <waiman.long@hpe.com>
Cc: Will Deacon <Will.Deacon@arm.com>
Link: http://lkml.kernel.org/r/20160125022343.GA3322@linux-uzut.site
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/mutex.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 0551c219c40e..e364b424b019 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -716,6 +716,7 @@ static inline void
 __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
 {
 	unsigned long flags;
+	WAKE_Q(wake_q);
 
 	/*
 	 * As a performance measurement, release the lock before doing other
@@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
 					   struct mutex_waiter, list);
 
 		debug_mutex_wake_waiter(lock, waiter);
-
-		wake_up_process(waiter->task);
+		wake_q_add(&wake_q, waiter->task);
 	}
 
 	spin_unlock_mutex(&lock->wait_lock, flags);
+	wake_up_q(&wake_q);
 }
 
 /*
-- 
cgit 


From 5f18ab5c6bdba735b31a1e7c2618f48eae6b1b5c Mon Sep 17 00:00:00 2001
From: Alfredo Alvarez Fernandez <alfredoalvarezfernandez@gmail.com>
Date: Thu, 11 Feb 2016 00:33:32 +0100
Subject: locking/lockdep: Prevent chain_key collisions

The chain_key hashing macro iterate_chain_key(key1, key2) does not
generate a new different value if both key1 and key2 are 0. In that
case the generated value is again 0. This can lead to collisions which
can result in lockdep not detecting deadlocks or circular
dependencies.

Avoid the problem by using class_idx (1-based) instead of class id
(0-based) as an input for the hashing macro 'key2' in
iterate_chain_key(key1, key2).

The use of class id created collisions in cases like the following:

1.- Consider an initial state in which no class has been acquired yet.
Under these circumstances an AA deadlock will not be detected by
lockdep:

  lock  [key1,key2]->new key  (key1=old chain_key, key2=id)
  --------------------------
  A     [0,0]->0
  A     [0,0]->0 (collision)

  The newly generated chain_key collides with the one used before and as
  a result the check for a deadlock is skipped

  A simple test using liblockdep and a pthread mutex confirms the
  problem: (omitting stack traces)

    new class 0xe15038: 0x7ffc64950f20
    acquire class [0xe15038] 0x7ffc64950f20
    acquire class [0xe15038] 0x7ffc64950f20
    hash chain already cached, key: 0000000000000000 tail class:
    [0xe15038] 0x7ffc64950f20

2.- Consider an ABBA in 2 different tasks and no class yet acquired.

  T1 [key1,key2]->new key     T2[key1,key2]->new key
  --                          --
  A [0,0]->0

                              B [0,1]->1

  B [0,1]->1  (collision)

                              A

In this case the collision prevents lockdep from creating the new
dependency A->B. This in turn results in lockdep not detecting the
circular dependency when T2 acquires A.

Signed-off-by: Alfredo Alvarez Fernandez <alfredoalvarezernandez@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: sasha.levin@oracle.com
Link: http://lkml.kernel.org/r/1455147212-2389-4-git-send-email-alfredoalvarezernandez@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 3261214323fa..6f446eb3c742 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2143,7 +2143,7 @@ static void check_chain_key(struct task_struct *curr)
 {
 #ifdef CONFIG_DEBUG_LOCKDEP
 	struct held_lock *hlock, *prev_hlock = NULL;
-	unsigned int i, id;
+	unsigned int i;
 	u64 chain_key = 0;
 
 	for (i = 0; i < curr->lockdep_depth; i++) {
@@ -2160,17 +2160,16 @@ static void check_chain_key(struct task_struct *curr)
 				(unsigned long long)hlock->prev_chain_key);
 			return;
 		}
-		id = hlock->class_idx - 1;
 		/*
 		 * Whoops ran out of static storage again?
 		 */
-		if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+		if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
 			return;
 
 		if (prev_hlock && (prev_hlock->irq_context !=
 							hlock->irq_context))
 			chain_key = 0;
-		chain_key = iterate_chain_key(chain_key, id);
+		chain_key = iterate_chain_key(chain_key, hlock->class_idx);
 		prev_hlock = hlock;
 	}
 	if (chain_key != curr->curr_chain_key) {
@@ -3048,7 +3047,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	struct task_struct *curr = current;
 	struct lock_class *class = NULL;
 	struct held_lock *hlock;
-	unsigned int depth, id;
+	unsigned int depth;
 	int chain_head = 0;
 	int class_idx;
 	u64 chain_key;
@@ -3151,11 +3150,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	 * The 'key ID' is what is the most compact key value to drive
 	 * the hash, not class->key.
 	 */
-	id = class - lock_classes;
 	/*
 	 * Whoops, we did it again.. ran straight out of our static allocation.
 	 */
-	if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+	if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
 		return 0;
 
 	chain_key = curr->curr_chain_key;
@@ -3173,7 +3171,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		chain_key = 0;
 		chain_head = 1;
 	}
-	chain_key = iterate_chain_key(chain_key, id);
+	chain_key = iterate_chain_key(chain_key, class_idx);
 
 	if (nest_lock && !__lock_is_held(nest_lock))
 		return print_lock_nested_lock_not_held(curr, hlock, ip);
-- 
cgit 


From 9e4e7554e755aaad8df0e26268787438735b4b76 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 29 Feb 2016 10:03:58 +0100
Subject: locking/lockdep: Detect chain_key collisions

Add detection for chain_key collision under CONFIG_DEBUG_LOCKDEP.
When a collision is detected the problem is reported and all lock
debugging is turned off.

Tested using liblockdep and the added tests before and after
applying the fix, confirming both that the code added for the
detection correctly reports the problem and that the fix actually
fixes it.

Tested tweaking lockdep to generate false collisions and
verified that the problem is reported and that lock debugging is
turned off.

Also tested with lockdep's test suite after applying the patch:

    [    0.000000] Good, all 253 testcases passed! |

Signed-off-by: Alfredo Alvarez Fernandez <alfredoalvarezernandez@gmail.com>
Cc: Alfredo Alvarez Fernandez <alfredoalvarezfernandez@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: sasha.levin@oracle.com
Link: http://lkml.kernel.org/r/1455864533-7536-4-git-send-email-alfredoalvarezernandez@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 59 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 8 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 6f446eb3c742..f894a2cd9b2a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1981,6 +1981,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
 	return lock_classes + chain_hlocks[chain->base + i];
 }
 
+/*
+ * Returns the index of the first held_lock of the current chain
+ */
+static inline int get_first_held_lock(struct task_struct *curr,
+					struct held_lock *hlock)
+{
+	int i;
+	struct held_lock *hlock_curr;
+
+	for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+		hlock_curr = curr->held_locks + i;
+		if (hlock_curr->irq_context != hlock->irq_context)
+			break;
+
+	}
+
+	return ++i;
+}
+
+/*
+ * Checks whether the chain and the current held locks are consistent
+ * in depth and also in content. If they are not it most likely means
+ * that there was a collision during the calculation of the chain_key.
+ * Returns: 0 not passed, 1 passed
+ */
+static int check_no_collision(struct task_struct *curr,
+			struct held_lock *hlock,
+			struct lock_chain *chain)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+	int i, j, id;
+
+	i = get_first_held_lock(curr, hlock);
+
+	if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1)))
+		return 0;
+
+	for (j = 0; j < chain->depth - 1; j++, i++) {
+		id = curr->held_locks[i].class_idx - 1;
+
+		if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id))
+			return 0;
+	}
+#endif
+	return 1;
+}
+
 /*
  * Look up a dependency chain. If the key is not present yet then
  * add it and return 1 - in this case the new dependency chain is
@@ -1994,7 +2041,6 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	struct lock_class *class = hlock_class(hlock);
 	struct hlist_head *hash_head = chainhashentry(chain_key);
 	struct lock_chain *chain;
-	struct held_lock *hlock_curr;
 	int i, j;
 
 	/*
@@ -2012,6 +2058,9 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 		if (chain->chain_key == chain_key) {
 cache_hit:
 			debug_atomic_inc(chain_lookup_hits);
+			if (!check_no_collision(curr, hlock, chain))
+				return 0;
+
 			if (very_verbose(class))
 				printk("\nhash chain already cached, key: "
 					"%016Lx tail class: [%p] %s\n",
@@ -2049,13 +2098,7 @@ cache_hit:
 	chain = lock_chains + nr_lock_chains++;
 	chain->chain_key = chain_key;
 	chain->irq_context = hlock->irq_context;
-	/* Find the first held_lock of current chain */
-	for (i = curr->lockdep_depth - 1; i >= 0; i--) {
-		hlock_curr = curr->held_locks + i;
-		if (hlock_curr->irq_context != hlock->irq_context)
-			break;
-	}
-	i++;
+	i = get_first_held_lock(curr, hlock);
 	chain->depth = curr->lockdep_depth + 1 - i;
 	if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
 		chain->base = nr_chain_hlocks;
-- 
cgit 


From 25528213fe9f75f4e286f08d35a73ca2bb634a50 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 15 Mar 2016 14:52:49 -0700
Subject: tags: Fix DEFINE_PER_CPU expansions

$ make tags
  GEN     tags
ctags: Warning: drivers/acpi/processor_idle.c:64: null expansion of name pattern "\1"
ctags: Warning: drivers/xen/events/events_2l.c:41: null expansion of name pattern "\1"
ctags: Warning: kernel/locking/lockdep.c:151: null expansion of name pattern "\1"
ctags: Warning: kernel/rcu/rcutorture.c:133: null expansion of name pattern "\1"
ctags: Warning: kernel/rcu/rcutorture.c:135: null expansion of name pattern "\1"
ctags: Warning: kernel/workqueue.c:323: null expansion of name pattern "\1"
ctags: Warning: net/ipv4/syncookies.c:53: null expansion of name pattern "\1"
ctags: Warning: net/ipv6/syncookies.c:44: null expansion of name pattern "\1"
ctags: Warning: net/rds/page.c:45: null expansion of name pattern "\1"

Which are all the result of the DEFINE_PER_CPU pattern:

  scripts/tags.sh:200:	'/\<DEFINE_PER_CPU([^,]*, *\([[:alnum:]_]*\)/\1/v/'
  scripts/tags.sh:201:	'/\<DEFINE_PER_CPU_SHARED_ALIGNED([^,]*, *\([[:alnum:]_]*\)/\1/v/'

The below cures them. All except the workqueue one are within reasonable
distance of the 80 char limit. TJ do you have any preference on how to
fix the wq one, or shall we just not care its too long?

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/processor_idle.c  | 4 ++--
 drivers/xen/events/events_2l.c | 5 +++--
 kernel/locking/lockdep.c       | 3 +--
 kernel/rcu/rcutorture.c        | 6 ++----
 kernel/workqueue.c             | 3 +--
 net/ipv4/syncookies.c          | 3 +--
 net/ipv6/syncookies.c          | 3 +--
 net/rds/page.c                 | 4 ++--
 8 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'kernel/locking')

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 175c86bee3a9..9ca2b2fefd76 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -61,8 +61,8 @@ module_param(latency_factor, uint, 0644);
 
 static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device);
 
-static DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX],
-								acpi_cstate);
+static
+DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], acpi_cstate);
 
 static int disabled_by_idle_boot_param(void)
 {
diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c
index 7dd46312c180..403fe3955393 100644
--- a/drivers/xen/events/events_2l.c
+++ b/drivers/xen/events/events_2l.c
@@ -38,8 +38,9 @@
 /* Find the first set bit in a evtchn mask */
 #define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD)
 
-static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD],
-		      cpu_evtchn_mask);
+#define EVTCHN_MASK_SIZE (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)
+
+static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_MASK_SIZE], cpu_evtchn_mask);
 
 static unsigned evtchn_2l_max_channels(void)
 {
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index f894a2cd9b2a..53ab2f85d77e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -148,8 +148,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
 }
 
 #ifdef CONFIG_LOCK_STAT
-static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
-		      cpu_lock_stats);
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats);
 
 static inline u64 lockstat_clock(void)
 {
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d2988d047d66..4d5cc6aa7e1e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current;
 static unsigned long rcu_torture_current_version;
 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
 static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
-		      rcu_torture_count) = { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
-		      rcu_torture_batch) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 };
 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
 static atomic_t n_rcu_torture_alloc;
 static atomic_t n_rcu_torture_alloc_fail;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7ff5dc7d2ac5..16e13d8628a3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -320,8 +320,7 @@ static bool wq_debug_force_rr_cpu = false;
 module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
 
 /* the per-cpu worker pools */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
-				     cpu_worker_pools);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
 
 static DEFINE_IDR(worker_pool_idr);	/* PR: idr of all pools */
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 643a86c49020..2d5589b61e9f 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -50,8 +50,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
 #define TSBITS	6
 #define TSMASK	(((__u32)1 << TSBITS) - 1)
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
-		      ipv4_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
 
 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		       u32 count, int c)
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 2906ef20795e..aae3e5ca63ea 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -41,8 +41,7 @@ static __u16 const msstab[] = {
 	9000 - 60,
 };
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
-		      ipv6_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch);
 
 static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
 		       __be16 sport, __be16 dport, u32 count, int c)
diff --git a/net/rds/page.c b/net/rds/page.c
index 5a14e6d6a926..616f21f4e7d7 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -42,8 +42,8 @@ struct rds_page_remainder {
 	unsigned long	r_offset;
 };
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
-				     rds_page_remainders);
+static
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
 
 /*
  * returns 0 on success or -errno on failure.
-- 
cgit 


From 5c9a8750a6409c63a0f01d51a9024861022f6593 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Tue, 22 Mar 2016 14:27:30 -0700
Subject: kernel: add kcov code coverage

kcov provides code coverage collection for coverage-guided fuzzing
(randomized testing).  Coverage-guided fuzzing is a testing technique
that uses coverage feedback to determine new interesting inputs to a
system.  A notable user-space example is AFL
(http://lcamtuf.coredump.cx/afl/).  However, this technique is not
widely used for kernel testing due to missing compiler and kernel
support.

kcov does not aim to collect as much coverage as possible.  It aims to
collect more or less stable coverage that is function of syscall inputs.
To achieve this goal it does not collect coverage in soft/hard
interrupts and instrumentation of some inherently non-deterministic or
non-interesting parts of kernel is disbled (e.g.  scheduler, locking).

Currently there is a single coverage collection mode (tracing), but the
API anticipates additional collection modes.  Initially I also
implemented a second mode which exposes coverage in a fixed-size hash
table of counters (what Quentin used in his original patch).  I've
dropped the second mode for simplicity.

This patch adds the necessary support on kernel side.  The complimentary
compiler support was added in gcc revision 231296.

We've used this support to build syzkaller system call fuzzer, which has
found 90 kernel bugs in just 2 months:

  https://github.com/google/syzkaller/wiki/Found-Bugs

We've also found 30+ bugs in our internal systems with syzkaller.
Another (yet unexplored) direction where kcov coverage would greatly
help is more traditional "blob mutation".  For example, mounting a
random blob as a filesystem, or receiving a random blob over wire.

Why not gcov.  Typical fuzzing loop looks as follows: (1) reset
coverage, (2) execute a bit of code, (3) collect coverage, repeat.  A
typical coverage can be just a dozen of basic blocks (e.g.  an invalid
input).  In such context gcov becomes prohibitively expensive as
reset/collect coverage steps depend on total number of basic
blocks/edges in program (in case of kernel it is about 2M).  Cost of
kcov depends only on number of executed basic blocks/edges.  On top of
that, kernel requires per-thread coverage because there are always
background threads and unrelated processes that also produce coverage.
With inlined gcov instrumentation per-thread coverage is not possible.

kcov exposes kernel PCs and control flow to user-space which is
insecure.  But debugfs should not be mapped as user accessible.

Based on a patch by Quentin Casasnovas.

[akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode']
[akpm@linux-foundation.org: unbreak allmodconfig]
[akpm@linux-foundation.org: follow x86 Makefile layout standards]
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: syzkaller <syzkaller@googlegroups.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Tavis Ormandy <taviso@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Kees Cook <keescook@google.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: David Drysdale <drysdale@google.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kcov.txt                | 111 ++++++++++++++
 Makefile                              |  11 +-
 arch/x86/Kconfig                      |   1 +
 arch/x86/boot/Makefile                |   7 +
 arch/x86/boot/compressed/Makefile     |   3 +
 arch/x86/entry/vdso/Makefile          |   3 +
 arch/x86/kernel/Makefile              |   6 +
 arch/x86/kernel/apic/Makefile         |   4 +
 arch/x86/kernel/cpu/Makefile          |   4 +
 arch/x86/lib/Makefile                 |   3 +
 arch/x86/mm/Makefile                  |   3 +
 arch/x86/realmode/rm/Makefile         |   3 +
 drivers/firmware/efi/libstub/Makefile |   3 +
 include/linux/kcov.h                  |  29 ++++
 include/linux/sched.h                 |  11 ++
 include/uapi/linux/kcov.h             |  10 ++
 kernel/Makefile                       |  12 ++
 kernel/exit.c                         |   2 +
 kernel/fork.c                         |   3 +
 kernel/kcov.c                         | 273 ++++++++++++++++++++++++++++++++++
 kernel/locking/Makefile               |   3 +
 kernel/rcu/Makefile                   |   4 +
 kernel/sched/Makefile                 |   4 +
 lib/Kconfig.debug                     |  21 +++
 lib/Makefile                          |  12 ++
 mm/Makefile                           |  15 ++
 mm/kasan/Makefile                     |   1 +
 scripts/Makefile.lib                  |   6 +
 28 files changed, 567 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/kcov.txt
 create mode 100644 include/linux/kcov.h
 create mode 100644 include/uapi/linux/kcov.h
 create mode 100644 kernel/kcov.c

(limited to 'kernel/locking')

diff --git a/Documentation/kcov.txt b/Documentation/kcov.txt
new file mode 100644
index 000000000000..779ff4ab1c1d
--- /dev/null
+++ b/Documentation/kcov.txt
@@ -0,0 +1,111 @@
+kcov: code coverage for fuzzing
+===============================
+
+kcov exposes kernel code coverage information in a form suitable for coverage-
+guided fuzzing (randomized testing). Coverage data of a running kernel is
+exported via the "kcov" debugfs file. Coverage collection is enabled on a task
+basis, and thus it can capture precise coverage of a single system call.
+
+Note that kcov does not aim to collect as much coverage as possible. It aims
+to collect more or less stable coverage that is function of syscall inputs.
+To achieve this goal it does not collect coverage in soft/hard interrupts
+and instrumentation of some inherently non-deterministic parts of kernel is
+disbled (e.g. scheduler, locking).
+
+Usage:
+======
+
+Configure kernel with:
+
+        CONFIG_KCOV=y
+
+CONFIG_KCOV requires gcc built on revision 231296 or later.
+Profiling data will only become accessible once debugfs has been mounted:
+
+        mount -t debugfs none /sys/kernel/debug
+
+The following program demonstrates kcov usage from within a test program:
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
+#define KCOV_ENABLE			_IO('c', 100)
+#define KCOV_DISABLE			_IO('c', 101)
+#define COVER_SIZE			(64<<10)
+
+int main(int argc, char **argv)
+{
+	int fd;
+	unsigned long *cover, n, i;
+
+	/* A single fd descriptor allows coverage collection on a single
+	 * thread.
+	 */
+	fd = open("/sys/kernel/debug/kcov", O_RDWR);
+	if (fd == -1)
+		perror("open"), exit(1);
+	/* Setup trace mode and trace size. */
+	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
+		perror("ioctl"), exit(1);
+	/* Mmap buffer shared between kernel- and user-space. */
+	cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+				     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if ((void*)cover == MAP_FAILED)
+		perror("mmap"), exit(1);
+	/* Enable coverage collection on the current thread. */
+	if (ioctl(fd, KCOV_ENABLE, 0))
+		perror("ioctl"), exit(1);
+	/* Reset coverage from the tail of the ioctl() call. */
+	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
+	/* That's the target syscal call. */
+	read(-1, NULL, 0);
+	/* Read number of PCs collected. */
+	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
+	for (i = 0; i < n; i++)
+		printf("0x%lx\n", cover[i + 1]);
+	/* Disable coverage collection for the current thread. After this call
+	 * coverage can be enabled for a different thread.
+	 */
+	if (ioctl(fd, KCOV_DISABLE, 0))
+		perror("ioctl"), exit(1);
+	/* Free resources. */
+	if (munmap(cover, COVER_SIZE * sizeof(unsigned long)))
+		perror("munmap"), exit(1);
+	if (close(fd))
+		perror("close"), exit(1);
+	return 0;
+}
+
+After piping through addr2line output of the program looks as follows:
+
+SyS_read
+fs/read_write.c:562
+__fdget_pos
+fs/file.c:774
+__fget_light
+fs/file.c:746
+__fget_light
+fs/file.c:750
+__fget_light
+fs/file.c:760
+__fdget_pos
+fs/file.c:784
+SyS_read
+fs/read_write.c:562
+
+If a program needs to collect coverage from several threads (independently),
+it needs to open /sys/kernel/debug/kcov in each thread separately.
+
+The interface is fine-grained to allow efficient forking of test processes.
+That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode,
+mmaps coverage buffer and then forks child processes in a loop. Child processes
+only need to enable coverage (disable happens automatically on thread end).
diff --git a/Makefile b/Makefile
index e055b969c325..b98a4f70d1b5 100644
--- a/Makefile
+++ b/Makefile
@@ -365,6 +365,7 @@ LDFLAGS_MODULE  =
 CFLAGS_KERNEL	=
 AFLAGS_KERNEL	=
 CFLAGS_GCOV	= -fprofile-arcs -ftest-coverage
+CFLAGS_KCOV	= -fsanitize-coverage=trace-pc
 
 
 # Use USERINCLUDE when you must reference the UAPI directories only.
@@ -411,7 +412,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
 export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
 
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
-export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN CFLAGS_UBSAN
+export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KCOV CFLAGS_KASAN CFLAGS_UBSAN
 export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
 export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
 export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
@@ -673,6 +674,14 @@ endif
 endif
 KBUILD_CFLAGS += $(stackp-flag)
 
+ifdef CONFIG_KCOV
+  ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
+    $(warning Cannot use CONFIG_KCOV: \
+             -fsanitize-coverage=trace-pc is not supported by compiler)
+    CFLAGS_KCOV =
+  endif
+endif
+
 ifeq ($(cc-name),clang)
 KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
 KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8b680a5cb25b..54478b7635de 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_MMIO_FLUSH
 	select ARCH_HAS_SG_CHAIN
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 0bf6749522d9..b1ef9e489084 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -12,6 +12,13 @@
 KASAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Kernel does not boot with kcov instrumentation here.
+# One of the problems observed was insertion of __sanitizer_cov_trace_pc()
+# callback into middle of per-cpu data enabling code. Thus the callback observed
+# inconsistent state and crashed. We are interested mostly in syscall coverage,
+# so boot code is not interesting anyway.
+KCOV_INSTRUMENT		:= n
+
 # If you want to preset the SVGA mode, uncomment the next line and
 # set SVGA_MODE to whatever number you want.
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 5e1d26e09407..6915ff2bd996 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,6 +19,9 @@
 KASAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
+
 targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
 	vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
 
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index f9fb859c98b9..6874da5f67fc 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -7,6 +7,9 @@ KASAN_SANITIZE			:= n
 UBSAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
+
 VDSO64-$(CONFIG_X86_64)		:= y
 VDSOX32-$(CONFIG_X86_X32_ABI)	:= y
 VDSO32-$(CONFIG_X86_32)		:= y
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d5fb0871aba3..adaae2c781c1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,6 +25,12 @@ OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o	:= y
 OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_test_nx.o			:= y
 
+# If instrumentation of this dir is enabled, boot hangs during first second.
+# Probably could be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT		:= n
+
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 8bb12ddc5db8..8e63ebdcbd0b 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,6 +2,10 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particualr, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT		:= n
+
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o ipi.o vector.o
 obj-y				+= hw_nmi.o
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 0d373d7affc8..4a8697f7d4ef 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg
 CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
 # Make sure load_percpu_segment has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o		:= $(nostackp)
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index a501fa25da41..72a576752a7e 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,6 +2,9 @@
 # Makefile for x86 specific library files.
 #
 
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o	:= n
+
 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
 inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
 quiet_cmd_inat_tables = GEN     $@
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 67cf2e1e557b..f98913258c63 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,6 @@
+# Kernel does not boot with instrumentation of tlb.c.
+KCOV_INSTRUMENT_tlb.o	:= n
+
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o physaddr.o gup.o setup_nx.o
 
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 053abe7b0ef7..b95964610ea7 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -9,6 +9,9 @@
 KASAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
+
 always := realmode.bin realmode.relocs
 
 wakeup-objs	:= wakeup_asm.o wakemain.o video-mode.o
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index a15841eced4e..da99bbb74aeb 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -25,6 +25,9 @@ KASAN_SANITIZE			:= n
 UBSAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT			:= n
+
 lib-y				:= efi-stub-helper.o
 
 # include the stub's generic dependencies from lib/ when building for ARM/arm64
diff --git a/include/linux/kcov.h b/include/linux/kcov.h
new file mode 100644
index 000000000000..2883ac98c280
--- /dev/null
+++ b/include/linux/kcov.h
@@ -0,0 +1,29 @@
+#ifndef _LINUX_KCOV_H
+#define _LINUX_KCOV_H
+
+#include <uapi/linux/kcov.h>
+
+struct task_struct;
+
+#ifdef CONFIG_KCOV
+
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
+enum kcov_mode {
+	/* Coverage collection is not enabled yet. */
+	KCOV_MODE_DISABLED = 0,
+	/*
+	 * Tracing coverage collection mode.
+	 * Covered PCs are collected in a per-task buffer.
+	 */
+	KCOV_MODE_TRACE = 1,
+};
+
+#else
+
+static inline void kcov_task_init(struct task_struct *t) {}
+static inline void kcov_task_exit(struct task_struct *t) {}
+
+#endif /* CONFIG_KCOV */
+#endif /* _LINUX_KCOV_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 084ed9fba620..34495d2d2d7b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -51,6 +51,7 @@ struct sched_param {
 #include <linux/resource.h>
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
+#include <linux/kcov.h>
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
 #include <linux/cred.h>
@@ -1818,6 +1819,16 @@ struct task_struct {
 	/* bitmask and counter of trace recursion */
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
+#ifdef CONFIG_KCOV
+	/* Coverage collection mode enabled for this task (0 if disabled). */
+	enum kcov_mode kcov_mode;
+	/* Size of the kcov_area. */
+	unsigned	kcov_size;
+	/* Buffer for coverage collection. */
+	void		*kcov_area;
+	/* kcov desciptor wired with this task or NULL. */
+	struct kcov	*kcov;
+#endif
 #ifdef CONFIG_MEMCG
 	struct mem_cgroup *memcg_in_oom;
 	gfp_t memcg_oom_gfp_mask;
diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h
new file mode 100644
index 000000000000..574e22ec640d
--- /dev/null
+++ b/include/uapi/linux/kcov.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_KCOV_IOCTLS_H
+#define _LINUX_KCOV_IOCTLS_H
+
+#include <linux/types.h>
+
+#define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
+#define KCOV_ENABLE			_IO('c', 100)
+#define KCOV_DISABLE			_IO('c', 101)
+
+#endif /* _LINUX_KCOV_IOCTLS_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index baa55e50a315..f0c40bf49d9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -18,6 +18,17 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
 endif
 
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_extable.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+KASAN_SANITIZE_kcov.o := n
+
 # cond_syscall is currently not LTO compatible
 CFLAGS_sys_ni.o = $(DISABLE_LTO)
 
@@ -68,6 +79,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 10e088237fed..953d1a1c0387 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,6 +53,7 @@
 #include <linux/oom.h>
 #include <linux/writeback.h>
 #include <linux/shm.h>
+#include <linux/kcov.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -655,6 +656,7 @@ void do_exit(long code)
 	TASKS_RCU(int tasks_rcu_i);
 
 	profile_task_exit(tsk);
+	kcov_task_exit(tsk);
 
 	WARN_ON(blk_needs_flush_plug(tsk));
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b8d1e7ceeea..d277e83ed3e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -75,6 +75,7 @@
 #include <linux/aio.h>
 #include <linux/compiler.h>
 #include <linux/sysctl.h>
+#include <linux/kcov.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -392,6 +393,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	account_kernel_stack(ti, 1);
 
+	kcov_task_init(tsk);
+
 	return tsk;
 
 free_ti:
diff --git a/kernel/kcov.c b/kernel/kcov.c
new file mode 100644
index 000000000000..3efbee0834a8
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,273 @@
+#define pr_fmt(fmt) "kcov: " fmt
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ *  - initial state after open()
+ *  - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ *  - then, mmap() call (several calls are allowed but not useful)
+ *  - then, repeated enable/disable for a task (only one task a time allowed)
+ */
+struct kcov {
+	/*
+	 * Reference counter. We keep one for:
+	 *  - opened file descriptor
+	 *  - task with enabled coverage (we can't unwire it from another task)
+	 */
+	atomic_t		refcount;
+	/* The lock protects mode, size, area and t. */
+	spinlock_t		lock;
+	enum kcov_mode		mode;
+	/* Size of arena (in long's for KCOV_MODE_TRACE). */
+	unsigned		size;
+	/* Coverage buffer shared with user space. */
+	void			*area;
+	/* Task for which we collect coverage, or NULL. */
+	struct task_struct	*t;
+};
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void __sanitizer_cov_trace_pc(void)
+{
+	struct task_struct *t;
+	enum kcov_mode mode;
+
+	t = current;
+	/*
+	 * We are interested in code coverage as a function of a syscall inputs,
+	 * so we ignore code executed in interrupts.
+	 */
+	if (!t || in_interrupt())
+		return;
+	mode = READ_ONCE(t->kcov_mode);
+	if (mode == KCOV_MODE_TRACE) {
+		unsigned long *area;
+		unsigned long pos;
+
+		/*
+		 * There is some code that runs in interrupts but for which
+		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
+		 * interrupts, there are paired barrier()/WRITE_ONCE() in
+		 * kcov_ioctl_locked().
+		 */
+		barrier();
+		area = t->kcov_area;
+		/* The first word is number of subsequent PCs. */
+		pos = READ_ONCE(area[0]) + 1;
+		if (likely(pos < t->kcov_size)) {
+			area[pos] = _RET_IP_;
+			WRITE_ONCE(area[0], pos);
+		}
+	}
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+static void kcov_get(struct kcov *kcov)
+{
+	atomic_inc(&kcov->refcount);
+}
+
+static void kcov_put(struct kcov *kcov)
+{
+	if (atomic_dec_and_test(&kcov->refcount)) {
+		vfree(kcov->area);
+		kfree(kcov);
+	}
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+	t->kcov_mode = KCOV_MODE_DISABLED;
+	t->kcov_size = 0;
+	t->kcov_area = NULL;
+	t->kcov = NULL;
+}
+
+void kcov_task_exit(struct task_struct *t)
+{
+	struct kcov *kcov;
+
+	kcov = t->kcov;
+	if (kcov == NULL)
+		return;
+	spin_lock(&kcov->lock);
+	if (WARN_ON(kcov->t != t)) {
+		spin_unlock(&kcov->lock);
+		return;
+	}
+	/* Just to not leave dangling references behind. */
+	kcov_task_init(t);
+	kcov->t = NULL;
+	spin_unlock(&kcov->lock);
+	kcov_put(kcov);
+}
+
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+	int res = 0;
+	void *area;
+	struct kcov *kcov = vma->vm_file->private_data;
+	unsigned long size, off;
+	struct page *page;
+
+	area = vmalloc_user(vma->vm_end - vma->vm_start);
+	if (!area)
+		return -ENOMEM;
+
+	spin_lock(&kcov->lock);
+	size = kcov->size * sizeof(unsigned long);
+	if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+	    vma->vm_end - vma->vm_start != size) {
+		res = -EINVAL;
+		goto exit;
+	}
+	if (!kcov->area) {
+		kcov->area = area;
+		vma->vm_flags |= VM_DONTEXPAND;
+		spin_unlock(&kcov->lock);
+		for (off = 0; off < size; off += PAGE_SIZE) {
+			page = vmalloc_to_page(kcov->area + off);
+			if (vm_insert_page(vma, vma->vm_start + off, page))
+				WARN_ONCE(1, "vm_insert_page() failed");
+		}
+		return 0;
+	}
+exit:
+	spin_unlock(&kcov->lock);
+	vfree(area);
+	return res;
+}
+
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+	struct kcov *kcov;
+
+	kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
+	if (!kcov)
+		return -ENOMEM;
+	atomic_set(&kcov->refcount, 1);
+	spin_lock_init(&kcov->lock);
+	filep->private_data = kcov;
+	return nonseekable_open(inode, filep);
+}
+
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+	kcov_put(filep->private_data);
+	return 0;
+}
+
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct task_struct *t;
+	unsigned long size, unused;
+
+	switch (cmd) {
+	case KCOV_INIT_TRACE:
+		/*
+		 * Enable kcov in trace mode and setup buffer size.
+		 * Must happen before anything else.
+		 */
+		if (kcov->mode != KCOV_MODE_DISABLED)
+			return -EBUSY;
+		/*
+		 * Size must be at least 2 to hold current position and one PC.
+		 * Later we allocate size * sizeof(unsigned long) memory,
+		 * that must not overflow.
+		 */
+		size = arg;
+		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+			return -EINVAL;
+		kcov->size = size;
+		kcov->mode = KCOV_MODE_TRACE;
+		return 0;
+	case KCOV_ENABLE:
+		/*
+		 * Enable coverage for the current task.
+		 * At this point user must have been enabled trace mode,
+		 * and mmapped the file. Coverage collection is disabled only
+		 * at task exit or voluntary by KCOV_DISABLE. After that it can
+		 * be enabled for another task.
+		 */
+		unused = arg;
+		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
+		    kcov->area == NULL)
+			return -EINVAL;
+		if (kcov->t != NULL)
+			return -EBUSY;
+		t = current;
+		/* Cache in task struct for performance. */
+		t->kcov_size = kcov->size;
+		t->kcov_area = kcov->area;
+		/* See comment in __sanitizer_cov_trace_pc(). */
+		barrier();
+		WRITE_ONCE(t->kcov_mode, kcov->mode);
+		t->kcov = kcov;
+		kcov->t = t;
+		/* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+		kcov_get(kcov);
+		return 0;
+	case KCOV_DISABLE:
+		/* Disable coverage for the current task. */
+		unused = arg;
+		if (unused != 0 || current->kcov != kcov)
+			return -EINVAL;
+		t = current;
+		if (WARN_ON(kcov->t != t))
+			return -EINVAL;
+		kcov_task_init(t);
+		kcov->t = NULL;
+		kcov_put(kcov);
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	struct kcov *kcov;
+	int res;
+
+	kcov = filep->private_data;
+	spin_lock(&kcov->lock);
+	res = kcov_ioctl_locked(kcov, cmd, arg);
+	spin_unlock(&kcov->lock);
+	return res;
+}
+
+static const struct file_operations kcov_fops = {
+	.open		= kcov_open,
+	.unlocked_ioctl	= kcov_ioctl,
+	.mmap		= kcov_mmap,
+	.release        = kcov_close,
+};
+
+static int __init kcov_init(void)
+{
+	if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+		pr_err("failed to create kcov in debugfs\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+device_initcall(kcov_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8e96f6cc2a4a..31322a4275cd 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,3 +1,6 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT		:= n
 
 obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
 
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 61a16569ffbf..032b2c015beb 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,3 +1,7 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
+
 obj-y += update.o sync.o
 obj-$(CONFIG_SRCU) += srcu.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 302d6ebd64f7..414d9c16da42 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
 endif
 
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only.  Why this used to be enabled for all architectures is beyond
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5a60f45cd9bb..532d4d52d1df 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -696,6 +696,27 @@ source "lib/Kconfig.kasan"
 
 endmenu # "Memory Debugging"
 
+config ARCH_HAS_KCOV
+	bool
+	help
+	  KCOV does not have any arch-specific code, but currently it is enabled
+	  only for x86_64. KCOV requires testing on other archs, and most likely
+	  disabling of instrumentation for some early boot code.
+
+config KCOV
+	bool "Code coverage for fuzzing"
+	depends on ARCH_HAS_KCOV
+	select DEBUG_FS
+	help
+	  KCOV exposes kernel code coverage information in a form suitable
+	  for coverage-guided fuzzing (randomized testing).
+
+	  If RANDOMIZE_BASE is enabled, PC values will not be stable across
+	  different machines and across reboots. If you need stable PC values,
+	  disable RANDOMIZE_BASE.
+
+	  For more details, see Documentation/kcov.txt.
+
 config DEBUG_SHIRQ
 	bool "Debug shared IRQ handlers"
 	depends on DEBUG_KERNEL
diff --git a/lib/Makefile b/lib/Makefile
index 4962d14c450f..a1de5b61ff40 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -7,6 +7,18 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
 endif
 
+# These files are disabled because they produce lots of non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. For example,
+# rbtree can be global and individual rotations don't correlate with inputs.
+KCOV_INSTRUMENT_string.o := n
+KCOV_INSTRUMENT_rbtree.o := n
+KCOV_INSTRUMENT_list_debug.o := n
+KCOV_INSTRUMENT_debugobjects.o := n
+KCOV_INSTRUMENT_dynamic_debug.o := n
+# Kernel does not boot if we instrument this file as it uses custom calling
+# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
+KCOV_INSTRUMENT_hweight.o := n
+
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o \
diff --git a/mm/Makefile b/mm/Makefile
index 6da300a1414b..f5e797cbd128 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,6 +5,21 @@
 KASAN_SANITIZE_slab_common.o := n
 KASAN_SANITIZE_slub.o := n
 
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index a61460d9f5b0..131daadf40e4 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,5 +1,6 @@
 KASAN_SANITIZE := n
 UBSAN_SANITIZE_kasan.o := n
+KCOV_INSTRUMENT := n
 
 CFLAGS_REMOVE_kasan.o = -pg
 # Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index ad50d5859ac4..ddf83d0181e7 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -136,6 +136,12 @@ _c_flags += $(if $(patsubst n%,, \
 		$(CFLAGS_UBSAN))
 endif
 
+ifeq ($(CONFIG_KCOV),y)
+_c_flags += $(if $(patsubst n%,, \
+	$(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \
+	$(CFLAGS_KCOV))
+endif
+
 # If building the kernel in a separate objtree expand all occurrences
 # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/').
 
-- 
cgit