Diffstat (limited to 'kernel/locking')
-rw-r--r--  kernel/locking/Makefile              |    5
-rw-r--r--  kernel/locking/lock_events.c         |  179
-rw-r--r--  kernel/locking/lock_events.h         |   59
-rw-r--r--  kernel/locking/lock_events_list.h    |   67
-rw-r--r--  kernel/locking/lockdep.c             | 1333
-rw-r--r--  kernel/locking/lockdep_internals.h   |   41
-rw-r--r--  kernel/locking/lockdep_proc.c        |   12
-rw-r--r--  kernel/locking/locktorture.c         |   23
-rw-r--r--  kernel/locking/percpu-rwsem.c        |    2
-rw-r--r--  kernel/locking/qspinlock.c           |   25
-rw-r--r--  kernel/locking/qspinlock_paravirt.h  |   19
-rw-r--r--  kernel/locking/qspinlock_stat.h      |  233
-rw-r--r--  kernel/locking/rtmutex.c             |   37
-rw-r--r--  kernel/locking/rwsem-spinlock.c      |  339
-rw-r--r--  kernel/locking/rwsem-xadd.c          |  213
-rw-r--r--  kernel/locking/rwsem.c               |   25
-rw-r--r--  kernel/locking/rwsem.h               |  174
-rw-r--r--  kernel/locking/spinlock.c            |    7
-rw-r--r--  kernel/locking/spinlock_debug.c      |    6
19 files changed, 1805 insertions, 994 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 392c7f23af76..6fe2f333aecb 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..fa2c2f951c6b --- /dev/null +++ b/kernel/locking/lock_events.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <[email protected]> + */ + +/* + * Collect locking event counts + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/fs.h> + +#include "lock_events.h" + +#undef LOCK_EVENT +#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, + +#define LOCK_EVENTS_DIR "lock_event_counts" + +/* + * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different + * types of locks will be reported under the <debugfs>/lock_event_counts/ + * directory. See lock_events_list.h for the list of available locking + * events. + * + * Writing to the special ".reset_counts" file will reset all the above + * locking event counts. This is a very slow operation and so should not + * be done frequently. + * + * These event counts are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counts usable even in a production + * environment. + */ +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + + [LOCKEVENT_reset_cnts] = ".reset_counts", +}; + +/* + * Per-cpu counts + */ +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The lockevent_read() function can be overridden. 
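As an aside, these counters are meant to be driven from the lock slow paths through the macros that lock_events.h (further below) provides. A minimal sketch of a hypothetical call site; only the lockevent_* macros and the event names come from this patch, the function and its flag are invented for illustration:

#include "lock_events.h"

/* Hypothetical rwsem slow-path instrumentation (illustration only). */
static void example_down_read_slowpath(bool slept)
{
	lockevent_inc(rwsem_rlock);			/* one more read lock */
	lockevent_cond_inc(rwsem_sleep_reader, slept);	/* only if we blocked */
}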
+ */ +ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, id, len; + u64 sum = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + id = (long)file_inode(file)->i_private; + + if (id >= lockevent_num) + return -EBADF; + + for_each_possible_cpu(cpu) + sum += per_cpu(lockevents[id], cpu); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When idx = reset_cnts, reset all the counts. + */ +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(lockevents, cpu); + + for (i = 0 ; i < lockevent_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_lockevent = { + .read = lockevent_read, + .write = lockevent_write, + .llseek = default_llseek, +}; + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#include <asm/paravirt.h> + +static bool __init skip_lockevent(const char *name) +{ + static int pv_on __initdata = -1; + + if (pv_on < 0) + pv_on = !pv_is_native_spin_unlock(); + /* + * Skip PV qspinlock events on bare metal. + */ + if (!pv_on && !memcmp(name, "pv_", 3)) + return true; + return false; +} +#else +static inline bool skip_lockevent(const char *name) +{ + return false; +} +#endif + +/* + * Initialize debugfs for the locking event counts. + */ +static int __init init_lockevent_counts(void) +{ + struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); + int i; + + if (!d_counts) + goto out; + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. + */ + for (i = 0; i < lockevent_num; i++) { + if (skip_lockevent(lockevent_names[i])) + continue; + if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent)) + goto fail_undo; + } + + if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + d_counts, (void *)(long)LOCKEVENT_reset_cnts, + &fops_lockevent)) + goto fail_undo; + + return 0; +fail_undo: + debugfs_remove_recursive(d_counts); +out: + pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); + return -ENOMEM; +} +fs_initcall(init_lockevent_counts); diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h new file mode 100644 index 000000000000..feb1acc54611 --- /dev/null +++ b/kernel/locking/lock_events.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Authors: Waiman Long <[email protected]> + */ + +#ifndef __LOCKING_LOCK_EVENTS_H +#define __LOCKING_LOCK_EVENTS_H + +enum lock_events { + +#include "lock_events_list.h" + + lockevent_num, /* Total number of lock event counts */ + LOCKEVENT_reset_cnts = lockevent_num, +}; + +#ifdef CONFIG_LOCK_EVENT_COUNTS +/* + * Per-cpu counters + */ +DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void __lockevent_inc(enum lock_events event, bool cond) +{ + if (cond) + __this_cpu_inc(lockevents[event]); +} + +#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) +#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c) + +static inline void __lockevent_add(enum lock_events event, int inc) +{ + __this_cpu_add(lockevents[event], inc); +} + +#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) + +#else /* CONFIG_LOCK_EVENT_COUNTS */ + +#define lockevent_inc(ev) +#define lockevent_add(ev, c) +#define lockevent_cond_inc(ev, c) + +#endif /* CONFIG_LOCK_EVENT_COUNTS */ +#endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h new file mode 100644 index 000000000000..ad7668cfc9da --- /dev/null +++ b/kernel/locking/lock_events_list.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <[email protected]> + */ + +#ifndef LOCK_EVENT +#define LOCK_EVENT(name) LOCKEVENT_ ## name, +#endif + +#ifdef CONFIG_QUEUED_SPINLOCKS +#ifdef CONFIG_PARAVIRT_SPINLOCKS +/* + * Locking events for PV qspinlock. + */ +LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */ +LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */ +LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */ +LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */ +LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */ +LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */ +LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */ +LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */ +LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */ +LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */ +LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Locking events for qspinlock + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1. 
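Spelled out, the derivation described above is simply (the helper is illustrative, not part of the patch):

/* Slowpath acquisitions that needed only the first per-cpu MCS node. */
static unsigned long lock_use_node1(unsigned long slowpath,
				    unsigned long node2,
				    unsigned long node3,
				    unsigned long node4)
{
	return slowpath - (node2 + node3 + node4);
}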
+ */ +LOCK_EVENT(lock_pending) /* # of locking ops via pending code */ +LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */ +LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */ +LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */ +LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */ +LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +/* + * Locking events for rwsem + */ +LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ +LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ +LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ +LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ +LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ +LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ +LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ +LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ +LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ +LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ +LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ +LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ +LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 95932333a48b..d06190fa5082 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -45,11 +45,14 @@ #include <linux/hash.h> #include <linux/ftrace.h> #include <linux/stringify.h> +#include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/nmi.h> +#include <linux/rcupdate.h> +#include <linux/kprobes.h> #include <asm/sections.h> @@ -81,6 +84,7 @@ module_param(lock_stat, int, 0644); * code to recurse back into the lockdep code... */ static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { @@ -130,13 +134,17 @@ static inline int debug_locks_off_graph_unlock(void) unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); /* * All data structures here are protected by the global debug_lock. * - * Mutex key structs only get allocated, once during bootup, and never - * get freed - this significantly simplifies the debugging code. + * nr_lock_classes is the number of elements of lock_classes[] that is + * in use. */ +#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) +#define KEYHASH_SIZE (1UL << KEYHASH_BITS) +static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; #ifndef CONFIG_DEBUG_LOCKDEP static @@ -277,11 +285,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock) #endif /* - * We keep a global list of all lock classes. The list only grows, - * never shrinks. The list is only accessed with the lockdep - * spinlock lock held. + * We keep a global list of all lock classes. The list is only accessed with + * the lockdep spinlock lock held. free_lock_classes is a list with free + * elements. These elements are linked together by the lock_entry member in + * struct lock_class. */ LIST_HEAD(all_lock_classes); +static LIST_HEAD(free_lock_classes); + +/** + * struct pending_free - information about data structures about to be freed + * @zapped: Head of a list with struct lock_class elements. 
+ * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements + * are about to be freed. + */ +struct pending_free { + struct list_head zapped; + DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS); +}; + +/** + * struct delayed_free - data structures used for delayed freeing + * + * A data structure for delayed freeing of data structures that may be + * accessed by RCU readers at the time these were freed. + * + * @rcu_head: Used to schedule an RCU callback for freeing data structures. + * @index: Index of @pf to which freed data structures are added. + * @scheduled: Whether or not an RCU callback has been scheduled. + * @pf: Array with information about data structures about to be freed. + */ +static struct delayed_free { + struct rcu_head rcu_head; + int index; + int scheduled; + struct pending_free pf[2]; +} delayed_free; /* * The lockdep classes are in a hash-table as well, for fast lookup: @@ -331,6 +370,11 @@ void lockdep_on(void) } EXPORT_SYMBOL(lockdep_on); +void lockdep_set_selftest_task(struct task_struct *task) +{ + lockdep_selftest_task_struct = task; +} + /* * Debugging switches: */ @@ -390,29 +434,14 @@ static void print_lockdep_off(const char *bug_msg) #endif } -static int save_trace(struct stack_trace *trace) +static int save_trace(struct lock_trace *trace) { - trace->nr_entries = 0; - trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->entries = stack_trace + nr_stack_trace_entries; - - trace->skip = 3; - - save_stack_trace(trace); - - /* - * Some daft arches put -1 at the end to indicate its a full trace. - * - * <rant> this is buggy anyway, since it takes a whole extra entry so a - * complete trace that maxes out the entries provided will be reported - * as incomplete, friggin useless </rant> - */ - if (trace->nr_entries != 0 && - trace->entries[trace->nr_entries-1] == ULONG_MAX) - trace->nr_entries--; - - trace->max_entries = trace->nr_entries; + unsigned long *entries = stack_trace + nr_stack_trace_entries; + unsigned int max_entries; + trace->offset = nr_stack_trace_entries; + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + trace->nr_entries = stack_trace_save(entries, max_entries, 3); nr_stack_trace_entries += trace->nr_entries; if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { @@ -472,11 +501,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) { char c = '.'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) c = '+'; if (class->usage_mask & lock_flag(bit)) { c = '-'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) c = '?'; } @@ -599,12 +628,15 @@ static int very_verbose(struct lock_class *class) * Is this the address of a static object: */ #ifdef __KERNEL__ -static int static_obj(void *obj) +static int static_obj(const void *obj) { unsigned long start = (unsigned long) &_stext, end = (unsigned long) &_end, addr = (unsigned long) obj; + if (arch_is_kernel_initmem_freed(addr)) + return 0; + /* * static variable? */ @@ -716,6 +748,17 @@ static bool assign_lock_key(struct lockdep_map *lock) { unsigned long can_addr, addr = (unsigned long)lock; +#ifdef __KERNEL__ + /* + * lockdep_free_key_range() assumes that struct lock_class_key + * objects do not overlap. 
Since we use the address of lock + * objects as class key for static objects, check whether the + * size of lock_class_key objects does not exceed the size of + * the smallest lock object. + */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t)); +#endif + if (__is_kernel_percpu_address(addr, &can_addr)) lock->key = (void *)can_addr; else if (__is_module_percpu_address(addr, &can_addr)) @@ -735,6 +778,289 @@ static bool assign_lock_key(struct lockdep_map *lock) return true; } +#ifdef CONFIG_DEBUG_LOCKDEP + +/* Check whether element @e occurs in list @h */ +static bool in_list(struct list_head *e, struct list_head *h) +{ + struct list_head *f; + + list_for_each(f, h) { + if (e == f) + return true; + } + + return false; +} + +/* + * Check whether entry @e occurs in any of the locks_after or locks_before + * lists. + */ +static bool in_any_class_list(struct list_head *e) +{ + struct lock_class *class; + int i; + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (in_list(e, &class->locks_after) || + in_list(e, &class->locks_before)) + return true; + } + return false; +} + +static bool class_lock_list_valid(struct lock_class *c, struct list_head *h) +{ + struct lock_list *e; + + list_for_each_entry(e, h, entry) { + if (e->links_to != c) { + printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s", + c->name ? : "(?)", + (unsigned long)(e - list_entries), + e->links_to && e->links_to->name ? + e->links_to->name : "(?)", + e->class && e->class->name ? e->class->name : + "(?)"); + return false; + } + } + return true; +} + +#ifdef CONFIG_PROVE_LOCKING +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +#endif + +static bool check_lock_chain_key(struct lock_chain *chain) +{ +#ifdef CONFIG_PROVE_LOCKING + u64 chain_key = 0; + int i; + + for (i = chain->base; i < chain->base + chain->depth; i++) + chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + /* + * The 'unsigned long long' casts avoid that a compiler warning + * is reported when building tools/lib/lockdep. + */ + if (chain->chain_key != chain_key) { + printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n", + (unsigned long long)(chain - lock_chains), + (unsigned long long)chain->chain_key, + (unsigned long long)chain_key); + return false; + } +#endif + return true; +} + +static bool in_any_zapped_class_list(struct lock_class *class) +{ + struct pending_free *pf; + int i; + + for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) { + if (in_list(&class->lock_entry, &pf->zapped)) + return true; + } + + return false; +} + +static bool __check_data_structures(void) +{ + struct lock_class *class; + struct lock_chain *chain; + struct hlist_head *head; + struct lock_list *e; + int i; + + /* Check whether all classes occur in a lock list. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!in_list(&class->lock_entry, &all_lock_classes) && + !in_list(&class->lock_entry, &free_lock_classes) && + !in_any_zapped_class_list(class)) { + printk(KERN_INFO "class %px/%s is not in any class list\n", + class, class->name ? : "(?)"); + return false; + } + } + + /* Check whether all classes have valid lock lists. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!class_lock_list_valid(class, &class->locks_before)) + return false; + if (!class_lock_list_valid(class, &class->locks_after)) + return false; + } + + /* Check the chain_key of all lock chains. 
*/ + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + if (!check_lock_chain_key(chain)) + return false; + } + } + + /* + * Check whether all list entries that are in use occur in a class + * lock list. + */ + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (!in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class->name ? : "(?)", + e->links_to->name ? : "(?)"); + return false; + } + } + + /* + * Check whether all list entries that are not in use do not occur in + * a class lock list. + */ + for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class && e->class->name ? e->class->name : + "(?)", + e->links_to && e->links_to->name ? + e->links_to->name : "(?)"); + return false; + } + } + + return true; +} + +int check_consistency = 0; +module_param(check_consistency, int, 0644); + +static void check_data_structures(void) +{ + static bool once = false; + + if (check_consistency && !once) { + if (!__check_data_structures()) { + once = true; + WARN_ON(once); + } + } +} + +#else /* CONFIG_DEBUG_LOCKDEP */ + +static inline void check_data_structures(void) { } + +#endif /* CONFIG_DEBUG_LOCKDEP */ + +/* + * Initialize the lock_classes[] array elements, the free_lock_classes list + * and also the delayed_free structure. + */ +static void init_data_structures_once(void) +{ + static bool ds_initialized, rcu_head_initialized; + int i; + + if (likely(rcu_head_initialized)) + return; + + if (system_state >= SYSTEM_SCHEDULING) { + init_rcu_head(&delayed_free.rcu_head); + rcu_head_initialized = true; + } + + if (ds_initialized) + return; + + ds_initialized = true; + + INIT_LIST_HEAD(&delayed_free.pf[0].zapped); + INIT_LIST_HEAD(&delayed_free.pf[1].zapped); + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes); + INIT_LIST_HEAD(&lock_classes[i].locks_after); + INIT_LIST_HEAD(&lock_classes[i].locks_before); + } +} + +static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) +{ + unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS); + + return lock_keys_hash + hash; +} + +/* Register a dynamically allocated key. */ +void lockdep_register_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + unsigned long flags; + + if (WARN_ON_ONCE(static_obj(key))) + return; + hash_head = keyhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) + goto restore_irqs; + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (WARN_ON_ONCE(k == key)) + goto out_unlock; + } + hlist_add_head_rcu(&key->hash_entry, hash_head); +out_unlock: + graph_unlock(); +restore_irqs: + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_register_key); + +/* Check whether a key has been registered as a dynamic key. */ +static bool is_dynamic_key(const struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + bool found = false; + + if (WARN_ON_ONCE(static_obj(key))) + return false; + + /* + * If lock debugging is disabled lock_keys_hash[] may contain + * pointers to memory that has already been freed. 
Avoid triggering + * a use-after-free in that case by returning early. + */ + if (!debug_locks) + return true; + + hash_head = keyhashentry(key); + + rcu_read_lock(); + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + found = true; + break; + } + } + rcu_read_unlock(); + + return found; +} + /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -756,7 +1082,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (!lock->key) { if (!assign_lock_key(lock)) return NULL; - } else if (!static_obj(lock->key)) { + } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) { return NULL; } @@ -775,11 +1101,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) goto out_unlock_set; } - /* - * Allocate a new key from the static array, and add it to - * the hash: - */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + init_data_structures_once(); + + /* Allocate a new lock class and add it to the hash. */ + class = list_first_entry_or_null(&free_lock_classes, typeof(*class), + lock_entry); + if (!class) { if (!debug_locks_off_graph_unlock()) { return NULL; } @@ -788,13 +1115,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) dump_stack(); return NULL; } - class = lock_classes + nr_lock_classes++; + nr_lock_classes++; debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; class->subclass = subclass; - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + WARN_ON_ONCE(!list_empty(&class->locks_after)); class->name_version = count_matching_names(class); /* * We use RCU's safe list-add method to make @@ -802,9 +1129,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) */ hlist_add_head_rcu(&class->hash_entry, hash_head); /* - * Add it to the global list of classes: + * Remove the class from the free list and add it to the global list + * of classes. 
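With lockdep_register_key() above, code whose locks (and therefore keys) live in dynamically allocated memory can finally pass lockdep's static_obj() check. A sketch of the expected usage pattern; the struct and init function are invented, and lockdep_unregister_key() is assumed to be the matching teardown call introduced elsewhere in this series:

#include <linux/lockdep.h>
#include <linux/spinlock.h>

struct foo {
	spinlock_t lock;
	struct lock_class_key key;	/* dynamically registered key */
};

static void foo_init(struct foo *f)
{
	lockdep_register_key(&f->key);		/* hash the key address */
	spin_lock_init(&f->lock);
	lockdep_set_class(&f->lock, &f->key);	/* key need not be static */
}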
*/ - list_add_tail(&class->lock_entry, &all_lock_classes); + list_move_tail(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -845,7 +1173,10 @@ out_set_class_cache: */ static struct lock_list *alloc_list_entry(void) { - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + int idx = find_first_zero_bit(list_entries_in_use, + ARRAY_SIZE(list_entries)); + + if (idx >= ARRAY_SIZE(list_entries)) { if (!debug_locks_off_graph_unlock()) return NULL; @@ -853,15 +1184,18 @@ static struct lock_list *alloc_list_entry(void) dump_stack(); return NULL; } - return list_entries + nr_list_entries++; + nr_list_entries++; + __set_bit(idx, list_entries_in_use); + return list_entries + idx; } /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *this, struct list_head *head, +static int add_lock_to_list(struct lock_class *this, + struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct stack_trace *trace) + struct lock_trace *trace) { struct lock_list *entry; /* @@ -873,6 +1207,7 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head, return 0; entry->class = this; + entry->links_to = links_to; entry->distance = distance; entry->trace = *trace; /* @@ -955,7 +1290,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ lock->parent = parent; lock->class->dep_gen_id = lockdep_dependency_gen_id; } @@ -965,7 +1300,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1079,6 +1414,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry, * checking. 
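alloc_list_entry() above replaces the old never-shrinking bump allocator with an in-use bitmap, so that zap_class() can hand entries back. The generic shape of that allocator as a standalone sketch (NR_SLOTS and both helpers are illustrative):

#include <linux/bitmap.h>

#define NR_SLOTS 64

static DECLARE_BITMAP(slot_in_use, NR_SLOTS);

/* Allocate the first free slot; the caller holds the serializing lock. */
static int alloc_slot(void)
{
	int idx = find_first_zero_bit(slot_in_use, NR_SLOTS);

	if (idx >= NR_SLOTS)
		return -1;			/* pool exhausted */
	__set_bit(idx, slot_in_use);
	return idx;
}

/* Freeing is just clearing the bit; the slot becomes reusable. */
static void free_slot(int idx)
{
	__clear_bit(idx, slot_in_use);
}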
*/ +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ + unsigned long *entries = stack_trace + trace->offset; + + stack_trace_print(entries, trace->nr_entries, spaces); +} + /* * Print a dependency chain entry (this is only done when a deadlock * has been detected): @@ -1091,8 +1433,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_stack_trace(&target->trace, 6); - + print_lock_trace(&target->trace, 6); return 0; } @@ -1186,10 +1527,9 @@ static inline int class_equal(struct lock_list *entry, void *data) } static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt, - struct stack_trace *trace) + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1329,19 +1669,25 @@ check_redundant(struct lock_list *root, struct lock_class *target, } #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +static inline int usage_accumulate(struct lock_list *entry, void *mask) +{ + *(unsigned long *)mask |= entry->class->usage_mask; + + return 0; +} + /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static inline int usage_match(struct lock_list *entry, void *bit) +static inline int usage_match(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); + return entry->class->usage_mask & *(unsigned long *)mask; } - - /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. @@ -1353,14 +1699,14 @@ static inline int usage_match(struct lock_list *entry, void *bit) * Return <0 on error. */ static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1376,14 +1722,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, * Return <0 on error. */ static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1405,7 +1751,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_stack_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces + bit, len); } } printk("%*s }\n", depth, ""); @@ -1430,7 +1776,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... 
acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); + print_lock_trace(&entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1543,14 +1889,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -1575,39 +1921,6 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; } -static int -check_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit_backwards, - enum lock_usage_bit bit_forwards, const char *irqclass) -{ - int ret; - struct lock_list this, that; - struct lock_list *uninitialized_var(target_entry); - struct lock_list *uninitialized_var(target_entry1); - - this.parent = NULL; - - this.class = hlock_class(prev); - ret = find_usage_backwards(&this, bit_backwards, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - that.parent = NULL; - that.class = hlock_class(next); - ret = find_usage_forwards(&that, bit_forwards, &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - bit_backwards, bit_forwards, irqclass); -} - static const char *state_names[] = { #define LOCKDEP_STATE(__STATE) \ __stringify(__STATE), @@ -1624,70 +1937,184 @@ static const char *state_rnames[] = { static inline const char *state_name(enum lock_usage_bit bit) { - return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; + if (bit & LOCK_USAGE_READ_MASK) + return state_rnames[bit >> LOCK_USAGE_DIR_MASK]; + else + return state_names[bit >> LOCK_USAGE_DIR_MASK]; } +/* + * The bit number is encoded like: + * + * bit0: 0 exclusive, 1 read lock + * bit1: 0 used in irq, 1 irq enabled + * bit2-n: state + */ static int exclusive_bit(int new_bit) { - /* - * USED_IN - * USED_IN_READ - * ENABLED - * ENABLED_READ - * - * bit 0 - write/read - * bit 1 - used_in/enabled - * bit 2+ state - */ - - int state = new_bit & ~3; - int dir = new_bit & 2; + int state = new_bit & LOCK_USAGE_STATE_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * keep state, bit flip the direction and strip read. */ - return state | (dir ^ 2); + return state | (dir ^ LOCK_USAGE_DIR_MASK); +} + +/* + * Observe that when given a bitmask where each bitnr is encoded as above, a + * right shift of the mask transforms the individual bitnrs as -1 and + * conversely, a left shift transforms into +1 for the individual bitnrs. + * + * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can + * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0) + * instead by subtracting the bit number by 2, or shifting the mask right by 2. + * + * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2. 
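A concrete instance of that encoding (a sketch: the numbers assume the usual LOCKDEP_STATE() expansion with HARDIRQ as the first state, i.e. USED_IN_HARDIRQ == 0, USED_IN_HARDIRQ_READ == 1, ENABLED_HARDIRQ == 2):

/*
 * exclusive_bit(LOCK_USED_IN_HARDIRQ)      == (0 & ~3) | (0 ^ 2) == 2
 * exclusive_bit(LOCK_USED_IN_HARDIRQ_READ) == (1 & ~3) | (0 ^ 2) == 2
 *
 * i.e. both the write and the read variant of "used in hardirq"
 * conflict with LOCK_ENABLED_HARDIRQ, "taken with hardirqs enabled".
 */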
+ * + * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is + * all bits set) and recompose with bitnr1 flipped. + */ +static unsigned long invert_dir_mask(unsigned long mask) +{ + unsigned long excl = 0; + + /* Invert dir */ + excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK; + excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK; + + return excl; } +/* + * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all + * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). + * And then mask out all bitnr0. + */ +static unsigned long exclusive_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Strip read */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; + excl &= ~LOCKF_IRQ_READ; + + return excl; +} + +/* + * Retrieve the _possible_ original mask to which @mask is + * exclusive. Ie: this is the opposite of exclusive_mask(). + * Note that 2 possible original bits can match an exclusive + * bit: one has LOCK_USAGE_READ_MASK set, the other has it + * cleared. So both are returned for each exclusive bit. + */ +static unsigned long original_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; + + return excl; +} + +/* + * Find the first pair of bit match between an original + * usage mask and an exclusive usage mask. + */ +static int find_exclusive_match(unsigned long mask, + unsigned long excl_mask, + enum lock_usage_bit *bitp, + enum lock_usage_bit *excl_bitp) +{ + int bit, excl; + + for_each_set_bit(bit, &mask, LOCK_USED) { + excl = exclusive_bit(bit); + if (excl_mask & lock_flag(excl)) { + *bitp = bit; + *excl_bitp = excl; + return 0; + } + } + return -1; +} + +/* + * Prove that the new dependency does not connect a hardirq-safe(-read) + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit) + struct held_lock *next) { + unsigned long usage_mask = 0, forward_mask, backward_mask; + enum lock_usage_bit forward_bit = 0, backward_bit = 0; + struct lock_list *uninitialized_var(target_entry1); + struct lock_list *uninitialized_var(target_entry); + struct lock_list this, that; + int ret; + /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: + * Step 1: gather all hard/soft IRQs usages backward in an + * accumulated usage mask. */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; + this.parent = NULL; + this.class = hlock_class(prev); + + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); + if (ret < 0) + return print_bfs_bug(ret); - bit++; /* _READ */ + usage_mask &= LOCKF_USED_IN_IRQ_ALL; + if (!usage_mask) + return 1; /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: + * Step 2: find exclusive uses forward that match the previous + * backward accumulated mask. 
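A worked instance of the whole mask dance (values illustrative; the LOCKF_* names are the per-bit masks corresponding to the usage bits):

/*
 * Suppose step 1 accumulated
 *	usage_mask = LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ_READ;
 * Then exclusive_mask() flips the direction bits and folds the read
 * variant into the write one:
 *	forward_mask = LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ;
 * and original_mask() applied to a forward match widens back to both
 * variants:
 *	LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_HARDIRQ_READ |
 *	LOCKF_USED_IN_SOFTIRQ | LOCKF_USED_IN_SOFTIRQ_READ
 */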
*/ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; + forward_mask = exclusive_mask(usage_mask); - return 1; -} + that.parent = NULL; + that.class = hlock_class(next); -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ -#define LOCKDEP_STATE(__STATE) \ - if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ - return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE + ret = find_usage_forwards(&that, forward_mask, &target_entry1); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; - return 1; + /* + * Step 3: we found a bad match! Now retrieve a lock from the backward + * list whose usage mask matches the exclusive usage mask from the + * lock found on the forward list. + */ + backward_mask = original_mask(target_entry1->class->usage_mask); + + ret = find_usage_backwards(&this, backward_mask, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (DEBUG_LOCKS_WARN_ON(ret == 1)) + return 1; + + /* + * Step 4: narrow down to a pair of incompatible usage bits + * and report it. + */ + ret = find_exclusive_match(target_entry->class->usage_mask, + target_entry1->class->usage_mask, + &backward_bit, &forward_bit); + if (DEBUG_LOCKS_WARN_ON(ret == -1)) + return 1; + + return print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + backward_bit, forward_bit, + state_name(backward_bit)); } static void inc_chains(void) @@ -1704,9 +2131,8 @@ static void inc_chains(void) #else -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) +static inline int check_irq_usage(struct task_struct *curr, + struct held_lock *prev, struct held_lock *next) { return 1; } @@ -1834,14 +2260,31 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct stack_trace *trace, - int (*save)(struct stack_trace *trace)) + struct held_lock *next, int distance, struct lock_trace *trace) { struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; struct lock_list this; int ret; + if (!hlock_class(prev)->key || !hlock_class(next)->key) { + /* + * The warning statements below may trigger a use-after-free + * of the class name. It is better to trigger a use-after free + * and to have the class name most of the time instead of not + * having the class name available. + */ + WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(prev), + hlock_class(prev)->name); + WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(next), + hlock_class(next)->name); + return 2; + } + /* * Prove that the new <prev> -> <next> dependency would not * create a circular dependency in the graph. (We do this by @@ -1855,20 +2298,20 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); if (unlikely(!ret)) { - if (!trace->entries) { + if (!trace->nr_entries) { /* - * If @save fails here, the printing might trigger - * a WARN but because of the !nr_entries it should - * not do bad things. + * If save_trace fails here, the printing might + * trigger a WARN but because of the !nr_entries it + * should not do bad things. 
*/ - save(trace); + save_trace(trace); } - return print_circular_bug(&this, target_entry, next, prev, trace); + return print_circular_bug(&this, target_entry, next, prev); } else if (unlikely(ret < 0)) return print_bfs_bug(ret); - if (!check_prev_add_irq(curr, prev, next)) + if (!check_irq_usage(curr, prev, next)) return 0; /* @@ -1911,21 +2354,21 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return print_bfs_bug(ret); - if (!trace->entries && !save(trace)) + if (!trace->nr_entries && !save_trace(trace)) return 0; /* * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(next), + ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, next->acquire_ip, distance, trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(prev), + ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, next->acquire_ip, distance, trace); if (!ret) @@ -1943,14 +2386,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { + struct lock_trace trace = { .nr_entries = 0 }; int depth = curr->lockdep_depth; struct held_lock *hlock; - struct stack_trace trace = { - .nr_entries = 0, - .max_entries = 0, - .entries = NULL, - .skip = 0, - }; /* * Debugging checks. @@ -1976,7 +2414,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + int ret = check_prev_add(curr, hlock, next, distance, + &trace); if (!ret) return 0; @@ -2018,8 +2457,8 @@ out_bug: return 0; } -unsigned long nr_lock_chains; struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS); int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; @@ -2153,6 +2592,33 @@ static int check_no_collision(struct task_struct *curr, } /* + * Given an index that is >= -1, return the index of the next lock chain. + * Return -2 if there is no next lock chain. + */ +long lockdep_next_lockchain(long i) +{ + i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1); + return i < ARRAY_SIZE(lock_chains) ? i : -2; +} + +unsigned long lock_chain_count(void) +{ + return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains)); +} + +/* Must be called with the graph lock held. */ +static struct lock_chain *alloc_lock_chain(void) +{ + int idx = find_first_zero_bit(lock_chains_in_use, + ARRAY_SIZE(lock_chains)); + + if (unlikely(idx >= ARRAY_SIZE(lock_chains))) + return NULL; + __set_bit(idx, lock_chains_in_use); + return lock_chains + idx; +} + +/* * Adds a dependency chain into chain hashtable. And must be called with * graph_lock held. * @@ -2169,19 +2635,15 @@ static inline int add_chain_cache(struct task_struct *curr, int i, j; /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - - /* - * We might need to take the graph lock, ensure we've got IRQs + * The caller must hold the graph lock, ensure we've got IRQs * disabled to make this an IRQ-safe lock.. for recursion reasons * lockdep won't complain about its own locking errors. 
*/ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + chain = alloc_lock_chain(); + if (!chain) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2189,7 +2651,6 @@ static inline int add_chain_cache(struct task_struct *curr, dump_stack(); return 0; } - chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; i = get_first_held_lock(curr, hlock); @@ -2206,16 +2667,8 @@ static inline int add_chain_cache(struct task_struct *curr, chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; - } - - if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS) nr_chain_hlocks += chain->depth; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * Important for check_no_collision(). - */ - if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { + } else { if (!debug_locks_off_graph_unlock()) return 0; @@ -2223,7 +2676,6 @@ static inline int add_chain_cache(struct task_struct *curr, dump_stack(); return 0; } -#endif hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); @@ -2233,19 +2685,16 @@ static inline int add_chain_cache(struct task_struct *curr, } /* - * Look up a dependency chain. + * Look up a dependency chain. Must be called with either the graph lock or + * the RCU read lock held. */ static inline struct lock_chain *lookup_chain_cache(u64 chain_key) { struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - /* - * We can walk it lock-free, because entries only get added - * to the hash: - */ hlist_for_each_entry_rcu(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { + if (READ_ONCE(chain->chain_key) == chain_key) { debug_atomic_inc(chain_lookup_hits); return chain; } @@ -2367,6 +2816,10 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } + +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ +} #endif /* @@ -2420,6 +2873,12 @@ static void check_chain_key(struct task_struct *curr) #endif } +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + + static void print_usage_bug_scenario(struct held_lock *lock) { @@ -2463,7 +2922,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); @@ -2489,10 +2948,6 @@ valid_state(struct task_struct *curr, struct held_lock *this, return 1; } -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2572,7 +3027,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_forwards(&root, bit, &target_entry); + ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -2596,7 +3051,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_backwards(&root, bit, &target_entry); + ret = 
find_usage_backwards(&root, lock_flag(bit), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -2651,7 +3106,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = { static inline int state_verbose(enum lock_usage_bit bit, struct lock_class *class) { - return state_verbose_f[bit >> 2](class); + return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class); } typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, @@ -2662,8 +3117,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { int excl_bit = exclusive_bit(new_bit); - int read = new_bit & 1; - int dir = new_bit & 2; + int read = new_bit & LOCK_USAGE_READ_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * mark USED_IN has to look forwards -- to ensure no dependency @@ -2687,19 +3142,19 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, * states. */ if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~1))) + !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) return 0; /* * Check for read in write conflicts */ if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + 1)) + if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK)) return 0; if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + 1, - state_name(new_bit + 1))) + !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, + state_name(new_bit + LOCK_USAGE_READ_MASK))) return 0; } @@ -2709,35 +3164,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 1; } -enum mark_type { -#define LOCKDEP_STATE(__STATE) __STATE, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - /* * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, enum mark_type mark) +mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) { - enum lock_usage_bit usage_bit; struct held_lock *hlock; int i; for (i = 0; i < curr->lockdep_depth; i++) { + enum lock_usage_bit hlock_bit = base_bit; hlock = curr->held_locks + i; - usage_bit = 2 + (mark << 2); /* ENABLED */ if (hlock->read) - usage_bit += 1; /* READ */ + hlock_bit += LOCK_USAGE_READ_MASK; - BUG_ON(usage_bit >= LOCK_USAGE_STATES); + BUG_ON(hlock_bit >= LOCK_USAGE_STATES); if (!hlock->check) continue; - if (!mark_lock(curr, hlock, usage_bit)) + if (!mark_lock(curr, hlock, hlock_bit)) return 0; } @@ -2758,7 +3206,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, HARDIRQ)) + if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ)) return; /* * If we have softirqs enabled, then set the usage @@ -2766,7 +3214,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, SOFTIRQ)) + if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) return; curr->hardirq_enable_ip = ip; @@ -2800,7 +3248,7 @@ void lockdep_hardirqs_on(unsigned long ip) /* * See the fine text that goes along with this variable definition. 
*/ - if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled)) return; /* @@ -2814,6 +3262,7 @@ void lockdep_hardirqs_on(unsigned long ip) __trace_hardirqs_on_caller(ip); current->lockdep_recursion = 0; } +NOKPROBE_SYMBOL(lockdep_hardirqs_on); /* * Hardirqs were disabled: @@ -2843,6 +3292,7 @@ void lockdep_hardirqs_off(unsigned long ip) } else debug_atomic_inc(redundant_hardirqs_off); } +NOKPROBE_SYMBOL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -2880,7 +3330,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, SOFTIRQ); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); current->lockdep_recursion = 0; } @@ -3119,13 +3569,12 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (DEBUG_LOCKS_WARN_ON(!key)) return; /* - * Sanity check, the lock-class key must be persistent: + * Sanity check, the lock-class key must either have been allocated + * statically or must have been registered as a dynamic key. */ - if (!static_obj(key)) { - printk("BUG: key %px not in .data!\n", key); - /* - * What it says above ^^^^^, I suggest you read it. - */ + if (!static_obj(key) && !is_dynamic_key(key)) { + if (debug_locks) + printk(KERN_ERR "BUG: key %px has not been registered!\n", key); DEBUG_LOCKS_WARN_ON(1); return; } @@ -3335,6 +3784,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (nest_lock && !__lock_is_held(nest_lock, -1)) return print_lock_nested_lock_not_held(curr, hlock, ip); + if (!debug_locks_silent) { + WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); + WARN_ON_ONCE(!hlock_class(hlock)->key); + } + if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; @@ -3497,6 +3951,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3535,6 +3992,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3650,7 +4110,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) return 0; } -static int __lock_is_held(const struct lockdep_map *lock, int read) +static nokprobe_inline +int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3883,6 +4344,7 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) return ret; } EXPORT_SYMBOL_GPL(lock_is_held_type); +NOKPROBE_SYMBOL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { @@ -4123,29 +4585,131 @@ void lockdep_reset(void) raw_local_irq_restore(flags); } +/* Remove a class from a lock chain. Must be called with the graph lock held. */ +static void remove_class_from_lock_chain(struct pending_free *pf, + struct lock_chain *chain, + struct lock_class *class) +{ +#ifdef CONFIG_PROVE_LOCKING + struct lock_chain *new_chain; + u64 chain_key; + int i; + + for (i = chain->base; i < chain->base + chain->depth; i++) { + if (chain_hlocks[i] != class - lock_classes) + continue; + /* The code below leaks one chain_hlock[] entry. 
*/ + if (--chain->depth > 0) { + memmove(&chain_hlocks[i], &chain_hlocks[i + 1], + (chain->base + chain->depth - i) * + sizeof(chain_hlocks[0])); + } + /* + * Each lock class occurs at most once in a lock chain so once + * we found a match we can break out of this loop. + */ + goto recalc; + } + /* Since the chain has not been modified, return. */ + return; + +recalc: + chain_key = 0; + for (i = chain->base; i < chain->base + chain->depth; i++) + chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + if (chain->depth && chain->chain_key == chain_key) + return; + /* Overwrite the chain key for concurrent RCU readers. */ + WRITE_ONCE(chain->chain_key, chain_key); + /* + * Note: calling hlist_del_rcu() from inside a + * hlist_for_each_entry_rcu() loop is safe. + */ + hlist_del_rcu(&chain->entry); + __set_bit(chain - lock_chains, pf->lock_chains_being_freed); + if (chain->depth == 0) + return; + /* + * If the modified lock chain matches an existing lock chain, drop + * the modified lock chain. + */ + if (lookup_chain_cache(chain_key)) + return; + new_chain = alloc_lock_chain(); + if (WARN_ON_ONCE(!new_chain)) { + debug_locks_off(); + return; + } + *new_chain = *chain; + hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key)); +#endif +} + +/* Must be called with the graph lock held. */ +static void remove_class_from_lock_chains(struct pending_free *pf, + struct lock_class *class) +{ + struct lock_chain *chain; + struct hlist_head *head; + int i; + + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + remove_class_from_lock_chain(pf, chain, class); + } + } +} + /* * Remove all references to a lock class. The caller must hold the graph lock. */ -static void zap_class(struct lock_class *class) +static void zap_class(struct pending_free *pf, struct lock_class *class) { + struct lock_list *entry; int i; + WARN_ON_ONCE(!class->key); + /* * Remove all dependencies this lock is * involved in: */ - for (i = 0; i < nr_list_entries; i++) { - if (list_entries[i].class == class) - list_del_rcu(&list_entries[i].entry); + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + entry = list_entries + i; + if (entry->class != class && entry->links_to != class) + continue; + __clear_bit(i, list_entries_in_use); + nr_list_entries--; + list_del_rcu(&entry->entry); } - /* - * Unhash the class and remove it from the all_lock_classes list: - */ - hlist_del_rcu(&class->hash_entry); - list_del(&class->lock_entry); + if (list_empty(&class->locks_after) && + list_empty(&class->locks_before)) { + list_move_tail(&class->lock_entry, &pf->zapped); + hlist_del_rcu(&class->hash_entry); + WRITE_ONCE(class->key, NULL); + WRITE_ONCE(class->name, NULL); + nr_lock_classes--; + } else { + WARN_ONCE(true, "%s() failed for class %s\n", __func__, + class->name); + } + + remove_class_from_lock_chains(pf, class); +} - RCU_INIT_POINTER(class->key, NULL); - RCU_INIT_POINTER(class->name, NULL); +static void reinit_class(struct lock_class *class) +{ + void *const p = class; + const unsigned int offset = offsetof(struct lock_class, key); + + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + memset(p + offset, 0, sizeof(*class) - offset); + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); } static inline int within(const void *addr, void *start, 
unsigned long size) @@ -4153,55 +4717,171 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +static bool inside_selftest(void) +{ + return current == lockdep_selftest_task_struct; +} + +/* The caller must hold the graph lock. */ +static struct pending_free *get_pending_free(void) +{ + return delayed_free.pf + delayed_free.index; +} + +static void free_zapped_rcu(struct rcu_head *cb); + /* - * Used in module.c to remove lock classes from memory that is going to be - * freed; and possibly re-used by other modules. - * - * We will have had one sync_sched() before getting here, so we're guaranteed - * nobody will look up these exact classes -- they're properly dead but still - * allocated. + * Schedule an RCU callback if no RCU callback is pending. Must be called with + * the graph lock held. */ -void lockdep_free_key_range(void *start, unsigned long size) +static void call_rcu_zapped(struct pending_free *pf) +{ + WARN_ON_ONCE(inside_selftest()); + + if (list_empty(&pf->zapped)) + return; + + if (delayed_free.scheduled) + return; + + delayed_free.scheduled = true; + + WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); + delayed_free.index ^= 1; + + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); +} + +/* The caller must hold the graph lock. May be called from RCU context. */ +static void __free_zapped_classes(struct pending_free *pf) { struct lock_class *class; - struct hlist_head *head; + + check_data_structures(); + + list_for_each_entry(class, &pf->zapped, lock_entry) + reinit_class(class); + + list_splice_init(&pf->zapped, &free_lock_classes); + +#ifdef CONFIG_PROVE_LOCKING + bitmap_andnot(lock_chains_in_use, lock_chains_in_use, + pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains)); + bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains)); +#endif +} + +static void free_zapped_rcu(struct rcu_head *ch) +{ + struct pending_free *pf; unsigned long flags; - int i; - int locked; + + if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) + return; raw_local_irq_save(flags); - locked = graph_lock(); + arch_spin_lock(&lockdep_lock); + current->lockdep_recursion = 1; + + /* closed head */ + pf = delayed_free.pf + (delayed_free.index ^ 1); + __free_zapped_classes(pf); + delayed_free.scheduled = false; /* - * Unhash all classes that were created by this module: + * If there's anything on the open list, close and start a new callback. */ + call_rcu_zapped(delayed_free.pf + delayed_free.index); + + current->lockdep_recursion = 0; + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +/* + * Remove all lock classes from the class hash table and from the + * all_lock_classes list whose key or name is in the address range [start, + * start + size). Move these lock classes to the zapped_classes list. Must + * be called with the graph lock held. + */ +static void __lockdep_free_key_range(struct pending_free *pf, void *start, + unsigned long size) +{ + struct lock_class *class; + struct hlist_head *head; + int i; + + /* Unhash all classes that were created by a module. 
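
The free_zapped_rcu()/call_rcu_zapped() machinery above is a double-buffered RCU deferral: one pending_free entry collects newly zapped classes while the other drains in the callback, and at most one callback is in flight at any time. The same pattern distilled into a self-contained sketch (all names are illustrative, not the kernel's; drain() stands in for the actual freeing):

static void free_cb(struct rcu_head *head);

static struct {
	struct rcu_head rcu_head;
	int index;			/* buffer currently collecting */
	bool scheduled;			/* RCU callback in flight? */
	struct list_head zapped[2];	/* initialised elsewhere */
} df;

/* Caller holds the lock protecting df. */
static void schedule_free(void)
{
	if (list_empty(&df.zapped[df.index]) || df.scheduled)
		return;
	df.scheduled = true;
	df.index ^= 1;			/* close this buffer; collect in the other */
	call_rcu(&df.rcu_head, free_cb);
}

static void free_cb(struct rcu_head *head)
{
	/* The real callback re-takes the protecting lock first. */
	struct list_head *closed = &df.zapped[df.index ^ 1];

	drain(closed);			/* illustrative: free everything queued */
	df.scheduled = false;
	schedule_free();		/* re-arm if the open buffer filled meanwhile */
}

The index flip is what guarantees a full grace period between a class being zapped and its memory being handed back for reuse.
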
*/ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; hlist_for_each_entry_rcu(class, head, hash_entry) { - if (within(class->key, start, size)) - zap_class(class); - else if (within(class->name, start, size)) - zap_class(class); + if (!within(class->key, start, size) && + !within(class->name, start, size)) + continue; + zap_class(pf, class); } } +} - if (locked) - graph_unlock(); +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one synchronize_rcu() before getting here, so we're + * guaranteed nobody will look up these exact classes -- they're properly dead + * but still allocated. + */ +static void lockdep_free_key_range_reg(void *start, unsigned long size) +{ + struct pending_free *pf; + unsigned long flags; + + init_data_structures_once(); + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + current->lockdep_recursion = 1; + pf = get_pending_free(); + __lockdep_free_key_range(pf, start, size); + call_rcu_zapped(pf); + current->lockdep_recursion = 0; + arch_spin_unlock(&lockdep_lock); raw_local_irq_restore(flags); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. - * - * sync_sched() is sufficient because the read-side is IRQ disable. */ synchronize_rcu(); +} - /* - * XXX at this point we could return the resources to the pool; - * instead we leak them. We would need to change to bitmap allocators - * instead of the linear allocators we have now. - */ +/* + * Free all lockdep keys in the range [start, start+size). Does not sleep. + * Ignores debug_locks. Must only be used by the lockdep selftests. + */ +static void lockdep_free_key_range_imm(void *start, unsigned long size) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + init_data_structures_once(); + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + __lockdep_free_key_range(pf, start, size); + __free_zapped_classes(pf); + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +void lockdep_free_key_range(void *start, unsigned long size) +{ + init_data_structures_once(); + + if (inside_selftest()) + lockdep_free_key_range_imm(start, size); + else + lockdep_free_key_range_reg(start, size); } /* @@ -4226,14 +4906,12 @@ static bool lock_class_cache_is_registered(struct lockdep_map *lock) return false; } -void lockdep_reset_lock(struct lockdep_map *lock) +/* The caller must hold the graph lock. Does not sleep. */ +static void __lockdep_reset_lock(struct pending_free *pf, + struct lockdep_map *lock) { struct lock_class *class; - unsigned long flags; - int j, locked; - - raw_local_irq_save(flags); - locked = graph_lock(); + int j; /* * Remove all classes this lock might have: @@ -4244,27 +4922,104 @@ void lockdep_reset_lock(struct lockdep_map *lock) */ class = look_up_lock_class(lock, j); if (class) - zap_class(class); + zap_class(pf, class); } /* * Debug check: in the end all mapped classes should * be gone. */ - if (unlikely(lock_class_cache_is_registered(lock))) { - if (debug_locks_off_graph_unlock()) { - /* - * We all just reset everything, how did it match? - */ - WARN_ON(1); + if (WARN_ON_ONCE(lock_class_cache_is_registered(lock))) + debug_locks_off(); +} + +/* + * Remove all information lockdep has about a lock if debug_locks == 1. Free + * released data structures from RCU context. 
+ */ +static void lockdep_reset_lock_reg(struct lockdep_map *lock) +{ + struct pending_free *pf; + unsigned long flags; + int locked; + + raw_local_irq_save(flags); + locked = graph_lock(); + if (!locked) + goto out_irq; + + pf = get_pending_free(); + __lockdep_reset_lock(pf, lock); + call_rcu_zapped(pf); + + graph_unlock(); +out_irq: + raw_local_irq_restore(flags); +} + +/* + * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the + * lockdep selftests. + */ +static void lockdep_reset_lock_imm(struct lockdep_map *lock) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + __lockdep_reset_lock(pf, lock); + __free_zapped_classes(pf); + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ + init_data_structures_once(); + + if (inside_selftest()) + lockdep_reset_lock_imm(lock); + else + lockdep_reset_lock_reg(lock); +} + +/* Unregister a dynamically allocated key. */ +void lockdep_unregister_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head = keyhashentry(key); + struct lock_class_key *k; + struct pending_free *pf; + unsigned long flags; + bool found = false; + + might_sleep(); + + if (WARN_ON_ONCE(static_obj(key))) + return; + + raw_local_irq_save(flags); + if (!graph_lock()) + goto out_irq; + + pf = get_pending_free(); + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + hlist_del_rcu(&k->hash_entry); + found = true; + break; } - goto out_restore; } - if (locked) - graph_unlock(); - -out_restore: + WARN_ON_ONCE(!found); + __lockdep_free_key_range(pf, key, 1); + call_rcu_zapped(pf); + graph_unlock(); +out_irq: raw_local_irq_restore(flags); + + /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ + synchronize_rcu(); } +EXPORT_SYMBOL_GPL(lockdep_unregister_key); void __init lockdep_init(void) { @@ -4278,20 +5033,24 @@ void __init lockdep_init(void) printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); printk("... 
CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); - printk(" memory used by lock dependency info: %lu kB\n", - (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + - sizeof(struct list_head) * CLASSHASH_SIZE + - sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + - sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE + printk(" memory used by lock dependency info: %zu kB\n", + (sizeof(lock_classes) + + sizeof(classhash_table) + + sizeof(list_entries) + + sizeof(list_entries_in_use) + + sizeof(chainhash_table) + + sizeof(delayed_free) #ifdef CONFIG_PROVE_LOCKING - + sizeof(struct circular_queue) + + sizeof(lock_cq) + + sizeof(lock_chains) + + sizeof(lock_chains_in_use) + + sizeof(chain_hlocks) #endif ) / 1024 ); - printk(" per task-struct memory footprint: %lu bytes\n", - sizeof(struct held_lock) * MAX_LOCK_DEPTH); + printk(" per task-struct memory footprint: %zu bytes\n", + sizeof(((struct task_struct *)NULL)->held_locks)); } static void diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 88c847a41c8a..150ec3f0c5b5 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -22,6 +22,10 @@ enum lock_usage_bit { LOCK_USAGE_STATES }; +#define LOCK_USAGE_READ_MASK 1 +#define LOCK_USAGE_DIR_MASK 2 +#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) + /* * Usage-state bitmasks: */ @@ -38,13 +42,35 @@ enum { __LOCKF(USED) }; -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | +static const unsigned long LOCKF_ENABLED_IRQ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | +static const unsigned long LOCKF_USED_IN_IRQ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | +static const unsigned long LOCKF_ENABLED_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | +static const unsigned long LOCKF_USED_IN_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) +#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) -#define LOCKF_ENABLED_IRQ_READ \ - (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ - (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ) +#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ) /* * CONFIG_LOCKDEP_SMALL is defined for sparc. 
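
To make the LOCKDEP_STATE construction above concrete: assuming lockdep_states.h carries its usual two entries, LOCKDEP_STATE(HARDIRQ) and LOCKDEP_STATE(SOFTIRQ), the preprocessor turns

#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
static const unsigned long LOCKF_ENABLED_IRQ =
#include "lockdep_states.h"
	0;
#undef LOCKDEP_STATE

into

static const unsigned long LOCKF_ENABLED_IRQ =
	LOCKF_ENABLED_HARDIRQ |
	LOCKF_ENABLED_SOFTIRQ |
	0;

so the masks track lockdep_states.h automatically instead of being maintained by hand; the trailing 0 absorbs the final '|' emitted by the macro.
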
Sparc requires .text, @@ -96,7 +122,8 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; extern unsigned long nr_list_entries; -extern unsigned long nr_lock_chains; +long lockdep_next_lockchain(long i); +unsigned long lock_chain_count(void); extern int nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 3d31f9b0059e..9c49ec645d8b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -104,18 +104,18 @@ static const struct seq_operations lockdep_ops = { #ifdef CONFIG_PROVE_LOCKING static void *lc_start(struct seq_file *m, loff_t *pos) { + if (*pos < 0) + return NULL; + if (*pos == 0) return SEQ_START_TOKEN; - if (*pos - 1 < nr_lock_chains) - return lock_chains + (*pos - 1); - - return NULL; + return lock_chains + (*pos - 1); } static void *lc_next(struct seq_file *m, void *v, loff_t *pos) { - (*pos)++; + *pos = lockdep_next_lockchain(*pos - 1) + 1; return lc_start(m, pos); } @@ -268,7 +268,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", - nr_lock_chains, MAX_LOCKDEP_CHAINS); + lock_chain_count(), MAX_LOCKDEP_CHAINS); seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); #endif diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 7d0b0ed74404..80a463d31a8d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Module-based torture test facility for locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ @@ -45,7 +32,7 @@ #include <linux/torture.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); @@ -842,7 +829,9 @@ static void lock_torture_cleanup(void) "End of test: SUCCESS"); kfree(cxt.lwsa); + cxt.lwsa = NULL; kfree(cxt.lrsa); + cxt.lrsa = NULL; end: torture_cleanup_end(); @@ -970,7 +959,7 @@ static int __init lock_torture_init(void) /* Prepare torture context. 
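
The reworked lc_start()/lc_next() above rely on lockdep_next_lockchain() to skip lock_chains[] slots that are not in use. Its body is not part of this hunk; a plausible sketch, assuming the lock_chains_in_use bitmap added elsewhere in this series:

long lockdep_next_lockchain(long i)
{
	i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1);
	/* -2 so lc_next() stores -1 and lc_start() then sees *pos < 0 */
	return i < ARRAY_SIZE(lock_chains) ? i : -2;
}

unsigned long lock_chain_count(void)
{
	return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains));
}

This is also why lc_start() gained the '*pos < 0' test: it is the iterator's end-of-sequence marker now that chain slots can be freed and reused.
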
*/ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, - onoff_interval * HZ); + onoff_interval * HZ, NULL); if (firsterr) goto unwind; } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..f17dad99eec8 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,6 +7,8 @@ #include <linux/sched.h> #include <linux/errno.h> +#include "rwsem.h" + int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 8a8c3c208c5e..e14b32c69639 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -124,9 +124,6 @@ static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(idx > 3); -#endif tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ @@ -398,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * 0,1,0 -> 0,0,1 */ clear_pending_set_locked(lock); - qstat_inc(qstat_lock_pending, true); + lockevent_inc(lock_pending); return; /* @@ -406,18 +403,34 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * queuing. */ queue: - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); pv_queue: node = this_cpu_ptr(&qnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); + /* + * 4 nodes are allocated based on the assumption that there will + * not be nested NMIs taking spinlocks. That may not be true in + * some architectures even though the chance of needing more than + * 4 nodes will still be extremely unlikely. When that happens, + * we fall back to spinning on the lock directly without using + * any MCS node. This is not the most elegant solution, but is + * simple enough. 
+ */ + if (unlikely(idx >= MAX_NODES)) { + lockevent_inc(lock_no_node); + while (!queued_spin_trylock(lock)) + cpu_relax(); + goto release; + } + node = grab_mcs_node(node, idx); /* * Keep counts of non-zero index values: */ - qstat_inc(qstat_lock_idx1 + idx - 1, idx); + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); /* * Ensure that we increment the head node->count before initialising diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8f36c27c1794..89bab079e7a4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) if (!(val & _Q_LOCKED_PENDING_MASK) && (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { - qstat_inc(qstat_pv_lock_stealing, true); + lockevent_inc(pv_lock_stealing); return true; } if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) @@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); - qstat_hop(hopcnt); + lockevent_pv_hop(hopcnt); return &he->lock; } } @@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) smp_store_mb(pn->state, vcpu_halted); if (!READ_ONCE(node->locked)) { - qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_early, wait_early); + lockevent_inc(pv_wait_node); + lockevent_cond_inc(pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. */ - qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); + lockevent_cond_inc(pv_spurious_wakeup, + !READ_ONCE(node->locked)); } /* @@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); for (;; waitcnt++) { /* @@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) } } WRITE_ONCE(pn->state, vcpu_hashed); - qstat_inc(qstat_pv_wait_head, true); - qstat_inc(qstat_pv_wait_again, waitcnt); + lockevent_inc(pv_wait_head); + lockevent_cond_inc(pv_wait_again, waitcnt); pv_wait(&lock->locked, _Q_SLOW_VAL); /* @@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * vCPU is harmless other than the additional latency in completing * the unlock. */ - qstat_inc(qstat_pv_kick_unlock, true); + lockevent_inc(pv_kick_unlock); pv_kick(node->cpu); } diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 42d3d8dc8f49..54152670ff24 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -9,253 +9,105 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* - * Authors: Waiman Long <[email protected]> + * Authors: Waiman Long <[email protected]> */ -/* - * When queued spinlock statistical counters are enabled, the following - * debugfs files will be created for reporting the counter values: - * - * <debugfs>/qlockstat/ - * pv_hash_hops - average # of hops per hashing operation - * pv_kick_unlock - # of vCPU kicks issued at unlock time - * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake - * pv_latency_kick - average latency (ns) of vCPU kick operation - * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs - * pv_wait_again - # of wait's after a queue head vCPU kick - * pv_wait_early - # of early vCPU wait's - * pv_wait_head - # of vCPU wait's at the queue head - * pv_wait_node - # of vCPU wait's at a non-head queue node - * lock_pending - # of locking operations via pending code - * lock_slowpath - # of locking operations via MCS lock queue - * - * Writing to the "reset_counters" file will reset all the above counter - * values. - * - * These statistical counters are implemented as per-cpu variables which are - * summed and computed whenever the corresponding debugfs files are read. This - * minimizes added overhead making the counters usable even in a production - * environment. - * - * There may be slight difference between pv_kick_wake and pv_kick_unlock. - */ -enum qlock_stats { - qstat_pv_hash_hops, - qstat_pv_kick_unlock, - qstat_pv_kick_wake, - qstat_pv_latency_kick, - qstat_pv_latency_wake, - qstat_pv_lock_stealing, - qstat_pv_spurious_wakeup, - qstat_pv_wait_again, - qstat_pv_wait_early, - qstat_pv_wait_head, - qstat_pv_wait_node, - qstat_lock_pending, - qstat_lock_slowpath, - qstat_lock_idx1, - qstat_lock_idx2, - qstat_lock_idx3, - qstat_num, /* Total number of statistical counters */ - qstat_reset_cnts = qstat_num, -}; +#include "lock_events.h" -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS +#ifdef CONFIG_PARAVIRT_SPINLOCKS /* - * Collect pvqspinlock statistics + * Collect pvqspinlock locking event counts */ -#include <linux/debugfs.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/fs.h> -static const char * const qstat_names[qstat_num + 1] = { - [qstat_pv_hash_hops] = "pv_hash_hops", - [qstat_pv_kick_unlock] = "pv_kick_unlock", - [qstat_pv_kick_wake] = "pv_kick_wake", - [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", - [qstat_pv_latency_kick] = "pv_latency_kick", - [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_stealing] = "pv_lock_stealing", - [qstat_pv_wait_again] = "pv_wait_again", - [qstat_pv_wait_early] = "pv_wait_early", - [qstat_pv_wait_head] = "pv_wait_head", - [qstat_pv_wait_node] = "pv_wait_node", - [qstat_lock_pending] = "lock_pending", - [qstat_lock_slowpath] = "lock_slowpath", - [qstat_lock_idx1] = "lock_index1", - [qstat_lock_idx2] = "lock_index2", - [qstat_lock_idx3] = "lock_index3", - [qstat_reset_cnts] = "reset_counters", -}; +#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] /* - * Per-cpu counters + * PV specific per-cpu counter */ -static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); static DEFINE_PER_CPU(u64, pv_kick_time); /* - * Function to read and return the qlock statistical counter values + * Function to read and return the PV qspinlock counts. * * The following counters are handled specially: - * 1. qstat_pv_latency_kick + * 1. 
pv_latency_kick * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock - * 2. qstat_pv_latency_wake + * 2. pv_latency_wake * Average wake latency (ns) = pv_latency_wake/pv_kick_wake - * 3. qstat_pv_hash_hops + * 3. pv_hash_hops * Average hops/hash = pv_hash_hops/pv_kick_unlock */ -static ssize_t qstat_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[64]; - int cpu, counter, len; - u64 stat = 0, kicks = 0; + int cpu, id, len; + u64 sum = 0, kicks = 0; /* * Get the counter ID stored in file->f_inode->i_private */ - counter = (long)file_inode(file)->i_private; + id = (long)file_inode(file)->i_private; - if (counter >= qstat_num) + if (id >= lockevent_num) return -EBADF; for_each_possible_cpu(cpu) { - stat += per_cpu(qstats[counter], cpu); + sum += per_cpu(lockevents[id], cpu); /* - * Need to sum additional counter for some of them + * Need to sum additional counters for some of them */ - switch (counter) { + switch (id) { - case qstat_pv_latency_kick: - case qstat_pv_hash_hops: - kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + case LOCKEVENT_pv_latency_kick: + case LOCKEVENT_pv_hash_hops: + kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu); break; - case qstat_pv_latency_wake: - kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + case LOCKEVENT_pv_latency_wake: + kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu); break; } } - if (counter == qstat_pv_hash_hops) { + if (id == LOCKEVENT_pv_hash_hops) { u64 frac = 0; if (kicks) { - frac = 100ULL * do_div(stat, kicks); + frac = 100ULL * do_div(sum, kicks); frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); } /* * Return a X.XX decimal number */ - len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", + sum, frac); } else { /* * Round to the nearest ns */ - if ((counter == qstat_pv_latency_kick) || - (counter == qstat_pv_latency_wake)) { + if ((id == LOCKEVENT_pv_latency_kick) || + (id == LOCKEVENT_pv_latency_wake)) { if (kicks) - stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + sum = DIV_ROUND_CLOSEST_ULL(sum, kicks); } - len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); } return simple_read_from_buffer(user_buf, count, ppos, buf, len); } /* - * Function to handle write request - * - * When counter = reset_cnts, reset all the counter values. - * Since the counter updates aren't atomic, the resetting is done twice - * to make sure that the counters are very likely to be all cleared. 
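
To make the fixed-point arithmetic in lockevent_read() above concrete: suppose the summed pv_hash_hops count is 1053 after 500 unlock kicks. do_div(sum, kicks) leaves sum = 2 and returns the remainder 53; frac = DIV_ROUND_CLOSEST_ULL(100 * 53, 500) = 11; the debugfs file therefore reports "2.11" hops per hashing operation (the true average being 2.106).
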
- */ -static ssize_t qstat_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - int cpu; - - /* - * Get the counter ID stored in file->f_inode->i_private - */ - if ((long)file_inode(file)->i_private != qstat_reset_cnts) - return count; - - for_each_possible_cpu(cpu) { - int i; - unsigned long *ptr = per_cpu_ptr(qstats, cpu); - - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); - } - return count; -} - -/* - * Debugfs data structures - */ -static const struct file_operations fops_qstat = { - .read = qstat_read, - .write = qstat_write, - .llseek = default_llseek, -}; - -/* - * Initialize debugfs for the qspinlock statistical counters - */ -static int __init init_qspinlock_stat(void) -{ - struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); - int i; - - if (!d_qstat) - goto out; - - /* - * Create the debugfs files - * - * As reading from and writing to the stat files can be slow, only - * root is allowed to do the read/write to limit impact to system - * performance. - */ - for (i = 0; i < qstat_num; i++) - if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat)) - goto fail_undo; - - if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat)) - goto fail_undo; - - return 0; -fail_undo: - debugfs_remove_recursive(d_qstat); -out: - pr_warn("Could not create 'qlockstat' debugfs entries\n"); - return -ENOMEM; -} -fs_initcall(init_qspinlock_stat); - -/* - * Increment the PV qspinlock statistical counters - */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) -{ - if (cond) - this_cpu_inc(qstats[stat]); -} - -/* * PV hash hop count */ -static inline void qstat_hop(int hopcnt) +static inline void lockevent_pv_hop(int hopcnt) { - this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); + this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt); } /* @@ -267,7 +119,7 @@ static inline void __pv_kick(int cpu) per_cpu(pv_kick_time, cpu) = start; pv_kick(cpu); - this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); + this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start); } /* @@ -280,18 +132,19 @@ static inline void __pv_wait(u8 *ptr, u8 val) *pkick_time = 0; pv_wait(ptr, val); if (*pkick_time) { - this_cpu_add(qstats[qstat_pv_latency_wake], + this_cpu_add(EVENT_COUNT(pv_latency_wake), sched_clock() - *pkick_time); - qstat_inc(qstat_pv_kick_wake, true); + lockevent_inc(pv_kick_wake); } } #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_LOCK_EVENT_COUNTS */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -static inline void qstat_hop(int hopcnt) { } +static inline void lockevent_pv_hop(int hopcnt) { } -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 581edcc63c26..978d63a8261c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1726,12 +1726,33 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, rt_mutex_set_owner(lock, NULL); } +/** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. 
+ * + * NOTE: does _NOT_ remove the @waiter on failure; must either call + * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { int ret; + lockdep_assert_held(&lock->wait_lock); + if (try_to_take_rt_mutex(lock, task, NULL)) return 1; @@ -1749,9 +1770,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, ret = 0; } - if (unlikely(ret)) - remove_waiter(lock, waiter); - debug_rt_mutex_print_deadlock(waiter); return ret; @@ -1763,12 +1781,18 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, * @waiter: the pre-initialized rt_mutex_waiter * @task: the task to prepare * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter + * on failure. + * * Returns: * 0 - task blocked on lock * 1 - acquired the lock for task, caller should wake it up * <0 - error * - * Special API call for FUTEX_REQUEUE_PI support. + * Special API call for PI-futex support. */ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -1778,6 +1802,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock_irq(&lock->wait_lock); ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + if (unlikely(ret)) + remove_waiter(lock, waiter); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -1845,7 +1871,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, * @lock: the rt_mutex we were woken on * @waiter: the pre-initialized rt_mutex_waiter * - * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). + * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or + * rt_mutex_wait_proxy_lock(). * * Unless we acquired the lock; we're still enqueued on the wait-list and can * in fact still be granted ownership until we're removed. Therefore we can diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index a7ffb2a96ede..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,339 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells ([email protected]). 
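
Putting the rt_mutex proxy-lock changes above together: waiter removal on failure has moved out of __rt_mutex_start_proxy_lock() and into its callers. A sketch of the resulting calling convention for a futex-style user -- proxy_acquire() is hypothetical and the error handling is condensed relative to kernel/futex.c:

static int proxy_acquire(struct rt_mutex *lock,
			 struct rt_mutex_waiter *waiter,
			 struct task_struct *task,
			 struct hrtimer_sleeper *to)
{
	int ret;

	raw_spin_lock_irq(&lock->wait_lock);
	ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
	raw_spin_unlock_irq(&lock->wait_lock);

	if (ret == 1)
		return 0;	/* acquired immediately on @task's behalf */

	/*
	 * Blocked (ret == 0) or deadlock (ret < 0): either way @waiter
	 * is still enqueued, so one of the two calls below must run.
	 */
	if (!ret)
		ret = rt_mutex_wait_proxy_lock(lock, to, waiter);
	if (ret && !rt_mutex_cleanup_proxy_lock(lock, waiter))
		ret = 0;	/* raced in and took the lock after all */
	return ret;
}
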
- * - Derived partially from idea by Andrea Arcangeli <[email protected]> - * - Derived also from comments by Linus - */ -#include <linux/rwsem.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/export.h> - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->count != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). 
- */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->count += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -int __sched __down_read_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(current); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - out: - return 0; - -out_nolock: - /* - * We didn't take the lock, so that there is a writer, which - * is owner or the first waiter of the sem. If it's a waiter, - * it will be woken by current owner. Not need to wake anybody. - */ - list_del(&waiter.list); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - return -EINTR; -} - -void __sched __down_read(struct rw_semaphore *sem) -{ - __down_read_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_read_killable(struct rw_semaphore *sem) -{ - return __down_read_common(sem, TASK_KILLABLE); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -int __sched __down_write_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. 
- */ - if (sem->count == 0) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->count = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; - -out_nolock: - list_del(&waiter.list); - if (!list_empty(&sem->wait_list) && sem->count >= 0) - __rwsem_do_wake(sem, 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return -EINTR; -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_write_killable(struct rw_semaphore *sem) -{ - return __down_write_common(sem, TASK_KILLABLE); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count == 0) { - /* got the lock */ - sem->count = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->count == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09b180063ee1..6b3ee9948bf1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * will notice the queued writer. */ wake_q_add(wake_q, waiter->task); + lockevent_inc(rwsem_wake_writer); } return; @@ -176,9 +177,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, goto try_reader_grant; } /* - * It is not really necessary to set it to reader-owned here, - * but it gives the spinners an early indication that the - * readers now have the lock. + * Set it to reader-owned to give spinners an early + * indication that readers now have the lock. */ __rwsem_set_reader_owned(sem, waiter->task); } @@ -198,18 +198,24 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, woken++; tsk = waiter->task; - wake_q_add(wake_q, tsk); + get_task_struct(tsk); list_del(&waiter->list); /* - * Ensure that the last operation is setting the reader + * Ensure calling get_task_struct() before setting the reader * waiter to nil such that rwsem_down_read_failed() cannot * race with do_exit() by always holding a reference count * to the task to wakeup. 
*/ smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add_safe(wake_q, tsk); } adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); if (list_empty(&sem->wait_list)) { /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; @@ -220,92 +226,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, } /* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - DEFINE_WAKE_Q(wake_q); - - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { - /* - * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_ACTIVE_READ_BIAS has already - * been set in the count. - */ - if (atomic_long_read(&sem->count) >= 0) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } - adjustment += RWSEM_WAITING_BIAS; - } - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - /* wait to be given the lock */ - while (true) { - set_current_state(state); - if (!waiter.task) - break; - if (signal_pending_state(state, current)) { - raw_spin_lock_irq(&sem->wait_lock); - if (waiter.task) - goto out_nolock; - raw_spin_unlock_irq(&sem->wait_lock); - break; - } - schedule(); - } - - __set_current_state(TASK_RUNNING); - return sem; -out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); - raw_spin_unlock_irq(&sem->wait_lock); - __set_current_state(TASK_RUNNING); - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - -/* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. 
@@ -341,21 +261,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = atomic_long_read(&sem->count); + long count = atomic_long_read(&sem->count); - while (true) { - if (!(count == 0 || count == RWSEM_WAITING_BIAS)) - return false; - - old = atomic_long_cmpxchg_acquire(&sem->count, count, - count + RWSEM_ACTIVE_WRITE_BIAS); - if (old == count) { + while (!count || count == RWSEM_WAITING_BIAS) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, + count + RWSEM_ACTIVE_WRITE_BIAS)) { rwsem_set_owner(sem); + lockevent_inc(rwsem_opt_wlock); return true; } - - count = old; } + return false; } static inline bool owner_on_cpu(struct task_struct *owner) @@ -476,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) osq_unlock(&sem->osq); done: preempt_enable(); + lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } @@ -500,6 +417,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) #endif /* + * Wait for the read lock to be granted + */ +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +{ + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_READ; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_ACTIVE_READ_BIAS has already + * been set in the count. + */ + if (atomic_long_read(&sem->count) >= 0) { + raw_spin_unlock_irq(&sem->wait_lock); + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_fast); + return sem; + } + adjustment += RWSEM_WAITING_BIAS; + } + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = atomic_long_add_return(adjustment, &sem->count); + + /* + * If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! 
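
The rwsem_try_write_lock_unqueued() rewrite above is the standard cmpxchg-to-try_cmpxchg conversion: atomic_long_try_cmpxchg_acquire() returns a success boolean and, on failure, writes the freshly observed value back through its 'old' pointer, so the manual reload-and-compare disappears. The shape in isolation, with lock_free() and BIAS as illustrative stand-ins:

static bool try_take(atomic_long_t *v)
{
	long count = atomic_long_read(v);

	while (lock_free(count)) {
		if (atomic_long_try_cmpxchg_acquire(v, &count, count + BIAS))
			return true;	/* count was still what we last saw */
		/* count now holds the value that defeated the cmpxchg */
	}
	return false;
}
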
+ */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + + /* wait to be given the lock */ + while (true) { + set_current_state(state); + if (!waiter.task) + break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } + schedule(); + lockevent_inc(rwsem_sleep_reader); + } + + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock); + return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + +/* * Wait until we successfully acquire the write lock */ static inline struct rw_semaphore * @@ -575,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) goto out_nolock; schedule(); + lockevent_inc(rwsem_sleep_writer); set_current_state(state); } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); @@ -583,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); return ret; @@ -596,6 +606,7 @@ out_nolock: __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); + lockevent_inc(rwsem_wlock_fail); return ERR_PTR(-EINTR); } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e586f0d03ad3..ccbf18f560ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem) return -EINTR; } - rwsem_set_reader_owned(sem); return 0; } @@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_reader_owned(sem); - } return ret; } @@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem) return -EINTR; } - rwsem_set_owner(sem); return 0; } @@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_owner(sem); - } return ret; } @@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct 
rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); - rwsem_clear_reader_owned(sem); __up_read(sem); } @@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); - rwsem_clear_owner(sem); __up_write(sem); } @@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); - rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); @@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); @@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) return -EINTR; } - rwsem_set_owner(sem); return 0; } @@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), + sem); __up_read(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index bad2bca0268b..64877f5294e3 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -23,15 +23,44 @@ * is involved. Ideally we would like to track all the readers that own * a rwsem, but the overhead is simply too big. */ +#include "lock_events.h" + #define RWSEM_READER_OWNED (1UL << 0) #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) #ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) +# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ + if (!debug_locks_silent && \ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + #c, atomic_long_read(&(sem)->count), \ + (long)((sem)->owner), (long)current, \ + list_empty(&(sem)->wait_list) ? "" : "not ")) \ + debug_locks_off(); \ + } while (0) +#else +# define DEBUG_RWSEMS_WARN_ON(c, sem) +#endif + +/* + * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. + * Adapted largely from include/asm-i386/rwsem.h + * by Paul Mackerras <[email protected]>. 
+ */ + +/* + * the semaphore definition + */ +#ifdef CONFIG_64BIT +# define RWSEM_ACTIVE_MASK 0xffffffffL #else -# define DEBUG_RWSEMS_WARN_ON(c) +# define RWSEM_ACTIVE_MASK 0x0000ffffL #endif +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) +#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { } #endif + +extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); + +/* + * lock for reading + */ +static inline void __down_read(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + rwsem_down_read_failed(sem); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED), sem); + } else { + rwsem_set_reader_owned(sem); + } +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + /* + * Optimize for the case when the rwsem is not locked at all. 
+ */ + long tmp = RWSEM_UNLOCKED_VALUE; + + lockevent_inc(rwsem_rtrylock); + do { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + tmp + RWSEM_ACTIVE_READ_BIAS)) { + rwsem_set_reader_owned(sem); + return 1; + } + } while (tmp >= 0); + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + rwsem_down_write_failed(sem); + rwsem_set_owner(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + rwsem_set_owner(sem); + return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + long tmp; + + lockevent_inc(rwsem_wtrylock); + tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); + if (tmp == RWSEM_UNLOCKED_VALUE) { + rwsem_set_owner(sem); + return true; + } + return false; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), + sem); + rwsem_clear_reader_owned(sem); + tmp = atomic_long_dec_return_release(&sem->count); + if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) + rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + rwsem_clear_owner(sem); + if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count) < 0)) + rwsem_wake(sem); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. 
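+ * For example (an illustrative elaboration; *p stands for data the
+ * semaphore protects): a store *p = 1 done while write-locked cannot
+ * sink below the atomic_long_add_return_release() below, so readers
+ * admitted after the downgrade always observe it. A read-side load
+ * hoisted up into the still-write-locked region is harmless, since the
+ * downgrading task owned the lock exclusively at that point.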
+ */ + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); + rwsem_set_reader_owned(sem); + if (tmp < 0) + rwsem_downgrade_wake(sem); +} diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 936f3d14dd6b..0ff08380f531 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -22,6 +22,13 @@ #include <linux/debug_locks.h> #include <linux/export.h> +#ifdef CONFIG_MMIOWB +#ifndef arch_mmiowb_state +DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state); +EXPORT_PER_CPU_SYMBOL(__mmiowb_state); +#endif +#endif + /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..399669f7eba8 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); arch_spin_lock(&lock->raw_lock); + mmiowb_spin_lock(); debug_spin_lock_after(lock); } @@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&lock->raw_lock); - if (ret) + if (ret) { + mmiowb_spin_lock(); debug_spin_lock_after(lock); + } #ifndef CONFIG_SMP /* * Must not happen on UP: @@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) void do_raw_spin_unlock(raw_spinlock_t *lock) { + mmiowb_spin_unlock(); debug_spin_unlock(lock); arch_spin_unlock(&lock->raw_lock); }
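
Finally, a worked example of the bias arithmetic the rwsem.h fast paths above build on, using the 64-bit values of the #defines (a sketch of representative encodings, not an exhaustive state list):

/*
 * RWSEM_ACTIVE_MASK       = 0x00000000ffffffff
 * RWSEM_WAITING_BIAS      = 0xffffffff00000000   (-RWSEM_ACTIVE_MASK - 1)
 * RWSEM_ACTIVE_WRITE_BIAS = 0xffffffff00000001
 *
 * sem->count              meaning
 * 0x0000000000000000      unlocked
 * 0x0000000000000003      three readers active
 * 0xffffffff00000001      one writer active (or one reader plus queued
 *                         waiters -- the owner field disambiguates)
 * 0xffffffff00000003      three readers active, waiter(s) queued
 * 0xffffffff00000000      waiter(s) queued, lock momentarily free --
 *                         the count == RWSEM_WAITING_BIAS wakeup case
 */

So an __up_write() or final __up_read() that leaves waiters queued produces a negative count whose active bits are clear -- 0xffffffff00000000 in the simplest case -- which is exactly what the 'tmp < 0' and 'tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0' wakeup tests above detect.
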