diff options
author | Rodrigo Vivi <[email protected]> | 2018-07-23 09:13:12 -0700 |
---|---|---|
committer | Rodrigo Vivi <[email protected]> | 2018-07-23 09:13:12 -0700 |
commit | c74a7469f97c0f40b46e82ee979f9fb1bb6e847c (patch) | |
tree | f2690a1a916b73ef94657fbf0e0141ae57701825 /kernel/locking | |
parent | 6f15a7de86c8cf2dc09fc9e6d07047efa40ef809 (diff) | |
parent | 500775074f88d9cf5416bed2ca19592812d62c41 (diff) |
Merge drm/drm-next into drm-intel-next-queued
We need a backmerge to get DP_DPCD_REV_14 before we push other
i915 changes to dinq that could break compilation.
Signed-off-by: Rodrigo Vivi <[email protected]>
Diffstat (limited to 'kernel/locking')
-rw-r--r-- | kernel/locking/lockdep.c | 82 | ||||
-rw-r--r-- | kernel/locking/lockdep_proc.c | 45 | ||||
-rw-r--r-- | kernel/locking/locktorture.c | 16 | ||||
-rw-r--r-- | kernel/locking/mcs_spinlock.h | 10 | ||||
-rw-r--r-- | kernel/locking/mutex.c | 348 | ||||
-rw-r--r-- | kernel/locking/qspinlock.c | 247 | ||||
-rw-r--r-- | kernel/locking/qspinlock_paravirt.h | 49 | ||||
-rw-r--r-- | kernel/locking/qspinlock_stat.h | 9 | ||||
-rw-r--r-- | kernel/locking/rwsem-xadd.c | 40 | ||||
-rw-r--r-- | kernel/locking/rwsem.c | 3 | ||||
-rw-r--r-- | kernel/locking/rwsem.h | 30 | ||||
-rw-r--r-- | kernel/locking/test-ww_mutex.c | 2 |
12 files changed, 489 insertions, 392 deletions
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 023386338269..5fa4d3138bf1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -561,20 +561,24 @@ static void print_lock(struct held_lock *hlock) printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } -static void lockdep_print_held_locks(struct task_struct *curr) +static void lockdep_print_held_locks(struct task_struct *p) { - int i, depth = curr->lockdep_depth; + int i, depth = READ_ONCE(p->lockdep_depth); - if (!depth) { - printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); + if (!depth) + printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); + else + printk("%d lock%s held by %s/%d:\n", depth, + depth > 1 ? "s" : "", p->comm, task_pid_nr(p)); + /* + * It's not reliable to print a task's held locks if it's not sleeping + * and it's not the current task. + */ + if (p->state == TASK_RUNNING && p != current) return; - } - printk("%d lock%s held by %s/%d:\n", - depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); - for (i = 0; i < depth; i++) { printk(" #%d: ", i); - print_lock(curr->held_locks + i); + print_lock(p->held_locks + i); } } @@ -1261,11 +1265,11 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.parent = NULL; this.class = class; - local_irq_save(flags); + raw_local_irq_save(flags); arch_spin_lock(&lockdep_lock); ret = __lockdep_count_forward_deps(&this); arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret; } @@ -1288,11 +1292,11 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.parent = NULL; this.class = class; - local_irq_save(flags); + raw_local_irq_save(flags); arch_spin_lock(&lockdep_lock); ret = __lockdep_count_backward_deps(&this); arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret; } @@ -4407,7 +4411,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) if (unlikely(!debug_locks)) return; - local_irq_save(flags); + raw_local_irq_save(flags); for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; @@ -4418,7 +4422,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); break; } - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); @@ -4451,8 +4455,6 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { struct task_struct *g, *p; - int count = 10; - int unlock = 1; if (unlikely(!debug_locks)) { pr_warn("INFO: lockdep is turned off.\n"); @@ -4460,50 +4462,18 @@ void debug_show_all_locks(void) } pr_warn("\nShowing all locks held in the system:\n"); - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - pr_warn("hm, tasklist_lock locked, retrying... "); - if (count) { - count--; - pr_cont(" #%d", 10-count); - mdelay(200); - goto retry; - } - pr_cont(" ignoring it.\n"); - unlock = 0; - } else { - if (count != 10) - pr_cont(" locked it.\n"); - } - - do_each_thread(g, p) { - /* - * It's not reliable to print a task's held locks - * if it's not sleeping (or if it's not the current - * task): - */ - if (p->state == TASK_RUNNING && p != current) + rcu_read_lock(); + for_each_process_thread(g, p) { + if (!p->lockdep_depth) continue; - if (p->lockdep_depth) - lockdep_print_held_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; + lockdep_print_held_locks(p); touch_nmi_watchdog(); - } while_each_thread(g, p); + touch_all_softlockup_watchdogs(); + } + rcu_read_unlock(); pr_warn("\n"); pr_warn("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); } EXPORT_SYMBOL_GPL(debug_show_all_locks); #endif diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index ad69bbc9bd28..3dd980dfba2d 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -101,18 +101,6 @@ static const struct seq_operations lockdep_ops = { .show = l_show, }; -static int lockdep_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_ops); -} - -static const struct file_operations proc_lockdep_operations = { - .open = lockdep_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_PROVE_LOCKING static void *lc_start(struct seq_file *m, loff_t *pos) { @@ -170,18 +158,6 @@ static const struct seq_operations lockdep_chains_ops = { .stop = lc_stop, .show = lc_show, }; - -static int lockdep_chains_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_chains_ops); -} - -static const struct file_operations proc_lockdep_chains_operations = { - .open = lockdep_chains_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; #endif /* CONFIG_PROVE_LOCKING */ static void lockdep_stats_debug_show(struct seq_file *m) @@ -355,18 +331,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) return 0; } -static int lockdep_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, lockdep_stats_show, NULL); -} - -static const struct file_operations proc_lockdep_stats_operations = { - .open = lockdep_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #ifdef CONFIG_LOCK_STAT struct lock_stat_data { @@ -682,14 +646,11 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { - proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops); #ifdef CONFIG_PROVE_LOCKING - proc_create("lockdep_chains", S_IRUSR, NULL, - &proc_lockdep_chains_operations); + proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops); #endif - proc_create("lockdep_stats", S_IRUSR, NULL, - &proc_lockdep_stats_operations); - + proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show); #ifdef CONFIG_LOCK_STAT proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &proc_lock_stat_operations); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 6850ffd69125..c28224347d69 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -365,7 +365,7 @@ static struct lock_torture_ops mutex_lock_ops = { }; #include <linux/ww_mutex.h> -static DEFINE_WW_CLASS(torture_ww_class); +static DEFINE_WD_CLASS(torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); @@ -913,7 +913,9 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { lock_is_write_held = 0; - cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, + sizeof(*cxt.lwsa), + GFP_KERNEL); if (cxt.lwsa == NULL) { VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); firsterr = -ENOMEM; @@ -942,7 +944,9 @@ static int __init lock_torture_init(void) if (nreaders_stress) { lock_is_read_held = 0; - cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, + sizeof(*cxt.lrsa), + GFP_KERNEL); if (cxt.lrsa == NULL) { VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; @@ -985,7 +989,8 @@ static int __init lock_torture_init(void) } if (nwriters_stress) { - writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), + writer_tasks = kcalloc(cxt.nrealwriters_stress, + sizeof(writer_tasks[0]), GFP_KERNEL); if (writer_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); @@ -995,7 +1000,8 @@ static int __init lock_torture_init(void) } if (cxt.cur_ops->readlock) { - reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]), + reader_tasks = kcalloc(cxt.nrealreaders_stress, + sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index f046b7ce9dd6..5e10153b4d3c 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -23,13 +23,15 @@ struct mcs_spinlock { #ifndef arch_mcs_spin_lock_contended /* - * Using smp_load_acquire() provides a memory barrier that ensures - * subsequent operations happen after the lock is acquired. + * Using smp_cond_load_acquire() provides the acquire semantics + * required so that subsequent operations happen after the + * lock is acquired. Additionally, some architectures such as + * ARM64 would like to do spin-waiting instead of purely + * spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ do { \ - while (!(smp_load_acquire(l))) \ - cpu_relax(); \ + smp_cond_load_acquire(l, VAL); \ } while (0) #endif diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2048359f33d2..1a81a1257b3f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struct mutex *lock) static __always_inline bool __mutex_trylock_fast(struct mutex *lock) { unsigned long curr = (unsigned long)current; + unsigned long zero = 0UL; - if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; return false; @@ -173,6 +174,21 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait } /* + * Add @waiter to a given location in the lock wait_list and set the + * FLAG_WAITERS flag if it's the first waiter. + */ +static void __sched +__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct list_head *list) +{ + debug_mutex_add_waiter(lock, waiter, current); + + list_add_tail(&waiter->list, list); + if (__mutex_waiter_is_first(lock, waiter)) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); +} + +/* * Give up ownership to a specific task, when @task = NULL, this is equivalent * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves * WAITERS. Provides RELEASE semantics like a regular unlock, the @@ -243,6 +259,22 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +/* + * Wait-Die: + * The newer transactions are killed when: + * It (the new transaction) makes a request for a lock being held + * by an older transaction. + * + * Wound-Wait: + * The newer transactions are wounded when: + * An older transaction makes a request for a lock being held by + * the newer transaction. + */ + +/* + * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired + * it. + */ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) { @@ -281,26 +313,108 @@ ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); #endif ww_ctx->acquired++; + ww->ctx = ww_ctx; } +/* + * Determine if context @a is 'after' context @b. IOW, @a is a younger + * transaction than @b and depending on algorithm either needs to wait for + * @b or die. + */ static inline bool __sched __ww_ctx_stamp_after(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) { - return a->stamp - b->stamp <= LONG_MAX && - (a->stamp != b->stamp || a > b); + + return (signed long)(a->stamp - b->stamp) > 0; +} + +/* + * Wait-Die; wake a younger waiter context (when locks held) such that it can + * die. + * + * Among waiters with context, only the first one can have other locks acquired + * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and + * __ww_mutex_check_kill() wake any but the earliest context. + */ +static bool __sched +__ww_mutex_die(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) +{ + if (!ww_ctx->is_wait_die) + return false; + + if (waiter->ww_ctx->acquired > 0 && + __ww_ctx_stamp_after(waiter->ww_ctx, ww_ctx)) { + debug_mutex_wake_waiter(lock, waiter); + wake_up_process(waiter->task); + } + + return true; +} + +/* + * Wound-Wait; wound a younger @hold_ctx if it holds the lock. + * + * Wound the lock holder if there are waiters with older transactions than + * the lock holders. Even if multiple waiters may wound the lock holder, + * it's sufficient that only one does. + */ +static bool __ww_mutex_wound(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, + struct ww_acquire_ctx *hold_ctx) +{ + struct task_struct *owner = __mutex_owner(lock); + + lockdep_assert_held(&lock->wait_lock); + + /* + * Possible through __ww_mutex_add_waiter() when we race with + * ww_mutex_set_context_fastpath(). In that case we'll get here again + * through __ww_mutex_check_waiters(). + */ + if (!hold_ctx) + return false; + + /* + * Can have !owner because of __mutex_unlock_slowpath(), but if owner, + * it cannot go away because we'll have FLAG_WAITERS set and hold + * wait_lock. + */ + if (!owner) + return false; + + if (ww_ctx->acquired > 0 && __ww_ctx_stamp_after(hold_ctx, ww_ctx)) { + hold_ctx->wounded = 1; + + /* + * wake_up_process() paired with set_current_state() + * inserts sufficient barriers to make sure @owner either sees + * it's wounded in __ww_mutex_lock_check_stamp() or has a + * wakeup pending to re-read the wounded state. + */ + if (owner != current) + wake_up_process(owner); + + return true; + } + + return false; } /* - * Wake up any waiters that may have to back off when the lock is held by the - * given context. + * We just acquired @lock under @ww_ctx, if there are later contexts waiting + * behind us on the wait-list, check if they need to die, or wound us. * - * Due to the invariants on the wait list, this can only affect the first - * waiter with a context. + * See __ww_mutex_add_waiter() for the list-order construction; basically the + * list is ordered by stamp, smallest (oldest) first. + * + * This relies on never mixing wait-die/wound-wait on the same wait-list; + * which is currently ensured by that being a ww_class property. * * The current task must not be on the wait list. */ static void __sched -__ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +__ww_mutex_check_waiters(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) { struct mutex_waiter *cur; @@ -310,66 +424,51 @@ __ww_mutex_wakeup_for_backoff(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) if (!cur->ww_ctx) continue; - if (cur->ww_ctx->acquired > 0 && - __ww_ctx_stamp_after(cur->ww_ctx, ww_ctx)) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } - - break; + if (__ww_mutex_die(lock, cur, ww_ctx) || + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) + break; } } /* - * After acquiring lock with fastpath or when we lost out in contested - * slowpath, set ctx and wake up any waiters so they can recheck. + * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx + * and wake up any waiters so they can recheck. */ static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { ww_mutex_lock_acquired(lock, ctx); - lock->ctx = ctx; - /* * The lock->ctx update should be visible on all cores before - * the atomic read is done, otherwise contended waiters might be + * the WAITERS check is done, otherwise contended waiters might be * missed. The contended waiters will either see ww_ctx == NULL * and keep spinning, or it will acquire wait_lock, add itself * to waiter list and sleep. */ - smp_mb(); /* ^^^ */ + smp_mb(); /* See comments above and below. */ /* - * Check if lock is contended, if not there is nobody to wake up + * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS + * MB MB + * [R] MUTEX_FLAG_WAITERS [R] ww->ctx + * + * The memory barrier above pairs with the memory barrier in + * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx + * and/or !empty list. */ if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) return; /* - * Uh oh, we raced in fastpath, wake up everyone in this case, - * so they can see the new lock->ctx. + * Uh oh, we raced in fastpath, check if any of the waiters need to + * die or wound us. */ spin_lock(&lock->base.wait_lock); - __ww_mutex_wakeup_for_backoff(&lock->base, ctx); + __ww_mutex_check_waiters(&lock->base, ctx); spin_unlock(&lock->base.wait_lock); } -/* - * After acquiring lock in the slowpath set ctx. - * - * Unlike for the fast path, the caller ensures that waiters are woken up where - * necessary. - * - * Callers must hold the mutex wait_lock. - */ -static __always_inline void -ww_mutex_set_context_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - ww_mutex_lock_acquired(lock, ctx); - lock->ctx = ctx; -} - #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline @@ -645,37 +744,83 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) } EXPORT_SYMBOL(ww_mutex_unlock); + +static __always_inline int __sched +__ww_mutex_kill(struct mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + if (ww_ctx->acquired > 0) { +#ifdef CONFIG_DEBUG_MUTEXES + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); + ww_ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + + +/* + * Check the wound condition for the current lock acquire. + * + * Wound-Wait: If we're wounded, kill ourself. + * + * Wait-Die: If we're trying to acquire a lock already held by an older + * context, kill ourselves. + * + * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to + * look at waiters before us in the wait-list. + */ static inline int __sched -__ww_mutex_lock_check_stamp(struct mutex *lock, struct mutex_waiter *waiter, - struct ww_acquire_ctx *ctx) +__ww_mutex_check_kill(struct mutex *lock, struct mutex_waiter *waiter, + struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); struct mutex_waiter *cur; + if (ctx->acquired == 0) + return 0; + + if (!ctx->is_wait_die) { + if (ctx->wounded) + return __ww_mutex_kill(lock, ctx); + + return 0; + } + if (hold_ctx && __ww_ctx_stamp_after(ctx, hold_ctx)) - goto deadlock; + return __ww_mutex_kill(lock, ctx); /* * If there is a waiter in front of us that has a context, then its - * stamp is earlier than ours and we must back off. + * stamp is earlier than ours and we must kill ourself. */ cur = waiter; list_for_each_entry_continue_reverse(cur, &lock->wait_list, list) { - if (cur->ww_ctx) - goto deadlock; + if (!cur->ww_ctx) + continue; + + return __ww_mutex_kill(lock, ctx); } return 0; - -deadlock: -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; -#endif - return -EDEADLK; } +/* + * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest + * first. Such that older contexts are preferred to acquire the lock over + * younger contexts. + * + * Waiters without context are interspersed in FIFO order. + * + * Furthermore, for Wait-Die kill ourself immediately when possible (there are + * older contexts already waiting) to avoid unnecessary waiting and for + * Wound-Wait ensure we wound the owning context when it is younger. + */ static inline int __sched __ww_mutex_add_waiter(struct mutex_waiter *waiter, struct mutex *lock, @@ -683,16 +828,21 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, { struct mutex_waiter *cur; struct list_head *pos; + bool is_wait_die; if (!ww_ctx) { - list_add_tail(&waiter->list, &lock->wait_list); + __mutex_add_waiter(lock, waiter, &lock->wait_list); return 0; } + is_wait_die = ww_ctx->is_wait_die; + /* * Add the waiter before the first waiter with a higher stamp. * Waiters without a context are skipped to avoid starving - * them. + * them. Wait-Die waiters may die here. Wound-Wait waiters + * never die here, but they are sorted in stamp order and + * may wound the lock holder. */ pos = &lock->wait_list; list_for_each_entry_reverse(cur, &lock->wait_list, list) { @@ -700,16 +850,16 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, continue; if (__ww_ctx_stamp_after(ww_ctx, cur->ww_ctx)) { - /* Back off immediately if necessary. */ - if (ww_ctx->acquired > 0) { -#ifdef CONFIG_DEBUG_MUTEXES - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock); - ww_ctx->contending_lock = ww; -#endif - return -EDEADLK; + /* + * Wait-Die: if we find an older context waiting, there + * is no point in queueing behind it, as we'd have to + * die the moment it would acquire the lock. + */ + if (is_wait_die) { + int ret = __ww_mutex_kill(lock, ww_ctx); + + if (ret) + return ret; } break; @@ -717,17 +867,28 @@ __ww_mutex_add_waiter(struct mutex_waiter *waiter, pos = &cur->list; + /* Wait-Die: ensure younger waiters die. */ + __ww_mutex_die(lock, cur, ww_ctx); + } + + __mutex_add_waiter(lock, waiter, pos); + + /* + * Wound-Wait: if we're blocking on a mutex owned by a younger context, + * wound that such that we might proceed. + */ + if (!is_wait_die) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + /* - * Wake up the waiter so that it gets a chance to back - * off. + * See ww_mutex_set_context_fastpath(). Orders setting + * MUTEX_FLAG_WAITERS vs the ww->ctx load, + * such that either we or the fastpath will wound @ww->ctx. */ - if (cur->ww_ctx->acquired > 0) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } + smp_mb(); + __ww_mutex_wound(lock, ww_ctx, ww->ctx); } - list_add_tail(&waiter->list, pos); return 0; } @@ -750,6 +911,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (use_ww_ctx && ww_ctx) { if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; } preempt_disable(); @@ -771,7 +940,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ if (__mutex_trylock(lock)) { if (use_ww_ctx && ww_ctx) - __ww_mutex_wakeup_for_backoff(lock, ww_ctx); + __ww_mutex_check_waiters(lock, ww_ctx); goto skip_wait; } @@ -783,25 +952,26 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); + __mutex_add_waiter(lock, &waiter, &lock->wait_list); + #ifdef CONFIG_DEBUG_MUTEXES waiter.ww_ctx = MUTEX_POISON_WW_CTX; #endif } else { - /* Add in stamp order, waking up waiters that must back off. */ + /* + * Add in stamp order, waking up waiters that must kill + * themselves. + */ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); if (ret) - goto err_early_backoff; + goto err_early_kill; waiter.ww_ctx = ww_ctx; } waiter.task = current; - if (__mutex_waiter_is_first(lock, &waiter)) - __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); - set_current_state(state); for (;;) { /* @@ -814,7 +984,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto acquired; /* - * Check for signals and wound conditions while holding + * Check for signals and kill conditions while holding * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ @@ -823,8 +993,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - if (use_ww_ctx && ww_ctx && ww_ctx->acquired > 0) { - ret = __ww_mutex_lock_check_stamp(lock, &waiter, ww_ctx); + if (use_ww_ctx && ww_ctx) { + ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx); if (ret) goto err; } @@ -858,6 +1028,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, acquired: __set_current_state(TASK_RUNNING); + if (use_ww_ctx && ww_ctx) { + /* + * Wound-Wait; we stole the lock (!first_waiter), check the + * waiters as anyone might want to wound us. + */ + if (!ww_ctx->is_wait_die && + !__mutex_waiter_is_first(lock, &waiter)) + __ww_mutex_check_waiters(lock, ww_ctx); + } + mutex_remove_waiter(lock, &waiter, current); if (likely(list_empty(&lock->wait_list))) __mutex_clear_flag(lock, MUTEX_FLAGS); @@ -869,7 +1049,7 @@ skip_wait: lock_acquired(&lock->dep_map, ip); if (use_ww_ctx && ww_ctx) - ww_mutex_set_context_slowpath(ww, ww_ctx); + ww_mutex_lock_acquired(ww, ww_ctx); spin_unlock(&lock->wait_lock); preempt_enable(); @@ -878,7 +1058,7 @@ skip_wait: err: __set_current_state(TASK_RUNNING); mutex_remove_waiter(lock, &waiter, current); -err_early_backoff: +err_early_kill: spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index d880296245c5..bfaeb05123ff 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -12,11 +12,11 @@ * GNU General Public License for more details. * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. - * (C) Copyright 2013-2014 Red Hat, Inc. + * (C) Copyright 2013-2014,2018 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long <[email protected]> + * Authors: Waiman Long <[email protected]> * Peter Zijlstra <[email protected]> */ @@ -33,6 +33,11 @@ #include <asm/qspinlock.h> /* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * The basic principle of a queue-based spinlock can best be understood * by studying a classic queue-based spinlock implementation called the * MCS lock. The paper below provides a good description for this kind @@ -77,6 +82,18 @@ #endif /* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* * Per-CPU queue node structures; we can never have more than 4 nested * contexts: task, softirq, hardirq, nmi. * @@ -114,41 +131,18 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail) #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) -/* - * By using the whole 2nd least significant byte for the pending bit, we - * can allow better optimization of the lock acquisition for the pending - * bit holder. +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure * - * This internal structure is also used by the set_locked function which - * is not restricted to _Q_PENDING_BITS == 8. + * *,1,* -> *,0,* */ -struct __qspinlock { - union { - atomic_t val; -#ifdef __LITTLE_ENDIAN - struct { - u8 locked; - u8 pending; - }; - struct { - u16 locked_pending; - u16 tail; - }; -#else - struct { - u16 tail; - u16 locked_pending; - }; - struct { - u8 reserved[2]; - u8 pending; - u8 locked; - }; -#endif - }; -}; +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} -#if _Q_PENDING_BITS == 8 /** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queued spinlock structure @@ -159,9 +153,7 @@ struct __qspinlock { */ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); } /* @@ -176,19 +168,28 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) */ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { - struct __qspinlock *l = (void *)lock; - /* - * Use release semantics to make sure that the MCS node is properly - * initialized before changing the tail code. + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. */ - return (u32)xchg_release(&l->tail, + return (u32)xchg_relaxed(&lock->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; } #else /* _Q_PENDING_BITS == 8 */ /** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queued spinlock structure * @@ -216,10 +217,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) for (;;) { new = (val & _Q_LOCKED_PENDING_MASK) | tail; /* - * Use release semantics to make sure that the MCS node is - * properly initialized before changing the tail code. + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. */ - old = atomic_cmpxchg_release(&lock->val, val, new); + old = atomic_cmpxchg_relaxed(&lock->val, val, new); if (old == val) break; @@ -237,9 +239,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) */ static __always_inline void set_locked(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); } @@ -294,86 +294,83 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; - u32 new, old, tail; + u32 old, tail; int idx; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); if (pv_enabled()) - goto queue; + goto pv_queue; if (virt_spin_lock(lock)) return; /* - * wait for in-progress pending->locked hand-overs + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. * * 0,1,0 -> 0,0,1 */ if (val == _Q_PENDING_VAL) { - while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL) - cpu_relax(); + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--); } /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* * trylock || pending * * 0,0,0 -> 0,0,1 ; trylock * 0,0,1 -> 0,1,1 ; pending */ - for (;;) { + val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); + if (!(val & ~_Q_LOCKED_MASK)) { /* - * If we observe any contention; queue. + * We're pending, wait for the owner to go away. + * + * *,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. */ - if (val & ~_Q_LOCKED_MASK) - goto queue; - - new = _Q_LOCKED_VAL; - if (val == new) - new |= _Q_PENDING_VAL; + if (val & _Q_LOCKED_MASK) { + atomic_cond_read_acquire(&lock->val, + !(VAL & _Q_LOCKED_MASK)); + } /* - * Acquire semantic is required here as the function may - * return immediately if the lock was free. + * take ownership and clear the pending bit. + * + * *,1,0 -> *,0,1 */ - old = atomic_cmpxchg_acquire(&lock->val, val, new); - if (old == val) - break; - - val = old; - } - - /* - * we won the trylock - */ - if (new == _Q_LOCKED_VAL) + clear_pending_set_locked(lock); + qstat_inc(qstat_lock_pending, true); return; + } /* - * we're pending, wait for the owner to go away. - * - * *,1,1 -> *,1,0 - * - * this wait loop must be a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because not all clear_pending_set_locked() - * implementations imply full barriers. - */ - smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); - - /* - * take ownership and clear the pending bit. - * - * *,1,0 -> *,0,1 + * If pending was clear but there are waiters in the queue, then + * we need to undo our setting of pending before we queue ourselves. */ - clear_pending_set_locked(lock); - return; + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); /* * End of pending bit optimistic spinning and beginning of MCS * queuing. */ queue: + qstat_inc(qstat_lock_slowpath, true); +pv_queue: node = this_cpu_ptr(&mcs_nodes[0]); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); @@ -400,12 +397,18 @@ queue: goto release; /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. * We have already touched the queueing cacheline; don't bother with * pending stuff. * * p,*,* -> n,*,* - * - * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -417,14 +420,8 @@ queue: if (old & _Q_TAIL_MASK) { prev = decode_tail(old); - /* - * We must ensure that the stores to @node are observed before - * the write to prev->next. The address dependency from - * xchg_tail is not sufficient to ensure this because the read - * component of xchg_tail is unordered with respect to the - * initialisation of @node. - */ - smp_store_release(&prev->next, node); + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); @@ -453,8 +450,8 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_load_acquire() call. As the next PV queue head hasn't been - * designated yet, there is no way for the locked value to become + * atomic_cond_read_acquire() call. As the next PV queue head hasn't + * been designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. * @@ -464,44 +461,38 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); + val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* * claim the lock: * * n,0,0 -> 0,0,1 : lock, uncontended - * *,0,0 -> *,0,1 : lock, contended + * *,*,0 -> *,*,1 : lock, contended * - * If the queue head is the only one in the queue (lock value == tail), - * clear the tail code and grab the lock. Otherwise, we only need - * to grab the lock. + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. */ - for (;;) { - /* In the PV case we might already have _Q_LOCKED_VAL set */ - if ((val & _Q_TAIL_MASK) != tail) { - set_locked(lock); - break; - } - /* - * The smp_cond_load_acquire() call above has provided the - * necessary acquire semantics required for locking. At most - * two iterations of this loop may be ran. - */ - old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); - if (old == val) - goto release; /* No contention */ - val = old; - } + /* + * In the PV case we might already have _Q_LOCKED_VAL set. + * + * The atomic_cond_read_acquire() call above has provided the + * necessary acquire semantics required for locking. + */ + if (((val & _Q_TAIL_MASK) == tail) && + atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + + /* Either somebody is queued behind us or _Q_PENDING_VAL is set */ + set_locked(lock); /* * contended path; wait for next if not observed yet, release. */ - if (!next) { - while (!(next = READ_ONCE(node->next))) - cpu_relax(); - } + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 6ee477765e6c..5a0cf5f9008c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -56,11 +56,6 @@ struct pv_node { }; /* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - -/* * Hybrid PV queued/unfair lock * * By replacing the regular queued_spin_trylock() with the function below, @@ -87,8 +82,6 @@ struct pv_node { #define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l) static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - /* * Stay in unfair lock mode as long as queued mode waiters are * present in the MCS wait queue but the pending bit isn't set. @@ -97,7 +90,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) int val = atomic_read(&lock->val); if (!(val & _Q_LOCKED_PENDING_MASK) && - (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { qstat_inc(qstat_pv_lock_stealing, true); return true; } @@ -117,16 +110,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) #if _Q_PENDING_BITS == 8 static __always_inline void set_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 1); -} - -static __always_inline void clear_pending(struct qspinlock *lock) -{ - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 0); + WRITE_ONCE(lock->pending, 1); } /* @@ -136,10 +120,8 @@ static __always_inline void clear_pending(struct qspinlock *lock) */ static __always_inline int trylock_clear_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - return !READ_ONCE(l->locked) && - (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL, + return !READ_ONCE(lock->locked) && + (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) == _Q_PENDING_VAL); } #else /* _Q_PENDING_BITS == 8 */ @@ -148,11 +130,6 @@ static __always_inline void set_pending(struct qspinlock *lock) atomic_or(_Q_PENDING_VAL, &lock->val); } -static __always_inline void clear_pending(struct qspinlock *lock) -{ - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - static __always_inline int trylock_clear_pending(struct qspinlock *lock) { int val = atomic_read(&lock->val); @@ -384,7 +361,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; /* * If the vCPU is indeed halted, advance its state to match that of @@ -413,7 +389,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * the hash table later on at unlock time, no atomic instruction is * needed. */ - WRITE_ONCE(l->locked, _Q_SLOW_VAL); + WRITE_ONCE(lock->locked, _Q_SLOW_VAL); (void)pv_hash(lock, pn); } @@ -428,7 +404,6 @@ static u32 pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; int waitcnt = 0; int loop; @@ -443,7 +418,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_pv_lock_slowpath, true); + qstat_inc(qstat_lock_slowpath, true); for (;; waitcnt++) { /* @@ -479,13 +454,13 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { + if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) { /* * The lock was free and now we own the lock. * Change the lock value back to _Q_LOCKED_VAL * and unhash the table. */ - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); goto gotlock; } @@ -493,7 +468,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) WRITE_ONCE(pn->state, vcpu_hashed); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); - pv_wait(&l->locked, _Q_SLOW_VAL); + pv_wait(&lock->locked, _Q_SLOW_VAL); /* * Because of lock stealing, the queue head vCPU may not be @@ -518,7 +493,6 @@ gotlock: __visible void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { - struct __qspinlock *l = (void *)lock; struct pv_node *node; if (unlikely(locked != _Q_SLOW_VAL)) { @@ -547,7 +521,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * Now that we have a reference to the (likely) blocked pv_node, * release the lock. */ - smp_store_release(&l->locked, 0); + smp_store_release(&lock->locked, 0); /* * At this point the memory pointed at by lock can be freed/reused, @@ -573,7 +547,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #ifndef __pv_queued_spin_unlock __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; u8 locked; /* @@ -581,7 +554,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); + locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0); if (likely(locked == _Q_LOCKED_VAL)) return; diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 4a30ef63c607..6bd78c0740fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,13 +22,14 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs * pv_wait_again - # of wait's after a queue head vCPU kick * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node + * lock_pending - # of locking operations via pending code + * lock_slowpath - # of locking operations via MCS lock queue * * Writing to the "reset_counters" file will reset all the above counter * values. @@ -46,13 +47,14 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, - qstat_pv_lock_slowpath, qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, qstat_pv_wait_early, qstat_pv_wait_head, qstat_pv_wait_node, + qstat_lock_pending, + qstat_lock_slowpath, qstat_num, /* Total number of statistical counters */ qstat_reset_cnts = qstat_num, }; @@ -73,12 +75,13 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_slowpath] = "pv_lock_slowpath", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_early] = "pv_wait_early", [qstat_pv_wait_head] = "pv_wait_head", [qstat_pv_wait_node] = "pv_wait_node", + [qstat_lock_pending] = "lock_pending", + [qstat_lock_slowpath] = "lock_slowpath", [qstat_reset_cnts] = "reset_counters", }; diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index e795908f3607..3064c50e181e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -347,30 +347,31 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) } } +static inline bool owner_on_cpu(struct task_struct *owner) +{ + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); +} + static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; bool ret = true; + BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); + if (need_resched()) return false; rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!rwsem_owner_is_writer(owner)) { - /* - * Don't spin if the rwsem is readers owned. - */ - ret = !rwsem_owner_is_reader(owner); - goto done; + if (owner) { + ret = is_rwsem_owner_spinnable(owner) && + owner_on_cpu(owner); } - - /* - * As lock holder preemption issue, we both skip spinning if task is not - * on cpu or its cpu is preempted - */ - ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -done: rcu_read_unlock(); return ret; } @@ -382,11 +383,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner = READ_ONCE(sem->owner); - if (!rwsem_owner_is_writer(owner)) - goto out; + if (!is_rwsem_owner_spinnable(owner)) + return false; rcu_read_lock(); - while (sem->owner == owner) { + while (owner && (READ_ONCE(sem->owner) == owner)) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking sem->owner still matches owner, if that fails, @@ -399,8 +400,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) * abort spinning when need_resched or owner is not running or * owner's cpu is preempted. */ - if (!owner->on_cpu || need_resched() || - vcpu_is_preempted(task_cpu(owner))) { + if (need_resched() || !owner_on_cpu(owner)) { rcu_read_unlock(); return false; } @@ -408,12 +408,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) cpu_relax(); } rcu_read_unlock(); -out: + /* * If there is a new owner or the owner is not set, we continue * spinning. */ - return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); + return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 30465a2f2b6c..776308d2fa9e 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -181,6 +181,7 @@ void down_read_non_owner(struct rw_semaphore *sem) might_sleep(); __down_read(sem); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_non_owner); @@ -221,5 +222,3 @@ void up_read_non_owner(struct rw_semaphore *sem) EXPORT_SYMBOL(up_read_non_owner); #endif - - diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a17cba8d94bb..b9d0e72aa80f 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,20 +1,24 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * The owner field of the rw_semaphore structure will be set to - * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear * the owner field when it unlocks. A reader, on the other hand, will * not touch the owner field when it unlocks. * - * In essence, the owner field now has the following 3 states: + * In essence, the owner field now has the following 4 states: * 1) 0 * - lock is free or the owner hasn't set the field yet * 2) RWSEM_READER_OWNED * - lock is currently or previously owned by readers (lock is free * or not set by owner yet) - * 3) Other non-zero value - * - a writer owns the lock + * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well + * - lock is owned by an anonymous writer, so spinning on the lock + * owner should be disabled. + * 4) Other non-zero value + * - a writer owns the lock and other writers can spin on the lock owner. */ -#define RWSEM_READER_OWNED ((struct task_struct *)1UL) +#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0) +#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED) #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) @@ -51,14 +55,22 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } -static inline bool rwsem_owner_is_writer(struct task_struct *owner) +/* + * Return true if the a rwsem waiter can spin on the rwsem's owner + * and steal the lock, i.e. the lock is not anonymously owned. + * N.B. !owner is considered spinnable. + */ +static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) { - return owner && owner != RWSEM_READER_OWNED; + return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); } -static inline bool rwsem_owner_is_reader(struct task_struct *owner) +/* + * Return true if rwsem is owned by an anonymous writer or readers. + */ +static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) { - return owner == RWSEM_READER_OWNED; + return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; } #else static inline void rwsem_set_owner(struct rw_semaphore *sem) diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 0e4cd64ad2c0..5b915b370d5a 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -26,7 +26,7 @@ #include <linux/slab.h> #include <linux/ww_mutex.h> -static DEFINE_WW_CLASS(ww_class); +static DEFINE_WD_CLASS(ww_class); struct workqueue_struct *wq; struct test_mutex { |