aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- arch/x86/include/asm/qspinlock_paravirt.h | 59
-rw-r--r-- include/asm-generic/qspinlock.h | 9
-rw-r--r-- kernel/locking/qspinlock.c | 48
-rw-r--r-- kernel/locking/qspinlock_paravirt.h | 43
-rw-r--r-- lib/atomic64_test.c | 120
5 files changed, 210 insertions, 69 deletions
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index b002e711ba88..9f92c180ed2f 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -1,6 +1,65 @@
#ifndef __ASM_QSPINLOCK_PARAVIRT_H
#define __ASM_QSPINLOCK_PARAVIRT_H
+/*
+ * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
+ * registers. For i386, however, only 1 32-bit register needs to be saved
+ * and restored. So an optimized version of __pv_queued_spin_unlock() is
+ * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
+ */
+#ifdef CONFIG_64BIT
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
+#define __pv_queued_spin_unlock __pv_queued_spin_unlock
+#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
+#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
+
+/*
+ * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
+ * which combines the register-saving thunk and the body of the following
+ * C code:
+ *
+ * void __pv_queued_spin_unlock(struct qspinlock *lock)
+ * {
+ * struct __qspinlock *l = (void *)lock;
+ * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ *
+ * if (likely(lockval == _Q_LOCKED_VAL))
+ * return;
+ * __pv_queued_spin_unlock_slowpath(lock, lockval);
+ * }
+ *
+ * For x86-64,
+ * rdi = lock (first argument)
+ * rsi = lockval (second argument)
+ * rdx = internal variable (set to 0)
+ */
+asm (".pushsection .text;"
+ ".globl " PV_UNLOCK ";"
+ ".align 4,0x90;"
+ PV_UNLOCK ": "
+ "push %rdx;"
+ "mov $0x1,%eax;"
+ "xor %edx,%edx;"
+ "lock cmpxchg %dl,(%rdi);"
+ "cmp $0x1,%al;"
+ "jne .slowpath;"
+ "pop %rdx;"
+ "ret;"
+ ".slowpath: "
+ "push %rsi;"
+ "movzbl %al,%esi;"
+ "call " PV_UNLOCK_SLOWPATH ";"
+ "pop %rsi;"
+ "pop %rdx;"
+ "ret;"
+ ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
+ ".popsection");
+
+#else /* CONFIG_64BIT */
+
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+#endif /* CONFIG_64BIT */
#endif
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index e2aadbc7151f..39e1cb201b8e 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -12,8 +12,9 @@
* GNU General Public License for more details.
*
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <[email protected]>
+ * Authors: Waiman Long <[email protected]>
*/
#ifndef __ASM_GENERIC_QSPINLOCK_H
#define __ASM_GENERIC_QSPINLOCK_H
@@ -62,7 +63,7 @@ static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{
if (!atomic_read(&lock->val) &&
- (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0))
+ (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0))
return 1;
return 0;
}
@@ -77,7 +78,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
u32 val;
- val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+ val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
if (likely(val == 0))
return;
queued_spin_lock_slowpath(lock, val);
@@ -93,7 +94,7 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock)
/*
* smp_mb__before_atomic() in order to guarantee release semantics
*/
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_sub(_Q_LOCKED_VAL, &lock->val);
}
#endif
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5..986207887def 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <[email protected]>
+ * Authors: Waiman Long <[email protected]>
* Peter Zijlstra <[email protected]>
*/
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
struct __qspinlock *l = (void *)lock;
- return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+ /*
+ * Use release semantics to make sure that the MCS node is properly
+ * initialized before changing the tail code.
+ */
+ return (u32)xchg_release(&l->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Use release semantics to make sure that the MCS node is
+ * properly initialized before changing the tail code.
+ */
+ old = atomic_cmpxchg_release(&lock->val, val, new);
if (old == val)
break;
@@ -319,7 +329,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (val == new)
new |= _Q_PENDING_VAL;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Acquire semantics are required here as the function may
+ * return immediately if the lock was free.
+ */
+ old = atomic_cmpxchg_acquire(&lock->val, val, new);
if (old == val)
break;
@@ -382,6 +396,7 @@ queue:
* p,*,* -> n,*,*
*/
old = xchg_tail(lock, tail);
+ next = NULL;
/*
* if there was a previous node; link it and wait until reaching the
@@ -393,6 +408,16 @@ queue:
pv_wait_node(node);
arch_mcs_spin_lock_contended(&node->locked);
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
}
/*
@@ -426,7 +451,12 @@ queue:
set_locked(lock);
break;
}
- old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ /*
+ * The smp_load_acquire() call above has provided the necessary
+ * acquire semantics required for locking. At most two
+ * iterations of this loop may be run.
+ */
+ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
goto release; /* No contention */
@@ -434,10 +464,12 @@ queue:
}
/*
- * contended path; wait for next, release.
+ * contended path; wait for next if not observed yet, release.
*/
- while (!(next = READ_ONCE(node->next)))
- cpu_relax();
+ if (!next) {
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+ }
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f0450ff4829b..4bd323d38c60 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -308,23 +308,14 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
}
/*
- * PV version of the unlock function to be used in stead of
- * queued_spin_unlock().
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
*/
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
- u8 locked;
-
- /*
- * We must not unlock if SLOW, because in that case we must first
- * unhash. Otherwise it would be possible to have multiple @lock
- * entries, which would be BAD.
- */
- locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
- return;
if (unlikely(locked != _Q_SLOW_VAL)) {
WARN(!debug_locks_silent,
@@ -363,12 +354,32 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
*/
pv_kick(node->cpu);
}
+
/*
* Include the architecture specific callee-save thunk of the
* __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
- * that the callee-save thunk and the real unlock function are close
- * to each other sharing consecutive instruction cachelines.
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, an architecture-specific version of __pv_queued_spin_unlock()
+ * can be defined.
*/
#include <asm/qspinlock_paravirt.h>
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ u8 locked;
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
+ return;
+
+ __pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index 83c33a5bcffb..18e422b259cf 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -27,6 +27,65 @@ do { \
(unsigned long long)r); \
} while (0)
+/*
+ * Test for an atomic operation family,
+ * @test should be a macro accepting parameters (bit, op, ...)
+ */
+
+#define FAMILY_TEST(test, bit, op, args...) \
+do { \
+ test(bit, op, ##args); \
+ test(bit, op##_acquire, ##args); \
+ test(bit, op##_release, ##args); \
+ test(bit, op##_relaxed, ##args); \
+} while (0)
+
+#define TEST_RETURN(bit, op, c_op, val) \
+do { \
+ atomic##bit##_set(&v, v0); \
+ r = v0; \
+ r c_op val; \
+ BUG_ON(atomic##bit##_##op(val, &v) != r); \
+ BUG_ON(atomic##bit##_read(&v) != r); \
+} while (0)
+
+#define RETURN_FAMILY_TEST(bit, op, c_op, val) \
+do { \
+ FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \
+} while (0)
+
+#define TEST_ARGS(bit, op, init, ret, expect, args...) \
+do { \
+ atomic##bit##_set(&v, init); \
+ BUG_ON(atomic##bit##_##op(&v, ##args) != ret); \
+ BUG_ON(atomic##bit##_read(&v) != expect); \
+} while (0)
+
+#define XCHG_FAMILY_TEST(bit, init, new) \
+do { \
+ FAMILY_TEST(TEST_ARGS, bit, xchg, init, init, new, new); \
+} while (0)
+
+#define CMPXCHG_FAMILY_TEST(bit, init, new, wrong) \
+do { \
+ FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \
+ init, init, new, init, new); \
+ FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \
+ init, init, init, wrong, new); \
+} while (0)
+
+#define INC_RETURN_FAMILY_TEST(bit, i) \
+do { \
+ FAMILY_TEST(TEST_ARGS, bit, inc_return, \
+ i, (i) + one, (i) + one); \
+} while (0)
+
+#define DEC_RETURN_FAMILY_TEST(bit, i) \
+do { \
+ FAMILY_TEST(TEST_ARGS, bit, dec_return, \
+ i, (i) - one, (i) - one); \
+} while (0)
+
static __init void test_atomic(void)
{
int v0 = 0xaaa31337;
@@ -45,6 +104,18 @@ static __init void test_atomic(void)
TEST(, and, &=, v1);
TEST(, xor, ^=, v1);
TEST(, andnot, &= ~, v1);
+
+ RETURN_FAMILY_TEST(, add_return, +=, onestwos);
+ RETURN_FAMILY_TEST(, add_return, +=, -one);
+ RETURN_FAMILY_TEST(, sub_return, -=, onestwos);
+ RETURN_FAMILY_TEST(, sub_return, -=, -one);
+
+ INC_RETURN_FAMILY_TEST(, v0);
+ DEC_RETURN_FAMILY_TEST(, v0);
+
+ XCHG_FAMILY_TEST(, v0, v1);
+ CMPXCHG_FAMILY_TEST(, v0, v1, onestwos);
+
}
#define INIT(c) do { atomic64_set(&v, c); r = c; } while (0)
@@ -74,25 +145,10 @@ static __init void test_atomic64(void)
TEST(64, xor, ^=, v1);
TEST(64, andnot, &= ~, v1);
- INIT(v0);
- r += onestwos;
- BUG_ON(atomic64_add_return(onestwos, &v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
- r += -one;
- BUG_ON(atomic64_add_return(-one, &v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
- r -= onestwos;
- BUG_ON(atomic64_sub_return(onestwos, &v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
- r -= -one;
- BUG_ON(atomic64_sub_return(-one, &v) != r);
- BUG_ON(v.counter != r);
+ RETURN_FAMILY_TEST(64, add_return, +=, onestwos);
+ RETURN_FAMILY_TEST(64, add_return, +=, -one);
+ RETURN_FAMILY_TEST(64, sub_return, -=, onestwos);
+ RETURN_FAMILY_TEST(64, sub_return, -=, -one);
INIT(v0);
atomic64_inc(&v);
@@ -100,33 +156,15 @@ static __init void test_atomic64(void)
BUG_ON(v.counter != r);
INIT(v0);
- r += one;
- BUG_ON(atomic64_inc_return(&v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
atomic64_dec(&v);
r -= one;
BUG_ON(v.counter != r);
- INIT(v0);
- r -= one;
- BUG_ON(atomic64_dec_return(&v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
- BUG_ON(atomic64_xchg(&v, v1) != v0);
- r = v1;
- BUG_ON(v.counter != r);
-
- INIT(v0);
- BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0);
- r = v1;
- BUG_ON(v.counter != r);
+ INC_RETURN_FAMILY_TEST(64, v0);
+ DEC_RETURN_FAMILY_TEST(64, v0);
- INIT(v0);
- BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0);
- BUG_ON(v.counter != r);
+ XCHG_FAMILY_TEST(64, v0, v1);
+ CMPXCHG_FAMILY_TEST(64, v0, v1, v2);
INIT(v0);
BUG_ON(atomic64_add_unless(&v, one, v0));