Diffstat (limited to 'kernel/locking/qspinlock_paravirt.h')
-rw-r--r--	kernel/locking/qspinlock_paravirt.h	101
1 file changed, 77 insertions, 24 deletions
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 04ab18151cc8..c8e6e9a596f5 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -4,6 +4,7 @@
 
 #include <linux/hash.h>
 #include <linux/bootmem.h>
+#include <linux/debug_locks.h>
 
 /*
  * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
@@ -21,9 +22,14 @@
 
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
 enum vcpu_state {
 	vcpu_running = 0,
-	vcpu_halted,
+	vcpu_halted,		/* Used only in pv_wait_node */
+	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
 };
 
 struct pv_node {
@@ -152,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
 
 /*
  * Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
  */
 static void pv_wait_node(struct mcs_spinlock *node)
 {
@@ -171,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 *
 		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
 		 *     MB			      MB
-		 * [L] pn->locked		[RmW] pn->state = vcpu_running
+		 * [L] pn->locked		[RmW] pn->state = vcpu_hashed
 		 *
-		 * Matches the xchg() from pv_kick_node().
+		 * Matches the cmpxchg() from pv_kick_node().
		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
@@ -181,9 +188,10 @@
 			pv_wait(&pn->state, vcpu_halted);
 
 		/*
-		 * Reset the vCPU state to avoid unncessary CPU kicking
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that value
+		 * so that pv_wait_head() knows to not also try to hash this lock.
 		 */
-		WRITE_ONCE(pn->state, vcpu_running);
+		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
@@ -193,6 +201,7 @@
 		 * MCS lock will be released soon.
 		 */
 	}
+
 	/*
 	 * By now our node->locked should be 1 and our caller will not actually
 	 * spin-wait for it. We do however rely on our caller to do a
@@ -201,24 +210,35 @@
 }
 
 /*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state such
+ * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
  */
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct __qspinlock *l = (void *)lock;
 
 	/*
-	 * Note that because node->locked is already set, this actual
-	 * mcs_spinlock entry could be re-used already.
+	 * If the vCPU is indeed halted, advance its state to match that of
+	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+	 * observe its next->locked value and advance itself.
 	 *
-	 * This should be fine however, kicking people for no reason is
-	 * harmless.
+	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+	 */
+	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+		return;
+
+	/*
+	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
 	 *
-	 * See the comment in pv_wait_node().
+	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+	 * the hash table later on at unlock time, no atomic instruction is
+	 * needed.
 	 */
-	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
-		pv_kick(pn->cpu);
+	WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+	(void)pv_hash(lock, pn);
 }
 
 /*
@@ -232,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	struct qspinlock **lp = NULL;
 	int loop;
 
+	/*
+	 * If pv_kick_node() already advanced our state, we don't need to
+	 * insert ourselves into the hash table anymore.
+	 */
+	if (READ_ONCE(pn->state) == vcpu_hashed)
+		lp = (struct qspinlock **)1;
+
 	for (;;) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
@@ -239,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 			cpu_relax();
 		}
 
-		WRITE_ONCE(pn->state, vcpu_halted);
 		if (!lp) { /* ONCE */
+			WRITE_ONCE(pn->state, vcpu_hashed);
 			lp = pv_hash(lock, pn);
+
 			/*
-			 * lp must be set before setting _Q_SLOW_VAL
+			 * We must hash before setting _Q_SLOW_VAL, such that
+			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+			 * we'll be sure to be able to observe our hash entry.
 			 *
-			 * [S] lp = lock                [RmW] l = l->locked = 0
-			 *     MB                             MB
-			 * [S] l->locked = _Q_SLOW_VAL  [L]   lp
+			 *   [S] pn->state
+			 *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
+			 *       MB                           RMB
+			 * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
+			 *                                [L] pn->state
 			 *
-			 * Matches the cmpxchg() in __pv_queued_spin_unlock().
+			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 			 */
 			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
 				/*
@@ -286,14 +318,32 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
 	struct pv_node *node;
+	u8 locked;
 
 	/*
 	 * We must not unlock if SLOW, because in that case we must first
 	 * unhash. Otherwise it would be possible to have multiple @lock
 	 * entries, which would be BAD.
 	 */
-	if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
+	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+	if (likely(locked == _Q_LOCKED_VAL))
+		return;
+
+	if (unlikely(locked != _Q_SLOW_VAL)) {
+		WARN(!debug_locks_silent,
+		     "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+		     (unsigned long)lock, atomic_read(&lock->val));
 		return;
+	}
+
+	/*
+	 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+	 * so we need a barrier to order the read of the node data in
+	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+	 *
+	 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+	 */
+	smp_rmb();
 
 	/*
 	 * Since the above failed to release, this must be the SLOW path.
@@ -310,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	/*
 	 * At this point the memory pointed at by lock can be freed/reused,
	 * however we can still use the pv_node to kick the CPU.
+	 * The other vCPU may not really be halted, but kicking an active
+	 * vCPU is harmless other than the additional latency in completing
+	 * the unlock.
 	 */
-	if (READ_ONCE(node->state) == vcpu_halted)
+	if (READ_ONCE(node->state) == vcpu_hashed)
 		pv_kick(node->cpu);
 }
 /*
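Editor's note: the heart of the patch is the three-state handoff on pn->state. The lock owner in pv_kick_node() tries cmpxchg(vcpu_halted -> vcpu_hashed) while the woken waiter in pv_wait_node() tries cmpxchg(vcpu_halted -> vcpu_running); at most one of the two can succeed, which is how pv_wait_head() later knows whether the lock was already hashed on its behalf. Below is a minimal user-space sketch of that handoff using C11 atomics and pthreads. The enum names mirror the patch, but waiter(), kicker(), the prints, and the omission of pv_wait()/pv_kick() are illustrative stand-ins, not kernel code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum vcpu_state { vcpu_running = 0, vcpu_halted, vcpu_hashed };

/* Stand-in for pn->state; in the kernel this lives in struct pv_node. */
static _Atomic int state = vcpu_running;

/* Roughly the waiter side of pv_wait_node() around the halt/wakeup point. */
static void *waiter(void *arg)
{
	int expected;

	(void)arg;

	/* Publish "about to halt" (the kernel uses smp_store_mb() here). */
	atomic_store(&state, vcpu_halted);

	/* ... pv_wait() would halt this vCPU until it is kicked ... */

	/*
	 * Only fall back to vcpu_running if nobody hashed the lock on our
	 * behalf; if the lock holder already moved us to vcpu_hashed, keep
	 * that value so the head-waiter phase can skip pv_hash().
	 */
	expected = vcpu_halted;
	if (atomic_compare_exchange_strong(&state, &expected, vcpu_running))
		puts("waiter: vcpu_halted -> vcpu_running, will hash later");
	else
		puts("waiter: already vcpu_hashed, skip hashing as queue head");
	return NULL;
}

/* Roughly the lock-holder side of pv_kick_node(). */
static void kicker(void)
{
	int expected = vcpu_halted;

	/*
	 * Advance a halted waiter straight to vcpu_hashed.  If the cmpxchg
	 * fails, the waiter is (or soon will be) running and hashes itself,
	 * so there is nothing left to do here.
	 */
	if (atomic_compare_exchange_strong(&state, &expected, vcpu_hashed))
		puts("kicker: waiter was halted, hashed the lock on its behalf");
	else
		puts("kicker: waiter not halted, it will observe ->locked itself");
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	kicker();
	pthread_join(t, NULL);
	return 0;
}

Because only one of the two cmpxchg() calls on vcpu_halted can succeed, either the kicker hashes the lock and sets _Q_SLOW_VAL on the waiter's behalf, or the waiter does it itself later in pv_wait_head(); never both, which is what keeps the hash table free of duplicate entries for the same lock.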
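Editor's note: the other ordering the patch documents is that the queue head must publish its hash entry before l->locked can be seen as _Q_SLOW_VAL, and that __pv_queued_spin_unlock() needs smp_rmb() after its failed cmpxchg() before it may unhash, because a failed cmpxchg gives no ordering. The following is a user-space model of that pairing, again with C11 atomics; LOCKED_VAL, SLOW_VAL, hash_entry, head_waiter() and unlocker() are made-up stand-ins for _Q_LOCKED_VAL, _Q_SLOW_VAL, the pv hash table and the kernel functions, and the release RMW plus acquire fence play the roles of the kernel's full-barrier cmpxchg() and smp_rmb().

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define LOCKED_VAL 1U	/* stands in for _Q_LOCKED_VAL */
#define SLOW_VAL   3U	/* stands in for _Q_SLOW_VAL   */

static _Atomic unsigned int locked = LOCKED_VAL;	/* lock word, held at start */
static _Atomic(const char *) hash_entry = NULL;		/* stands in for the pv hash table */

/* Queue-head waiter: publish the hash entry, then flip the lock word to SLOW. */
static void *head_waiter(void *arg)
{
	unsigned int old = LOCKED_VAL;

	(void)arg;

	/* "pv_hash()": the entry must be visible before anyone can see SLOW_VAL. */
	atomic_store_explicit(&hash_entry, "pv_node for this lock",
			      memory_order_relaxed);

	/*
	 * The kernel uses a full-barrier cmpxchg(); a release RMW is the C11
	 * analogue that orders the hash store before SLOW_VAL becomes visible.
	 * If it fails, the holder already took the fast unlock path.
	 */
	(void)atomic_compare_exchange_strong_explicit(&locked, &old, SLOW_VAL,
						      memory_order_release,
						      memory_order_relaxed);
	return NULL;
}

/* Unlocker: roughly the shape of __pv_queued_spin_unlock(). */
static void unlocker(void)
{
	unsigned int old = LOCKED_VAL;

	if (atomic_compare_exchange_strong_explicit(&locked, &old, 0,
						    memory_order_release,
						    memory_order_relaxed)) {
		puts("unlock: fast path, waiter had not set SLOW_VAL yet");
		return;
	}

	assert(old == SLOW_VAL);

	/*
	 * The failed cmpxchg above provides no ordering, so an acquire fence
	 * (the kernel's smp_rmb()) is needed before looking up the hash
	 * entry; it pairs with the waiter's release RMW that set SLOW_VAL.
	 */
	atomic_thread_fence(memory_order_acquire);

	const char *node = atomic_load_explicit(&hash_entry, memory_order_relaxed);
	assert(node != NULL);	/* guaranteed visible after the fence */
	printf("unlock: slow path, found \"%s\", would pv_kick() it\n", node);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, head_waiter, NULL);
	unlocker();
	pthread_join(t, NULL);
	return 0;
}

Without the acquire fence, the final load of hash_entry would no longer be guaranteed to see the published entry after observing SLOW_VAL, which is exactly the window the new smp_rmb() in __pv_queued_spin_unlock() closes.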