Diffstat (limited to 'kernel/futex.c')
-rw-r--r--	kernel/futex.c	203
1 file changed, 164 insertions(+), 39 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index f6ff0191ecf7..44a1261cb9ff 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,14 +63,101 @@
 #include <linux/sched/rt.h>
 #include <linux/hugetlb.h>
 #include <linux/freezer.h>
+#include <linux/bootmem.h>
 
 #include <asm/futex.h>
 
 #include "locking/rtmutex_common.h"
 
-int __read_mostly futex_cmpxchg_enabled;
+/*
+ * Basic futex operation and ordering guarantees:
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake-up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and the syscall can simply return. In
+ * order for this optimization to work, ordering guarantees must exist so
+ * that the waiter being added to the list is acknowledged when the list is
+ * concurrently being checked by the waker, avoiding scenarios like the
+ * following:
+ *
+ * CPU 0                               CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *   uval = *futex;
+ *                                     *futex = newval;
+ *                                     sys_futex(WAKE, futex);
+ *                                       futex_wake(futex);
+ *                                       if (queue_empty())
+ *                                         return;
+ *   if (uval == val)
+ *     lock(hash_bucket(futex));
+ *     queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0                                 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *
+ *   waiters++;
+ *   mb(); (A) <-- paired with -.
+ *                              |
+ *   lock(hash_bucket(futex));  |
+ *                              |
+ *   uval = *futex;             |
+ *                              |        *futex = newval;
+ *                              |        sys_futex(WAKE, futex);
+ *                              |          futex_wake(futex);
+ *                              |
+ *                              `------->  mb(); (B)
+ *   if (uval == val)
+ *     queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();                         if (waiters)
+ *                                           lock(hash_bucket(futex));
+ *                                           wake_waiters(futex);
+ *                                           unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read -- this
+ * is guaranteed by the head counter in the hb spinlock; and where (B)
+ * orders the write to futex and the waiters read -- this is done by the
+ * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
+ * depending on the futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ *	X = Y = 0
+ *
+ *	w[X]=1		w[Y]=1
+ *	MB		MB
+ *	r[Y]=y		r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ */
 
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+int __read_mostly futex_cmpxchg_enabled;
 
 /*
  * Futex flags used to encode options to functions and preserve them across
@@ -149,9 +236,41 @@ static const struct futex_q futex_q_init = {
 struct futex_hash_bucket {
 	spinlock_t lock;
 	struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
 
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;
+
+static inline void futex_get_mm(union futex_key *key)
+{
+	atomic_inc(&key->private.mm->mm_count);
+	/*
+	 * Ensure futex_get_mm() implies a full barrier such that
+	 * get_futex_key() implies a full barrier. This is relied upon
+	 * as full barrier (B), see the ordering comment above.
+	 */
+	smp_mb__after_atomic_inc();
+}
+
+static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Tasks trying to enter the critical region are most likely
+	 * potential waiters that will be added to the plist. Ensure
+	 * that wakers won't miss to-be-slept tasks in the window between
+	 * the wait call and the actual plist_add.
+	 */
+	if (spin_is_locked(&hb->lock))
+		return true;
+	smp_rmb(); /* Make sure we check the lock state first */
+
+	return !plist_head_empty(&hb->chain);
+#else
+	return true;
+#endif
+}
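The (A)/(B) pairing above is the classic store-buffering litmus test. As a
rough user-space sketch (not part of this patch; the thread names and the
use of C11 seq_cst fences are illustrative assumptions), the same guarantee
can be reproduced with C11 atomics: with a full fence between each thread's
store and load, the outcome x == 0 && y == 0 is forbidden, i.e. the waker
cannot miss the queued waiter while the waiter also misses the new futex
value.

    /* Store-buffering sketch, user space, C11 atomics. Illustrative only. */
    #include <stdatomic.h>
    #include <pthread.h>
    #include <assert.h>

    static atomic_int waiters;   /* X: "a waiter is (about to be) queued" */
    static atomic_int futex_val; /* Y: the user-space futex word */
    static long x, y;            /* read back after the threads join */

    static void *waiter_thread(void *arg)
    {
            atomic_store_explicit(&waiters, 1, memory_order_relaxed);   /* w[X]=1 */
            atomic_thread_fence(memory_order_seq_cst);                  /* MB (A) */
            y = atomic_load_explicit(&futex_val, memory_order_relaxed); /* r[Y]=y */
            return NULL;
    }

    static void *waker_thread(void *arg)
    {
            atomic_store_explicit(&futex_val, 1, memory_order_relaxed); /* w[Y]=1 */
            atomic_thread_fence(memory_order_seq_cst);                  /* MB (B) */
            x = atomic_load_explicit(&waiters, memory_order_relaxed);   /* r[X]=x */
            return NULL;
    }

    int main(void)
    {
            pthread_t t1, t2;

            pthread_create(&t1, NULL, waiter_thread, NULL);
            pthread_create(&t2, NULL, waker_thread, NULL);
            pthread_join(t1, NULL);
            pthread_join(t2, NULL);

            /* With both fences in place, at least one side must observe
             * the other's store, so the lost-wakeup interleaving above
             * cannot happen. */
            assert(!(x == 0 && y == 0));
            return 0;
    }

In the kernel itself, barrier (A) comes from the atomic head-counter
increment inside spin_lock() and barrier (B) from get_futex_key_refs(), as
the comment in the hunk above spells out.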
 
 /*
  * We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +280,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
 	u32 hash = jhash2((u32*)&key->both.word,
 			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 			  key->both.offset);
-	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+	return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
 /*
@@ -187,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key)
 
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
-		ihold(key->shared.inode);
+		ihold(key->shared.inode); /* implies MB (B) */
 		break;
 	case FUT_OFF_MMSHARED:
-		atomic_inc(&key->private.mm->mm_count);
+		futex_get_mm(key); /* implies MB (B) */
 		break;
 	}
 }
@@ -264,7 +383,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 	if (!fshared) {
 		key->private.mm = mm;
 		key->private.address = address;
-		get_futex_key_refs(key);
+		get_futex_key_refs(key);  /* implies MB (B) */
 		return 0;
 	}
 
@@ -371,7 +490,7 @@ again:
 		key->shared.pgoff = basepage_index(page);
 	}
 
-	get_futex_key_refs(key);
+	get_futex_key_refs(key); /* implies MB (B) */
 
 out:
 	unlock_page(page_head);
@@ -598,13 +717,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	struct task_struct *p;
 	pid_t pid = uval & FUTEX_TID_MASK;
 
-	head = &hb->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (match_futex(&this->key, key)) {
 			/*
 			 * Another waiter already exists - bump up
@@ -986,7 +1102,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
 
@@ -998,10 +1113,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 		goto out;
 
 	hb = hash_futex(&key);
+
+	/* Make sure we really have tasks to wakeup */
+	if (!hb_waiters_pending(hb))
+		goto out_put_key;
+
 	spin_lock(&hb->lock);
-	head = &hb->chain;
 
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (match_futex (&this->key, &key)) {
 			if (this->pi_state || this->rt_waiter) {
 				ret = -EINVAL;
@@ -1019,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	}
 
 	spin_unlock(&hb->lock);
+out_put_key:
 	put_futex_key(&key);
 out:
 	return ret;
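The fast path added to futex_wake() above is only safe because
hb_waiters_pending() never reports a false "no waiters": a bucket whose lock
is held is conservatively treated as having pending waiters, since the lock
holder is most likely a waiter about to enqueue itself. A stripped-down
user-space analog of that check (illustrative only; struct bucket and its
two flags are assumptions, not the kernel's types):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct bucket {
            atomic_bool locked;      /* stand-in for hb->lock being held */
            atomic_bool has_waiters; /* stand-in for !plist_head_empty(&hb->chain) */
    };

    /*
     * Mirrors the shape of hb_waiters_pending(): if somebody holds the
     * bucket lock, conservatively report "pending"; only when the lock is
     * free do we trust the list itself. The acquire fence keeps the lock
     * check ordered before the list check, like the kernel's smp_rmb().
     */
    static bool waiters_pending(struct bucket *b)
    {
            if (atomic_load_explicit(&b->locked, memory_order_relaxed))
                    return true;
            atomic_thread_fence(memory_order_acquire);
            return atomic_load_explicit(&b->has_waiters, memory_order_relaxed);
    }

A false positive merely costs one avoidable lock acquisition; a false
negative would reintroduce the lost-wakeup scenario described at the top of
the file.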
@@ -1034,7 +1154,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct plist_head *head;
 	struct futex_q *this, *next;
 	int ret, op_ret;
 
@@ -1082,9 +1201,7 @@ retry_private:
 		goto retry;
 	}
 
-	head = &hb1->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
 		if (match_futex (&this->key, &key1)) {
 			if (this->pi_state || this->rt_waiter) {
 				ret = -EINVAL;
@@ -1097,10 +1214,8 @@ retry_private:
 	}
 
 	if (op_ret > 0) {
-		head = &hb2->chain;
-
 		op_ret = 0;
-		plist_for_each_entry_safe(this, next, head, list) {
+		plist_for_each_entry_safe(this, next, &hb2->chain, list) {
 			if (match_futex (&this->key, &key2)) {
 				if (this->pi_state || this->rt_waiter) {
 					ret = -EINVAL;
@@ -1270,7 +1385,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	int drop_count = 0, task_count = 0, ret;
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct plist_head *head1;
 	struct futex_q *this, *next;
 	u32 curval2;
 
@@ -1393,8 +1507,7 @@ retry_private:
 		}
 	}
 
-	head1 = &hb1->chain;
-	plist_for_each_entry_safe(this, next, head1, list) {
+	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
 		if (task_count - nr_wake >= nr_requeue)
 			break;
 
@@ -1489,12 +1602,12 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 	hb = hash_futex(&q->key);
 
 	q->lock_ptr = &hb->lock;
-	spin_lock(&hb->lock);
+	spin_lock(&hb->lock); /* implies MB (A) */
 	return hb;
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_hash_bucket *hb)
 	__releases(&hb->lock)
 {
 	spin_unlock(&hb->lock);
@@ -1867,7 +1980,7 @@ retry_private:
 	ret = get_futex_value_locked(&uval, uaddr);
 
 	if (ret) {
-		queue_unlock(q, *hb);
+		queue_unlock(*hb);
 
 		ret = get_user(uval, uaddr);
 		if (ret)
@@ -1881,7 +1994,7 @@ retry_private:
 	}
 
 	if (uval != val) {
-		queue_unlock(q, *hb);
+		queue_unlock(*hb);
 		ret = -EWOULDBLOCK;
 	}
 
@@ -2029,7 +2142,7 @@ retry_private:
 			 * Task is exiting and we just wait for the
 			 * exit to complete.
 			 */
-			queue_unlock(&q, hb);
+			queue_unlock(hb);
 			put_futex_key(&q.key);
 			cond_resched();
 			goto retry;
@@ -2081,7 +2194,7 @@ retry_private:
 	goto out_put_key;
 
 out_unlock_put_key:
-	queue_unlock(&q, hb);
+	queue_unlock(hb);
 
 out_put_key:
 	put_futex_key(&q.key);
@@ -2091,7 +2204,7 @@ out:
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
-	queue_unlock(&q, hb);
+	queue_unlock(hb);
 
 	ret = fault_in_user_writeable(uaddr);
 	if (ret)
@@ -2113,7 +2226,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
 	u32 uval, vpid = task_pid_vnr(current);
 	int ret;
 
@@ -2153,9 +2265,7 @@ retry:
 	 * Ok, other tasks may need to be woken up - check waiters
 	 * and do the wakeup if necessary:
 	 */
-	head = &hb->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (!match_futex (&this->key, &key))
 			continue;
 		ret = wake_futex_pi(uaddr, uval, this);
@@ -2316,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 * code while we sleep on uaddr.
 	 */
 	debug_rt_mutex_init_waiter(&rt_waiter);
+	RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+	RB_CLEAR_NODE(&rt_waiter.tree_entry);
 	rt_waiter.task = NULL;
 
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
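The two RB_CLEAR_NODE() calls added above are needed because rt_mutex
waiters are now kept in rb-trees, and later code may test the waiter with
RB_EMPTY_NODE() before it has ever been inserted. The kernel marks a
detached node by pointing its parent link back at itself; a minimal
user-space analog of that convention (the rb_node_ish type and helper names
here are hypothetical):

    #include <stdbool.h>

    struct rb_node_ish {
            struct rb_node_ish *parent; /* the kernel packs this with color bits */
    };

    /* cf. RB_CLEAR_NODE(): self-parent means "not on any tree". */
    static void clear_node(struct rb_node_ish *n)
    {
            n->parent = n;
    }

    /* cf. RB_EMPTY_NODE(): only meaningful after clear_node() or insertion. */
    static bool node_empty(const struct rb_node_ish *n)
    {
            return n->parent == n;
    }

Without the clearing step, an "is this waiter enqueued?" test would read an
uninitialized pointer from the stack-allocated rt_waiter.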
@@ -2734,8 +2846,21 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 static int __init futex_init(void)
 {
 	u32 curval;
-	int i;
+	unsigned int futex_shift;
+	unsigned long i;
+
+#if CONFIG_BASE_SMALL
+	futex_hashsize = 16;
+#else
+	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+					       futex_hashsize, 0,
+					       futex_hashsize < 256 ? HASH_SMALL : 0,
+					       &futex_shift, NULL,
+					       futex_hashsize, futex_hashsize);
+	futex_hashsize = 1UL << futex_shift;
 
 	/*
 	 * This will fail and we want it. Some arch implementations do
 	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
@@ -2749,7 +2874,7 @@ static int __init futex_init(void)
 	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
 		futex_cmpxchg_enabled = 1;
 
-	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+	for (i = 0; i < futex_hashsize; i++) {
 		plist_head_init(&futex_queues[i].chain);
 		spin_lock_init(&futex_queues[i].lock);
 	}
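For scale: the table grows from the old fixed 1 << FUTEX_HASHBITS (256
buckets on !CONFIG_BASE_SMALL) to roundup_pow_of_two(256 *
num_possible_cpus()), and hash_futex() now indexes it with
hash & (futex_hashsize - 1), which is why the size must stay a power of two
(futex_hashsize is recomputed from the futex_shift that
alloc_large_system_hash() reports back). A stand-alone sketch of the
arithmetic (roundup_pow_of_two_ul and the six-CPU count are assumptions for
illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* User-space stand-in for the kernel's roundup_pow_of_two(). */
    static unsigned long roundup_pow_of_two_ul(unsigned long n)
    {
            unsigned long p = 1;

            while (p < n)
                    p <<= 1;
            return p;
    }

    int main(void)
    {
            /* 256 * 6 = 1536, rounded up to 2048 buckets. */
            unsigned long hashsize = roundup_pow_of_two_ul(256 * 6);
            uint32_t hash = 0xdeadbeef;

            /* hash & (size - 1) equals hash % size only when size is a
             * power of two, and avoids the division. */
            printf("hashsize=%lu index=%lu\n",
                   hashsize, (unsigned long)(hash & (hashsize - 1)));
            return 0;
    }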