Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--	fs/eventpoll.c	171
1 file changed, 103 insertions, 68 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index eee3c92a9ebf..12eebcdea9c8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,13 +218,18 @@ struct eventpoll {
 	struct file *file;
 
 	/* used to optimize loop detection check */
-	int visited;
 	struct list_head visited_list_link;
+	int visited;
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	/* used to track busy poll napi_id */
 	unsigned int napi_id;
 #endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/* tracks wakeup nests for lockdep validation */
+	u8 nests;
+#endif
 };
 
 /* Wait structure used by the poll hooks */
@@ -545,30 +550,47 @@ out_unlock:
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-static DEFINE_PER_CPU(int, wakeup_nest);
-
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
 {
+	struct eventpoll *ep_src;
 	unsigned long flags;
-	int subclass;
+	u8 nests = 0;
 
-	local_irq_save(flags);
-	preempt_disable();
-	subclass = __this_cpu_read(wakeup_nest);
-	spin_lock_nested(&wq->lock, subclass + 1);
-	__this_cpu_inc(wakeup_nest);
-	wake_up_locked_poll(wq, POLLIN);
-	__this_cpu_dec(wakeup_nest);
-	spin_unlock(&wq->lock);
-	local_irq_restore(flags);
-	preempt_enable();
+	/*
+	 * To set the subclass or nesting level for spin_lock_irqsave_nested()
+	 * it might be natural to create a per-cpu nest count. However, since
+	 * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
+	 * schedule() in the -rt kernel, the per-cpu variables are no longer
+	 * protected. Thus, we are introducing a per-eventpoll nest field.
+	 * If we are not being called from ep_poll_callback(), epi is NULL
+	 * and we are at the first level of nesting, 0. Otherwise, we are
+	 * being called from ep_poll_callback() and if a previous wakeup
+	 * source is not an epoll file itself, we are at depth 1 since the
+	 * wakeup source is depth 0. If the wakeup source is a previous
+	 * epoll file in the wakeup chain then we use its nests value and
+	 * record ours as nests + 1. The previous epoll file's nests value
+	 * is stable since it is already holding its own poll_wait.lock.
+	 */
+	if (epi) {
+		if (is_file_epoll(epi->ffd.file)) {
+			ep_src = epi->ffd.file->private_data;
+			nests = ep_src->nests;
+		} else {
+			nests = 1;
+		}
+	}
+	spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
+	ep->nests = nests + 1;
+	wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+	ep->nests = 0;
+	spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
 }
 
 #else
 
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
 {
-	wake_up_poll(wq, EPOLLIN);
+	wake_up_poll(&ep->poll_wait, EPOLLIN);
 }
 
 #endif
@@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep)
 
 	/* We need to release all tasks waiting for these file */
 	if (waitqueue_active(&ep->poll_wait))
-		ep_poll_safewake(&ep->poll_wait);
+		ep_poll_safewake(ep, NULL);
 
 	/*
 	 * We need to lock this because we could be hit by
@@ -1149,6 +1171,10 @@ static inline bool chain_epi_lockless(struct epitem *epi)
 {
 	struct eventpoll *ep = epi->ep;
 
+	/* Fast preliminary check */
+	if (epi->next != EP_UNACTIVE_PTR)
+		return false;
+
 	/* Check that the same epi has not been just chained from another CPU */
 	if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
 		return false;
@@ -1215,16 +1241,12 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * chained in ep->ovflist and requeued later on.
 	 */
 	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
-		if (epi->next == EP_UNACTIVE_PTR &&
-		    chain_epi_lockless(epi))
+		if (chain_epi_lockless(epi))
+			ep_pm_stay_awake_rcu(epi);
+	} else if (!ep_is_linked(epi)) {
+		/* In the usual case, add the event to the ready list. */
+		if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
 			ep_pm_stay_awake_rcu(epi);
-		goto out_unlock;
-	}
-
-	/* If this file is already in the ready list we exit soon */
-	if (!ep_is_linked(epi) &&
-	    list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
-		ep_pm_stay_awake_rcu(epi);
 	}
 
 	/*
@@ -1258,7 +1280,7 @@ out_unlock:
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(&ep->poll_wait);
+		ep_poll_safewake(ep, epi);
 
 	if (!(epi->event.events & EPOLLEXCLUSIVE))
 		ewake = 1;
@@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(&ep->poll_wait);
+		ep_poll_safewake(ep, NULL);
 
 	return 0;
 
@@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 
 	/* We have to call this outside the lock */
 	if (pwake)
-		ep_poll_safewake(&ep->poll_wait);
+		ep_poll_safewake(ep, NULL);
 
 	return 0;
 }
@@ -1800,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 {
 	int res = 0, eavail, timed_out = 0;
 	u64 slack = 0;
-	bool waiter = false;
 	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
 
@@ -1845,55 +1866,75 @@ fetch_events:
 	 */
 	ep_reset_busy_poll_napi_id(ep);
 
-	/*
-	 * We don't have any available event to return to the caller.  We need
-	 * to sleep here, and we will be woken by ep_poll_callback() when events
-	 * become available.
-	 */
-	if (!waiter) {
-		waiter = true;
-		init_waitqueue_entry(&wait, current);
+	do {
+		/*
+		 * Internally init_wait() uses autoremove_wake_function(),
+		 * thus the wait entry is removed from the wait queue on
+		 * each wakeup. Why is it important? In case of several
+		 * waiters each new wakeup will hit the next waiter, giving
+		 * it the chance to harvest new events. Otherwise a wakeup
+		 * can be lost. This is also good performance-wise, because
+		 * on the normal wakeup path there is no need to call
+		 * __remove_wait_queue() explicitly, so ep->lock is not
+		 * taken, which would stall event delivery.
+		 */
+		init_wait(&wait);
 
 		write_lock_irq(&ep->lock);
-		__add_wait_queue_exclusive(&ep->wq, &wait);
-		write_unlock_irq(&ep->lock);
-	}
-
-	for (;;) {
 		/*
-		 * We don't want to sleep if the ep_poll_callback() sends us
-		 * a wakeup in between. That's why we set the task state
-		 * to TASK_INTERRUPTIBLE before doing the checks.
+		 * Barrierless variant: waitqueue_active() is called under
+		 * the same lock on the ep_poll_callback() wakeup side, so
+		 * it is safe to avoid an explicit barrier.
 		 */
-		set_current_state(TASK_INTERRUPTIBLE);
+		__set_current_state(TASK_INTERRUPTIBLE);
+
 		/*
-		 * Always short-circuit for fatal signals to allow
-		 * threads to make a timely exit without the chance of
-		 * finding more events available and fetching
-		 * repeatedly.
+		 * Do the final check under the lock. ep_scan_ready_list()
+		 * plays with two lists (->rdllist and ->ovflist) and there
+		 * is always a race when both lists are empty for a short
+		 * period of time although events are pending, so the lock
+		 * is important.
 		 */
-		if (fatal_signal_pending(current)) {
-			res = -EINTR;
-			break;
+		eavail = ep_events_available(ep);
+		if (!eavail) {
+			if (signal_pending(current))
+				res = -EINTR;
+			else
+				__add_wait_queue_exclusive(&ep->wq, &wait);
 		}
+		write_unlock_irq(&ep->lock);
 
-		eavail = ep_events_available(ep);
-		if (eavail)
-			break;
-		if (signal_pending(current)) {
-			res = -EINTR;
+		if (eavail || res)
 			break;
-		}
 
 		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
 			timed_out = 1;
 			break;
		}
-	}
+
+		/* We were woken up, thus go and try to harvest some events */
+		eavail = 1;
+
+	} while (0);
 
 	__set_current_state(TASK_RUNNING);
 
+	if (!list_empty_careful(&wait.entry)) {
+		write_lock_irq(&ep->lock);
+		__remove_wait_queue(&ep->wq, &wait);
+		write_unlock_irq(&ep->lock);
+	}
+
 send_events:
+	if (fatal_signal_pending(current)) {
+		/*
+		 * Always short-circuit for fatal signals to allow
+		 * threads to make a timely exit without the chance of
+		 * finding more events available and fetching
+		 * repeatedly.
+		 */
+		res = -EINTR;
+	}
 	/*
 	 * Try to transfer events to user space. In case we get 0 events and
 	 * there's still timeout left over, we go trying again in search of
@@ -1903,12 +1944,6 @@ send_events:
 	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
 		goto fetch_events;
 
-	if (waiter) {
-		write_lock_irq(&ep->lock);
-		__remove_wait_queue(&ep->wq, &wait);
-		write_unlock_irq(&ep->lock);
-	}
-
 	return res;
 }
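
The per-eventpoll nests field in the first hunk replaces the per-CPU wakeup_nest counter: each epoll records the depth at which it is being woken while it holds its own poll_wait.lock, so a nested wakeup can read a stable depth from its wakeup source and pass it on to spin_lock_irqsave_nested(). The following is a minimal userspace sketch of that bookkeeping for the epoll-on-epoll chain only; it is not kernel code, a pthread mutex stands in for the wait-queue spinlock, and the names (evpoll, safewake, watcher) are made up for illustration.

/*
 * Userspace sketch, NOT kernel code: per-object nesting depth recorded
 * under the object's own lock, as the patch does with ep->nests.
 */
#include <pthread.h>
#include <stdio.h>

struct evpoll {
	pthread_mutex_t lock;	/* stands in for ep->poll_wait.lock */
	unsigned nests;		/* the kernel uses a u8 */
	struct evpoll *watcher;	/* an epoll watching this one, if any */
};

/* Wake 'ep'; 'src' is the epoll we are being woken from, if any. */
static void safewake(struct evpoll *ep, struct evpoll *src)
{
	/*
	 * As in the patch: depth 0 at the top of a chain, otherwise the
	 * source's recorded depth. src->nests is stable because src is
	 * still holding its own lock while it wakes us.
	 */
	unsigned nests = src ? src->nests : 0;

	pthread_mutex_lock(&ep->lock);	/* kernel: spin_lock_irqsave_nested(..., nests) */
	ep->nests = nests + 1;
	printf("wakeup at depth %u\n", nests);
	if (ep->watcher)		/* propagate, like ep_poll_callback() */
		safewake(ep->watcher, ep);
	ep->nests = 0;
	pthread_mutex_unlock(&ep->lock);
}

int main(void)
{
	struct evpoll a = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };
	struct evpoll b = { PTHREAD_MUTEX_INITIALIZER, 0, &a };
	struct evpoll c = { PTHREAD_MUTEX_INITIALIZER, 0, &b };

	safewake(&c, NULL);	/* prints depths 0, 1, 2 */
	return 0;
}

In the kernel, the depth computed this way is what lets lockdep treat each level of the wakeup chain as a distinct lock subclass instead of flagging a recursive-lock deadlock, and unlike the old per-CPU counter it stays valid if a non-raw spinlock sleeps on -rt.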
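
The "fast preliminary check" added to chain_epi_lockless() is a classic double-checked atomic pattern: a plain load can leave the cache line shared between CPUs, whereas even a failing cmpxchg takes it exclusive. A small C11 sketch of the same pattern follows; the names (next_slot, chain_lockless, EP_UNACTIVE) are hypothetical stand-ins for epi->next, the kernel helper, and EP_UNACTIVE_PTR.

/* C11 sketch, NOT kernel code: cheap read before an atomic claim. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define EP_UNACTIVE ((void *)-1)	/* "not chained" marker */

static void *_Atomic next_slot = EP_UNACTIVE;

static bool chain_lockless(void)
{
	void *expected = EP_UNACTIVE;

	/* Fast preliminary check: bail out without writing the line. */
	if (atomic_load(&next_slot) != EP_UNACTIVE)
		return false;

	/* Only one caller can win the exchange; losers get false. */
	return atomic_compare_exchange_strong(&next_slot, &expected, NULL);
}

int main(void)
{
	printf("first caller chains the item: %d\n", chain_lockless());  /* 1 */
	printf("second caller bails out early: %d\n", chain_lockless()); /* 0 */
	return 0;
}

Only the cmpxchg decides the winner; the preceding load is purely an optimization and may race harmlessly, which is why the patch can add it without changing the function's semantics.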
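
The reworked ep_poll() loop performs its final ep_events_available() check under ep->lock, the same lock ep_poll_callback() takes to queue an event, so nothing can slip in between the check and going to sleep, and the autoremoving wait entry is torn down afterwards only if a wakeup has not already removed it. The lost-wakeup half of that discipline has a direct userspace analogue, sketched below with a pthread mutex and condition variable standing in for ep->lock and ep->wq; the names (producer, nready) are illustrative only.

/* Userspace sketch, NOT kernel code: final check under the waker's lock. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;	/* stands in for ep->wq */
static int nready;					/* stands in for ep->rdllist */

static void *producer(void *arg)
{
	pthread_mutex_lock(&lock);
	nready++;			/* like ep_poll_callback() queueing an event */
	pthread_cond_signal(&cond);	/* wake one waiter, as an exclusive wakeup does */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);

	pthread_mutex_lock(&lock);
	/*
	 * Final check under the lock; sleep only if nothing is ready.
	 * cond_wait atomically drops the lock while sleeping, just as
	 * ep_poll() adds itself to ep->wq before write_unlock_irq()
	 * and only then schedules.
	 */
	while (!nready)
		pthread_cond_wait(&cond, &lock);
	nready--;
	pthread_mutex_unlock(&lock);

	printf("harvested one event\n");
	pthread_join(t, NULL);
	return 0;
}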