Diffstat (limited to 'fs/eventpoll.c')
 fs/eventpoll.c | 171
 1 file changed, 103 insertions(+), 68 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index eee3c92a9ebf..12eebcdea9c8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,13 +218,18 @@ struct eventpoll {
         struct file *file;
 
         /* used to optimize loop detection check */
-        int visited;
         struct list_head visited_list_link;
+        int visited;
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
         /* used to track busy poll napi_id */
         unsigned int napi_id;
 #endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        /* tracks wakeup nests for lockdep validation */
+        u8 nests;
+#endif
 };
 
 /* Wait structure used by the poll hooks */
@@ -545,30 +550,47 @@ out_unlock:
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-static DEFINE_PER_CPU(int, wakeup_nest);
-
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
 {
+        struct eventpoll *ep_src;
         unsigned long flags;
-        int subclass;
+        u8 nests = 0;
 
-        local_irq_save(flags);
-        preempt_disable();
-        subclass = __this_cpu_read(wakeup_nest);
-        spin_lock_nested(&wq->lock, subclass + 1);
-        __this_cpu_inc(wakeup_nest);
-        wake_up_locked_poll(wq, POLLIN);
-        __this_cpu_dec(wakeup_nest);
-        spin_unlock(&wq->lock);
-        local_irq_restore(flags);
-        preempt_enable();
+        /*
+         * To set the subclass or nesting level for spin_lock_irqsave_nested()
+         * it might be natural to create a per-cpu nest count. However, since
+         * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
+         * schedule() in the -rt kernel, the per-cpu variable is no longer
+         * protected. Thus, we are introducing a per eventpoll nest field.
+         * If we are not being called from ep_poll_callback(), epi is NULL and
+         * we are at the first level of nesting, 0. Otherwise, we are being
+         * called from ep_poll_callback() and if a previous wakeup source is
+         * not an epoll file itself, we are at depth 1 since the wakeup source
+         * is depth 0. If the wakeup source is a previous epoll file in the
+         * wakeup chain then we use its nests value and record ours as
+         * nests + 1. The previous epoll file nests value is stable since it is
+         * already holding its own poll_wait.lock.
+         */
+        if (epi) {
+                if ((is_file_epoll(epi->ffd.file))) {
+                        ep_src = epi->ffd.file->private_data;
+                        nests = ep_src->nests;
+                } else {
+                        nests = 1;
+                }
+        }
+        spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
+        ep->nests = nests + 1;
+        wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+        ep->nests = 0;
+        spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
 }
 
 #else
 
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
 {
-        wake_up_poll(wq, EPOLLIN);
+        wake_up_poll(&ep->poll_wait, EPOLLIN);
 }
 
 #endif
@@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep)
 
         /* We need to release all tasks waiting for these file */
         if (waitqueue_active(&ep->poll_wait))
-                ep_poll_safewake(&ep->poll_wait);
+                ep_poll_safewake(ep, NULL);
 
         /*
          * We need to lock this because we could be hit by
@@ -1149,6 +1171,10 @@ static inline bool chain_epi_lockless(struct epitem *epi)
 {
         struct eventpoll *ep = epi->ep;
 
+        /* Fast preliminary check */
+        if (epi->next != EP_UNACTIVE_PTR)
+                return false;
+
         /* Check that the same epi has not been just chained from another CPU */
         if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
                 return false;
@@ -1215,16 +1241,12 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
          * chained in ep->ovflist and requeued later on.
          */
         if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
-                if (epi->next == EP_UNACTIVE_PTR &&
-                    chain_epi_lockless(epi))
+                if (chain_epi_lockless(epi))
+                        ep_pm_stay_awake_rcu(epi);
+        } else if (!ep_is_linked(epi)) {
+                /* In the usual case, add event to ready list. */
+                if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
                         ep_pm_stay_awake_rcu(epi);
-                goto out_unlock;
-        }
-
-        /* If this file is already in the ready list we exit soon */
-        if (!ep_is_linked(epi) &&
-            list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
-                ep_pm_stay_awake_rcu(epi);
         }
 
         /*
@@ -1258,7 +1280,7 @@ out_unlock:
 
         /* We have to call this outside the lock */
         if (pwake)
-                ep_poll_safewake(&ep->poll_wait);
+                ep_poll_safewake(ep, epi);
 
         if (!(epi->event.events & EPOLLEXCLUSIVE))
                 ewake = 1;
@@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 
         /* We have to call this outside the lock */
         if (pwake)
-                ep_poll_safewake(&ep->poll_wait);
+                ep_poll_safewake(ep, NULL);
 
         return 0;
 
@@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 
         /* We have to call this outside the lock */
         if (pwake)
-                ep_poll_safewake(&ep->poll_wait);
+                ep_poll_safewake(ep, NULL);
 
         return 0;
 }
@@ -1800,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 {
         int res = 0, eavail, timed_out = 0;
         u64 slack = 0;
-        bool waiter = false;
         wait_queue_entry_t wait;
         ktime_t expires, *to = NULL;
 
@@ -1845,55 +1866,75 @@ fetch_events:
          */
         ep_reset_busy_poll_napi_id(ep);
 
-        /*
-         * We don't have any available event to return to the caller. We need
-         * to sleep here, and we will be woken by ep_poll_callback() when events
-         * become available.
-         */
-        if (!waiter) {
-                waiter = true;
-                init_waitqueue_entry(&wait, current);
+        do {
+                /*
+                 * Internally init_wait() uses autoremove_wake_function(),
+                 * thus the wait entry is removed from the wait queue on each
+                 * wakeup. Why is it important? In case of several waiters
+                 * each new wakeup will hit the next waiter, giving it the
+                 * chance to harvest a new event. Otherwise a wakeup can be
+                 * lost. This is also good performance-wise, because on the
+                 * normal wakeup path there is no need to call __remove_wait_queue()
+                 * explicitly, thus ep->lock is not taken, which would stall
+                 * event delivery.
+                 */
+                init_wait(&wait);
                 write_lock_irq(&ep->lock);
-                __add_wait_queue_exclusive(&ep->wq, &wait);
-                write_unlock_irq(&ep->lock);
-        }
-
-        for (;;) {
                 /*
-                 * We don't want to sleep if the ep_poll_callback() sends us
-                 * a wakeup in between. That's why we set the task state
-                 * to TASK_INTERRUPTIBLE before doing the checks.
+                 * Barrierless variant, waitqueue_active() is called under
+                 * the same lock on wakeup ep_poll_callback() side, so it
+                 * is safe to avoid an explicit barrier.
                  */
-                set_current_state(TASK_INTERRUPTIBLE);
+                __set_current_state(TASK_INTERRUPTIBLE);
+
                 /*
-                 * Always short-circuit for fatal signals to allow
-                 * threads to make a timely exit without the chance of
-                 * finding more events available and fetching
-                 * repeatedly.
+                 * Do the final check under the lock. ep_scan_ready_list()
+                 * plays with two lists (->rdllist and ->ovflist) and there
+                 * is always a race when both lists are empty for a short
+                 * period of time although events are pending, so the lock is
+                 * important.
                  */
-                if (fatal_signal_pending(current)) {
-                        res = -EINTR;
-                        break;
+                eavail = ep_events_available(ep);
+                if (!eavail) {
+                        if (signal_pending(current))
+                                res = -EINTR;
+                        else
+                                __add_wait_queue_exclusive(&ep->wq, &wait);
                 }
+                write_unlock_irq(&ep->lock);
 
-                eavail = ep_events_available(ep);
-                if (eavail)
-                        break;
-                if (signal_pending(current)) {
-                        res = -EINTR;
+                if (eavail || res)
                         break;
-                }
 
                 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
                         timed_out = 1;
                         break;
                 }
-        }
+
+                /* We were woken up, thus go and try to harvest some events */
+                eavail = 1;
+
+        } while (0);
 
         __set_current_state(TASK_RUNNING);
 
+        if (!list_empty_careful(&wait.entry)) {
+                write_lock_irq(&ep->lock);
+                __remove_wait_queue(&ep->wq, &wait);
+                write_unlock_irq(&ep->lock);
+        }
+
 send_events:
+        if (fatal_signal_pending(current)) {
+                /*
+                 * Always short-circuit for fatal signals to allow
+                 * threads to make a timely exit without the chance of
+                 * finding more events available and fetching
+                 * repeatedly.
+                 */
+                res = -EINTR;
+        }
         /*
          * Try to transfer events to user space. In case we get 0 events and
          * there's still timeout left over, we go trying again in search of
         * more luck.
@@ -1903,12 +1944,6 @@ send_events:
             !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
                 goto fetch_events;
 
-        if (waiter) {
-                write_lock_irq(&ep->lock);
-                __remove_wait_queue(&ep->wq, &wait);
-                write_unlock_irq(&ep->lock);
-        }
-
         return res;
 }
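Editor's note: the nesting that the reworked ep_poll_safewake() accounts for can be produced entirely from user space by registering one epoll file descriptor inside another. The standalone program below is a minimal sketch of that scenario, not part of the patch, and its file and variable names are arbitrary: a write to an eventfd wakes the inner epoll instance, and that wakeup is propagated to the outer instance through ep_poll_callback() and ep_poll_safewake().

/* nested_epoll.c - hypothetical demo, build with: cc -o nested_epoll nested_epoll.c */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>

int main(void)
{
        struct epoll_event ev = { .events = EPOLLIN }, out;
        uint64_t one = 1;
        int efd = eventfd(0, 0);
        int inner = epoll_create1(0);
        int outer = epoll_create1(0);

        if (efd < 0 || inner < 0 || outer < 0) {
                perror("setup");
                return 1;
        }

        /* the eventfd is watched by the inner epoll instance */
        ev.data.fd = efd;
        if (epoll_ctl(inner, EPOLL_CTL_ADD, efd, &ev) < 0) {
                perror("add efd to inner");
                return 1;
        }

        /* the inner epoll fd is itself watched by the outer instance:
         * this is the wakeup chain the kernel has to walk safely */
        ev.data.fd = inner;
        if (epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev) < 0) {
                perror("add inner to outer");
                return 1;
        }

        /* signalling the eventfd makes the inner instance ready, which
         * in turn makes the outer instance report the inner fd as ready */
        if (write(efd, &one, sizeof(one)) != sizeof(one)) {
                perror("write eventfd");
                return 1;
        }

        if (epoll_wait(outer, &out, 1, 1000) == 1)
                printf("outer epoll: fd %d is ready\n", out.data.fd);
        if (epoll_wait(inner, &out, 1, 0) == 1)
                printf("inner epoll: fd %d is ready\n", out.data.fd);

        close(efd);
        close(inner);
        close(outer);
        return 0;
}

With CONFIG_DEBUG_LOCK_ALLOC enabled, each poll_wait.lock along such a chain is taken with an increasing lockdep subclass derived from the new per-instance nests field, rather than from the removed per-cpu wakeup_nest counter.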