epoll: atomically remove wait entry on wake up

author Roman Penyaev <rpenyaev@suse.de>
Fri, 8 May 2020 01:36:16 +0000 (18:36 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 8 May 2020 02:27:21 +0000 (19:27 -0700)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c

index d6ba0e5..aba03ee 100644 (file)
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1822,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
  {
         int res = 0, eavail, timed_out = 0;
         u64 slack = 0;
-       bool waiter = false;
         wait_queue_entry_t wait;
         ktime_t expires, *to = NULL;
  
@@ -1867,21 +1866,23 @@ fetch_events:
          */
         ep_reset_busy_poll_napi_id(ep);
  
-       /*
-        * We don't have any available event to return to the caller.  We need
-        * to sleep here, and we will be woken by ep_poll_callback() when events
-        * become available.
-        */
-       if (!waiter) {
-               waiter = true;
-               init_waitqueue_entry(&wait, current);
-
+       do {
+               /*
+                * Internally init_wait() uses autoremove_wake_function(),
+                * thus wait entry is removed from the wait queue on each
+                * wakeup. Why is this important? With several waiters,
+                * each new wakeup will hit the next waiter, giving it the
+                * chance to harvest new events. Otherwise a wakeup can be
+                * lost. This is also good performance-wise, because on the
+                * normal wakeup path there is no need to call
+                * __remove_wait_queue() explicitly, thus ep->lock is not
+                * taken, which would halt the event delivery.
+                */
+               init_wait(&wait);
                 write_lock_irq(&ep->lock);
                 __add_wait_queue_exclusive(&ep->wq, &wait);
                 write_unlock_irq(&ep->lock);
-       }
  
-       for (;;) {
                 /*
                  * We don't want to sleep if the ep_poll_callback() sends us
                  * a wakeup in between. That's why we set the task state
@@ -1911,10 +1912,20 @@ fetch_events:
                         timed_out = 1;
                         break;
                 }
-       }
+
+               /* We were woken up, thus go and try to harvest some events */
+               eavail = 1;
+
+       } while (0);
  
         __set_current_state(TASK_RUNNING);
  
+       if (!list_empty_careful(&wait.entry)) {
+               write_lock_irq(&ep->lock);
+               __remove_wait_queue(&ep->wq, &wait);
+               write_unlock_irq(&ep->lock);
+       }
+
  send_events:
         /*
          * Try to transfer events to user space. In case we get 0 events and
@@ -1925,12 +1936,6 @@ send_events:
             !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
                 goto fetch_events;
  
-       if (waiter) {
-               write_lock_irq(&ep->lock);
-               __remove_wait_queue(&ep->wq, &wait);
-               write_unlock_irq(&ep->lock);
-       }
-
         return res;
  }
author	Roman Penyaev <rpenyaev@suse.de>
	Fri, 8 May 2020 01:36:16 +0000 (18:36 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 8 May 2020 02:27:21 +0000 (19:27 -0700)