fs/userfaultfd.c: disable irqs for fault_pending and event locks

author Eric Biggers <ebiggers@google.com>

Thu, 4 Jul 2019 22:14:39 +0000 (15:14 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 5 Jul 2019 02:12:07 +0000 (11:12 +0900)
author Eric Biggers <ebiggers@google.com>
Thu, 4 Jul 2019 22:14:39 +0000 (15:14 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 5 Jul 2019 02:12:07 +0000 (11:12 +0900)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c

index ae0b8b5f69e6e205e4d9288df90ee4a2c8f64b69..ccbdbd62f0d8ecf16a19f2f191d5809e317da9d5 100644 (file)
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -40,6 +40,16 @@ enum userfaultfd_state {
  /*
   * Start with fault_pending_wqh and fault_wqh so they're more likely
   * to be in the same cacheline.
+ *
+ * Locking order:
+ *     fd_wqh.lock
+ *             fault_pending_wqh.lock
+ *                     fault_wqh.lock
+ *             event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
   */
  struct userfaultfd_ctx {
         /* waitqueue head for the pending (i.e. not read) userfaults */
@@ -458,7 +468,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
         blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
                          TASK_KILLABLE;
  
-       spin_lock(&ctx->fault_pending_wqh.lock);
+       spin_lock_irq(&ctx->fault_pending_wqh.lock);
         /*
          * After the __add_wait_queue the uwq is visible to userland
          * through poll/read().
@@ -470,7 +480,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
          * __add_wait_queue.
          */
         set_current_state(blocking_state);
-       spin_unlock(&ctx->fault_pending_wqh.lock);
+       spin_unlock_irq(&ctx->fault_pending_wqh.lock);
  
         if (!is_vm_hugetlb_page(vmf->vma))
                 must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
@@ -552,13 +562,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
          * kernel stack can be released after the list_del_init.
          */
         if (!list_empty_careful(&uwq.wq.entry)) {
-               spin_lock(&ctx->fault_pending_wqh.lock);
+               spin_lock_irq(&ctx->fault_pending_wqh.lock);
                 /*
                  * No need of list_del_init(), the uwq on the stack
                  * will be freed shortly anyway.
                  */
                 list_del(&uwq.wq.entry);
-               spin_unlock(&ctx->fault_pending_wqh.lock);
+               spin_unlock_irq(&ctx->fault_pending_wqh.lock);
         }
  
         /*
@@ -583,7 +593,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
         init_waitqueue_entry(&ewq->wq, current);
         release_new_ctx = NULL;
  
-       spin_lock(&ctx->event_wqh.lock);
+       spin_lock_irq(&ctx->event_wqh.lock);
         /*
          * After the __add_wait_queue the uwq is visible to userland
          * through poll/read().
@@ -613,15 +623,15 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
                         break;
                 }
  
-               spin_unlock(&ctx->event_wqh.lock);
+               spin_unlock_irq(&ctx->event_wqh.lock);
  
                 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
                 schedule();
  
-               spin_lock(&ctx->event_wqh.lock);
+               spin_lock_irq(&ctx->event_wqh.lock);
         }
         __set_current_state(TASK_RUNNING);
-       spin_unlock(&ctx->event_wqh.lock);
+       spin_unlock_irq(&ctx->event_wqh.lock);
  
         if (release_new_ctx) {
                 struct vm_area_struct *vma;
@@ -918,10 +928,10 @@ wakeup:
          * the last page faults that may have been already waiting on
          * the fault_*wqh.
          */
-       spin_lock(&ctx->fault_pending_wqh.lock);
+       spin_lock_irq(&ctx->fault_pending_wqh.lock);
         __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
         __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
-       spin_unlock(&ctx->fault_pending_wqh.lock);
+       spin_unlock_irq(&ctx->fault_pending_wqh.lock);
  
         /* Flush pending events that may still wait on event_wqh */
         wake_up_all(&ctx->event_wqh);
@@ -1134,7 +1144,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
  
         if (!ret && msg->event == UFFD_EVENT_FORK) {
                 ret = resolve_userfault_fork(ctx, fork_nctx, msg);
-               spin_lock(&ctx->event_wqh.lock);
+               spin_lock_irq(&ctx->event_wqh.lock);
                 if (!list_empty(&fork_event)) {
                         /*
                          * The fork thread didn't abort, so we can
@@ -1180,7 +1190,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                         if (ret)
                                 userfaultfd_ctx_put(fork_nctx);
                 }
-               spin_unlock(&ctx->event_wqh.lock);
+               spin_unlock_irq(&ctx->event_wqh.lock);
         }
  
         return ret;
@@ -1219,14 +1229,14 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
  static void __wake_userfault(struct userfaultfd_ctx *ctx,
                              struct userfaultfd_wake_range *range)
  {
-       spin_lock(&ctx->fault_pending_wqh.lock);
+       spin_lock_irq(&ctx->fault_pending_wqh.lock);
         /* wake all in the range and autoremove */
         if (waitqueue_active(&ctx->fault_pending_wqh))
                 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
                                      range);
         if (waitqueue_active(&ctx->fault_wqh))
                 __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
-       spin_unlock(&ctx->fault_pending_wqh.lock);
+       spin_unlock_irq(&ctx->fault_pending_wqh.lock);
  }
  
  static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
@@ -1881,7 +1891,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
         wait_queue_entry_t *wq;
         unsigned long pending = 0, total = 0;
  
-       spin_lock(&ctx->fault_pending_wqh.lock);
+       spin_lock_irq(&ctx->fault_pending_wqh.lock);
         list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
                 pending++;
                 total++;
@@ -1889,7 +1899,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
         list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
                 total++;
         }
-       spin_unlock(&ctx->fault_pending_wqh.lock);
+       spin_unlock_irq(&ctx->fault_pending_wqh.lock);
  
         /*
          * If more protocols will be added, there will be all shown
author	Eric Biggers <ebiggers@google.com>
	Thu, 4 Jul 2019 22:14:39 +0000 (15:14 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 5 Jul 2019 02:12:07 +0000 (11:12 +0900)