userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor
author Pavel Emelyanov <xemul@parallels.com>
Wed, 22 Feb 2017 23:42:21 +0000 (15:42 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 23 Feb 2017 00:41:28 +0000 (16:41 -0800)
The custom events are queued in ctx->event_wqh so as not to disturb the
fast-path PF queue-wait-wakeup functions.

The events to be generated (other than PFs) are requested in the UFFD_API
ioctl via the uffdio_api.features bits. The bits known to the kernel are
then turned on and reported back to user-space.

Link: http://lkml.kernel.org/r/20161216144821.5183-7-aarcange@redhat.com
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michael Rapoport <RAPOPORT@il.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/userfaultfd.c

index a209588..b5074a3 100644 (file)
@@ -12,6 +12,7 @@
  *  mm/ksm.c (mm hashing).
  */
 
+#include <linux/list.h>
 #include <linux/hashtable.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
@@ -45,12 +46,16 @@ struct userfaultfd_ctx {
        wait_queue_head_t fault_wqh;
        /* waitqueue head for the pseudo fd to wakeup poll/read */
        wait_queue_head_t fd_wqh;
+       /* waitqueue head for events */
+       wait_queue_head_t event_wqh;
        /* a refile sequence protected by fault_pending_wqh lock */
        struct seqcount refile_seq;
        /* pseudo fd refcounting */
        atomic_t refcount;
        /* userfaultfd syscall flags */
        unsigned int flags;
+       /* features requested from the userspace */
+       unsigned int features;
        /* state machine */
        enum userfaultfd_state state;
        /* released */
@@ -142,6 +147,8 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
                VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
                VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
                VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
+               VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
+               VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
                VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
                VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
                mmdrop(ctx->mm);
@@ -458,6 +465,59 @@ out:
        return ret;
 }
 
+static int __maybe_unused userfaultfd_event_wait_completion(
+               struct userfaultfd_ctx *ctx,
+               struct userfaultfd_wait_queue *ewq)
+{      /*
+        * Publish the event described by ewq->msg on ctx->event_wqh and
+        * sleep (killable) until userfaultfd_event_complete() zeroes
+        * ewq->msg.event, i.e. until a reader has copied the message.
+        *
+        * Returns 0 once the event was consumed, or -1 if the wait was
+        * abandoned because ctx->released was set or a fatal signal is
+        * pending.  On both paths one reference on ctx is dropped
+        * before returning.
+        *
+        * NOTE(review): no caller is visible in this patch (hence
+        * __maybe_unused); presumably the caller takes the ctx
+        * reference that is dropped here -- confirm.
+        */
+       int ret = 0;
+
+       ewq->ctx = ctx;
+       init_waitqueue_entry(&ewq->wq, current);
+
+       spin_lock(&ctx->event_wqh.lock);
+       /*
+        * After the __add_wait_queue the ewq is visible to userland
+        * through poll/read().
+        */
+       __add_wait_queue(&ctx->event_wqh, &ewq->wq);
+       for (;;) {
+               set_current_state(TASK_KILLABLE);
+               if (ewq->msg.event == 0) /* a reader consumed the event */
+                       break;
+               if (ACCESS_ONCE(ctx->released) ||
+                   fatal_signal_pending(current)) {
+                       ret = -1;
+                       __remove_wait_queue(&ctx->event_wqh, &ewq->wq); /* abandoning: dequeue ewq ourselves */
+                       break;
+               }
+
+               spin_unlock(&ctx->event_wqh.lock);
+
+               wake_up_poll(&ctx->fd_wqh, POLLIN); /* notify poll/read sleepers on the fd */
+               schedule();
+
+               spin_lock(&ctx->event_wqh.lock);
+       }
+       __set_current_state(TASK_RUNNING);
+       spin_unlock(&ctx->event_wqh.lock);
+
+       /*
+        * ctx may go away after this if the userfault pseudo fd is
+        * already released.
+        */
+
+       userfaultfd_ctx_put(ctx);
+       return ret;
+}
+
+static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
+                                      struct userfaultfd_wait_queue *ewq)
+{      /*
+        * Mark the event carried by ewq as delivered and release the
+        * task waiting on it.  The caller in userfaultfd_ctx_read()
+        * invokes this with ctx->event_wqh.lock held, which the
+        * _locked wake-up and the raw queue removal below rely on.
+        */
+       ewq->msg.event = 0;     /* the waiter's loop exits on event == 0 */
+       wake_up_locked(&ctx->event_wqh);
+       __remove_wait_queue(&ctx->event_wqh, &ewq->wq); /* waiter does not dequeue itself on this path */
+}
+
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
        struct userfaultfd_ctx *ctx = file->private_data;
@@ -546,6 +606,12 @@ static inline struct userfaultfd_wait_queue *find_userfault(
        return find_userfault_in(&ctx->fault_pending_wqh);
 }
 
+static inline struct userfaultfd_wait_queue *find_userfault_evt(
+               struct userfaultfd_ctx *ctx)
+{      /*
+        * Look up a queued non-PF event by running find_userfault_in()
+        * on ctx->event_wqh.  The caller in userfaultfd_ctx_read()
+        * holds ctx->event_wqh.lock around this call and treats a NULL
+        * result as "no event queued".
+        */
+       return find_userfault_in(&ctx->event_wqh);
+}
+
 static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
 {
        struct userfaultfd_ctx *ctx = file->private_data;
@@ -577,6 +643,9 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
                smp_mb();
                if (waitqueue_active(&ctx->fault_pending_wqh))
                        ret = POLLIN;
+               else if (waitqueue_active(&ctx->event_wqh))
+                       ret = POLLIN;
+
                return ret;
        default:
                WARN_ON_ONCE(1);
@@ -641,6 +710,19 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                        break;
                }
                spin_unlock(&ctx->fault_pending_wqh.lock);
+
+               spin_lock(&ctx->event_wqh.lock);
+               uwq = find_userfault_evt(ctx);
+               if (uwq) {
+                       *msg = uwq->msg;
+
+                       userfaultfd_event_complete(ctx, uwq);
+                       spin_unlock(&ctx->event_wqh.lock);
+                       ret = 0;
+                       break;
+               }
+               spin_unlock(&ctx->event_wqh.lock);
+
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
@@ -1184,6 +1266,14 @@ out:
        return ret;
 }
 
+static inline unsigned int uffd_ctx_features(__u64 user_features)
+{
+       /*
+        * Translate the feature bits passed by userland in
+        * uffdio_api.features into the value stored in ctx->features.
+        *
+        * For the current set of features the bits just coincide, so
+        * this is a plain narrowing cast: any bits that do not fit in
+        * an unsigned int are dropped.
+        */
+       return (unsigned int)user_features;
+}
+
 /*
  * userland asks for a certain API version and we return which bits
  * and ioctl commands are implemented in this kernel for such API
@@ -1202,19 +1292,21 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
        ret = -EFAULT;
        if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
                goto out;
-       if (uffdio_api.api != UFFD_API || uffdio_api.features) {
+       if (uffdio_api.api != UFFD_API ||
+           (uffdio_api.features & ~UFFD_API_FEATURES)) {
                memset(&uffdio_api, 0, sizeof(uffdio_api));
                if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                        goto out;
                ret = -EINVAL;
                goto out;
        }
-       uffdio_api.features = UFFD_API_FEATURES;
+       uffdio_api.features &= UFFD_API_FEATURES;
        uffdio_api.ioctls = UFFD_API_IOCTLS;
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                goto out;
        ctx->state = UFFD_STATE_RUNNING;
+       ctx->features = uffd_ctx_features(uffdio_api.features);
        ret = 0;
 out:
        return ret;
@@ -1301,6 +1393,7 @@ static void init_once_userfaultfd_ctx(void *mem)
 
        init_waitqueue_head(&ctx->fault_pending_wqh);
        init_waitqueue_head(&ctx->fault_wqh);
+       init_waitqueue_head(&ctx->event_wqh);
        init_waitqueue_head(&ctx->fd_wqh);
        seqcount_init(&ctx->refile_seq);
 }
@@ -1341,6 +1434,7 @@ static struct file *userfaultfd_file_create(int flags)
 
        atomic_set(&ctx->refcount, 1);
        ctx->flags = flags;
+       ctx->features = 0;
        ctx->state = UFFD_STATE_WAIT_API;
        ctx->released = false;
        ctx->mm = current->mm;