fs/eventfd.c

   1 /*
   2  *  fs/eventfd.c
   3  *
   4  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
   5  *
   6  */
   7
   8 #include <linux/file.h>
   9 #include <linux/poll.h>
  10 #include <linux/init.h>
  11 #include <linux/fs.h>
  12 #include <linux/sched/signal.h>
  13 #include <linux/kernel.h>
  14 #include <linux/slab.h>
  15 #include <linux/list.h>
  16 #include <linux/spinlock.h>
  17 #include <linux/anon_inodes.h>
  18 #include <linux/syscalls.h>
  19 #include <linux/export.h>
  20 #include <linux/kref.h>
  21 #include <linux/eventfd.h>
  22 #include <linux/proc_fs.h>
  23 #include <linux/seq_file.h>
  24
  25 struct eventfd_ctx {
  26         struct kref kref;
  27         wait_queue_head_t wqh;
  28         /*
  29          * Every time that a write(2) is performed on an eventfd, the
  30          * value of the __u64 being written is added to "count" and a
  31          * wakeup is performed on "wqh". A read(2) will return the "count"
  32          * value to userspace, and will reset "count" to zero. The kernel
  33          * side eventfd_signal() also, adds to the "count" counter and
  34          * issue a wakeup.
  35          */
  36         __u64 count;
  37         unsigned int flags;
  38 };
  39
  40 /**
  41  * eventfd_signal - Adds @n to the eventfd counter.
  42  * @ctx: [in] Pointer to the eventfd context.
  43  * @n: [in] Value of the counter to be added to the eventfd internal counter.
  44  *          The value cannot be negative.
  45  *
  46  * This function is supposed to be called by the kernel in paths that do not
  47  * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
  48  * value, and we signal this as overflow condition by returning a EPOLLERR
  49  * to poll(2).
  50  *
  51  * Returns the amount by which the counter was incremented.  This will be less
  52  * than @n if the counter has overflowed.
  53  */
  54 __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
  55 {
  56         unsigned long flags;
  57
  58         spin_lock_irqsave(&ctx->wqh.lock, flags);
  59         if (ULLONG_MAX - ctx->count < n)
  60                 n = ULLONG_MAX - ctx->count;
  61         ctx->count += n;
  62         if (waitqueue_active(&ctx->wqh))
  63                 wake_up_locked_poll(&ctx->wqh, EPOLLIN);
  64         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
  65
  66         return n;
  67 }
  68 EXPORT_SYMBOL_GPL(eventfd_signal);
  69
  70 static void eventfd_free_ctx(struct eventfd_ctx *ctx)
  71 {
  72         kfree(ctx);
  73 }
  74
  75 static void eventfd_free(struct kref *kref)
  76 {
  77         struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
  78
  79         eventfd_free_ctx(ctx);
  80 }
  81
  82 /**
  83  * eventfd_ctx_put - Releases a reference to the internal eventfd context.
  84  * @ctx: [in] Pointer to eventfd context.
  85  *
  86  * The eventfd context reference must have been previously acquired either
  87  * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
  88  */
  89 void eventfd_ctx_put(struct eventfd_ctx *ctx)
  90 {
  91         kref_put(&ctx->kref, eventfd_free);
  92 }
  93 EXPORT_SYMBOL_GPL(eventfd_ctx_put);
  94
  95 static int eventfd_release(struct inode *inode, struct file *file)
  96 {
  97         struct eventfd_ctx *ctx = file->private_data;
  98
  99         wake_up_poll(&ctx->wqh, EPOLLHUP);
 100         eventfd_ctx_put(ctx);
 101         return 0;
 102 }
 103
 104 static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 105 {
 106         struct eventfd_ctx *ctx = file->private_data;
 107         __poll_t events = 0;
 108         u64 count;
 109
 110         poll_wait(file, &ctx->wqh, wait);
 111
 112         /*
 113          * All writes to ctx->count occur within ctx->wqh.lock.  This read
 114          * can be done outside ctx->wqh.lock because we know that poll_wait
 115          * takes that lock (through add_wait_queue) if our caller will sleep.
 116          *
 117          * The read _can_ therefore seep into add_wait_queue's critical
 118          * section, but cannot move above it!  add_wait_queue's spin_lock acts
 119          * as an acquire barrier and ensures that the read be ordered properly
 120          * against the writes.  The following CAN happen and is safe:
 121          *
 122          *     poll                               write
 123          *     -----------------                  ------------
 124          *     lock ctx->wqh.lock (in poll_wait)
 125          *     count = ctx->count
 126          *     __add_wait_queue
 127          *     unlock ctx->wqh.lock
 128          *                                        lock ctx->qwh.lock
 129          *                                        ctx->count += n
 130          *                                        if (waitqueue_active)
 131          *                                          wake_up_locked_poll
 132          *                                        unlock ctx->qwh.lock
 133          *     eventfd_poll returns 0
 134          *
 135          * but the following, which would miss a wakeup, cannot happen:
 136          *
 137          *     poll                               write
 138          *     -----------------                  ------------
 139          *     count = ctx->count (INVALID!)
 140          *                                        lock ctx->qwh.lock
 141          *                                        ctx->count += n
 142          *                                        **waitqueue_active is false**
 143          *                                        **no wake_up_locked_poll!**
 144          *                                        unlock ctx->qwh.lock
 145          *     lock ctx->wqh.lock (in poll_wait)
 146          *     __add_wait_queue
 147          *     unlock ctx->wqh.lock
 148          *     eventfd_poll returns 0
 149          */
 150         count = READ_ONCE(ctx->count);
 151
 152         if (count > 0)
 153                 events |= EPOLLIN;
 154         if (count == ULLONG_MAX)
 155                 events |= EPOLLERR;
 156         if (ULLONG_MAX - 1 > count)
 157                 events |= EPOLLOUT;
 158
 159         return events;
 160 }
 161
 162 static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
 163 {
 164         *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
 165         ctx->count -= *cnt;
 166 }
 167
 168 /**
 169  * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
 170  * @ctx: [in] Pointer to eventfd context.
 171  * @wait: [in] Wait queue to be removed.
 172  * @cnt: [out] Pointer to the 64-bit counter value.
 173  *
 174  * Returns %0 if successful, or the following error codes:
 175  *
 176  * -EAGAIN      : The operation would have blocked.
 177  *
 178  * This is used to atomically remove a wait queue entry from the eventfd wait
 179  * queue head, and read/reset the counter value.
 180  */
 181 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 182                                   __u64 *cnt)
 183 {
 184         unsigned long flags;
 185
 186         spin_lock_irqsave(&ctx->wqh.lock, flags);
 187         eventfd_ctx_do_read(ctx, cnt);
 188         __remove_wait_queue(&ctx->wqh, wait);
 189         if (*cnt != 0 && waitqueue_active(&ctx->wqh))
 190                 wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
 191         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 192
 193         return *cnt != 0 ? 0 : -EAGAIN;
 194 }
 195 EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
 196
 197 static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 198                             loff_t *ppos)
 199 {
 200         struct eventfd_ctx *ctx = file->private_data;
 201         ssize_t res;
 202         __u64 ucnt = 0;
 203         DECLARE_WAITQUEUE(wait, current);
 204
 205         if (count < sizeof(ucnt))
 206                 return -EINVAL;
 207
 208         spin_lock_irq(&ctx->wqh.lock);
 209         res = -EAGAIN;
 210         if (ctx->count > 0)
 211                 res = sizeof(ucnt);
 212         else if (!(file->f_flags & O_NONBLOCK)) {
 213                 __add_wait_queue(&ctx->wqh, &wait);
 214                 for (;;) {
 215                         set_current_state(TASK_INTERRUPTIBLE);
 216                         if (ctx->count > 0) {
 217                                 res = sizeof(ucnt);
 218                                 break;
 219                         }
 220                         if (signal_pending(current)) {
 221                                 res = -ERESTARTSYS;
 222                                 break;
 223                         }
 224                         spin_unlock_irq(&ctx->wqh.lock);
 225                         schedule();
 226                         spin_lock_irq(&ctx->wqh.lock);
 227                 }
 228                 __remove_wait_queue(&ctx->wqh, &wait);
 229                 __set_current_state(TASK_RUNNING);
 230         }
 231         if (likely(res > 0)) {
 232                 eventfd_ctx_do_read(ctx, &ucnt);
 233                 if (waitqueue_active(&ctx->wqh))
 234                         wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
 235         }
 236         spin_unlock_irq(&ctx->wqh.lock);
 237
 238         if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
 239                 return -EFAULT;
 240
 241         return res;
 242 }
 243
 244 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
 245                              loff_t *ppos)
 246 {
 247         struct eventfd_ctx *ctx = file->private_data;
 248         ssize_t res;
 249         __u64 ucnt;
 250         DECLARE_WAITQUEUE(wait, current);
 251
 252         if (count < sizeof(ucnt))
 253                 return -EINVAL;
 254         if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
 255                 return -EFAULT;
 256         if (ucnt == ULLONG_MAX)
 257                 return -EINVAL;
 258         spin_lock_irq(&ctx->wqh.lock);
 259         res = -EAGAIN;
 260         if (ULLONG_MAX - ctx->count > ucnt)
 261                 res = sizeof(ucnt);
 262         else if (!(file->f_flags & O_NONBLOCK)) {
 263                 __add_wait_queue(&ctx->wqh, &wait);
 264                 for (res = 0;;) {
 265                         set_current_state(TASK_INTERRUPTIBLE);
 266                         if (ULLONG_MAX - ctx->count > ucnt) {
 267                                 res = sizeof(ucnt);
 268                                 break;
 269                         }
 270                         if (signal_pending(current)) {
 271                                 res = -ERESTARTSYS;
 272                                 break;
 273                         }
 274                         spin_unlock_irq(&ctx->wqh.lock);
 275                         schedule();
 276                         spin_lock_irq(&ctx->wqh.lock);
 277                 }
 278                 __remove_wait_queue(&ctx->wqh, &wait);
 279                 __set_current_state(TASK_RUNNING);
 280         }
 281         if (likely(res > 0)) {
 282                 ctx->count += ucnt;
 283                 if (waitqueue_active(&ctx->wqh))
 284                         wake_up_locked_poll(&ctx->wqh, EPOLLIN);
 285         }
 286         spin_unlock_irq(&ctx->wqh.lock);
 287
 288         return res;
 289 }
 290
 291 #ifdef CONFIG_PROC_FS
 292 static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 293 {
 294         struct eventfd_ctx *ctx = f->private_data;
 295
 296         spin_lock_irq(&ctx->wqh.lock);
 297         seq_printf(m, "eventfd-count: %16llx\n",
 298                    (unsigned long long)ctx->count);
 299         spin_unlock_irq(&ctx->wqh.lock);
 300 }
 301 #endif
 302
 303 static const struct file_operations eventfd_fops = {
 304 #ifdef CONFIG_PROC_FS
 305         .show_fdinfo    = eventfd_show_fdinfo,
 306 #endif
 307         .release        = eventfd_release,
 308         .poll           = eventfd_poll,
 309         .read           = eventfd_read,
 310         .write          = eventfd_write,
 311         .llseek         = noop_llseek,
 312 };
 313
 314 /**
 315  * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 316  * @fd: [in] Eventfd file descriptor.
 317  *
 318  * Returns a pointer to the eventfd file structure in case of success, or the
 319  * following error pointer:
 320  *
 321  * -EBADF    : Invalid @fd file descriptor.
 322  * -EINVAL   : The @fd file descriptor is not an eventfd file.
 323  */
 324 struct file *eventfd_fget(int fd)
 325 {
 326         struct file *file;
 327
 328         file = fget(fd);
 329         if (!file)
 330                 return ERR_PTR(-EBADF);
 331         if (file->f_op != &eventfd_fops) {
 332                 fput(file);
 333                 return ERR_PTR(-EINVAL);
 334         }
 335
 336         return file;
 337 }
 338 EXPORT_SYMBOL_GPL(eventfd_fget);
 339
 340 /**
 341  * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 342  * @fd: [in] Eventfd file descriptor.
 343  *
 344  * Returns a pointer to the internal eventfd context, otherwise the error
 345  * pointers returned by the following functions:
 346  *
 347  * eventfd_fget
 348  */
 349 struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 350 {
 351         struct eventfd_ctx *ctx;
 352         struct fd f = fdget(fd);
 353         if (!f.file)
 354                 return ERR_PTR(-EBADF);
 355         ctx = eventfd_ctx_fileget(f.file);
 356         fdput(f);
 357         return ctx;
 358 }
 359 EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
 360
 361 /**
 362  * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 363  * @file: [in] Eventfd file pointer.
 364  *
 365  * Returns a pointer to the internal eventfd context, otherwise the error
 366  * pointer:
 367  *
 368  * -EINVAL   : The @fd file descriptor is not an eventfd file.
 369  */
 370 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
 371 {
 372         struct eventfd_ctx *ctx;
 373
 374         if (file->f_op != &eventfd_fops)
 375                 return ERR_PTR(-EINVAL);
 376
 377         ctx = file->private_data;
 378         kref_get(&ctx->kref);
 379         return ctx;
 380 }
 381 EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 382
 383 static int do_eventfd(unsigned int count, int flags)
 384 {
 385         struct eventfd_ctx *ctx;
 386         int fd;
 387
 388         /* Check the EFD_* constants for consistency.  */
 389         BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 390         BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
 391
 392         if (flags & ~EFD_FLAGS_SET)
 393                 return -EINVAL;
 394
 395         ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 396         if (!ctx)
 397                 return -ENOMEM;
 398
 399         kref_init(&ctx->kref);
 400         init_waitqueue_head(&ctx->wqh);
 401         ctx->count = count;
 402         ctx->flags = flags;
 403
 404         fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
 405                               O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
 406         if (fd < 0)
 407                 eventfd_free_ctx(ctx);
 408
 409         return fd;
 410 }
 411
 412 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 413 {
 414         return do_eventfd(count, flags);
 415 }
 416
 417 SYSCALL_DEFINE1(eventfd, unsigned int, count)
 418 {
 419         return do_eventfd(count, 0);
 420 }
 421