aio: add missing smp_rmb() in read_events_ring

[platform/adaptation/renesas_rcar/renesas_kernel.git] / fs / aio.c
diff --git a/fs/aio.c b/fs/aio.c

index 062a5f6..c4b6b82 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -52,7 +52,8 @@
  struct aio_ring {
         unsigned        id;     /* kernel internal index number */
         unsigned        nr;     /* number of io_events */
-       unsigned        head;
+       unsigned        head;   /* Written to by userland or under ring_lock
+                                * mutex by aio_read_events_ring(). */
         unsigned        tail;
  
         unsigned        magic;
@@ -111,6 +112,11 @@ struct kioctx {
  
         struct work_struct      free_work;
  
+       /*
+        * signals when all in-flight requests are done
+        */
+       struct completion *requests_done;
+
         struct {
                 /*
                  * This counts the number of available slots in the ringbuffer,
@@ -135,6 +141,7 @@ struct kioctx {
  
         struct {
                 unsigned        tail;
+               unsigned        completed_events;
                 spinlock_t      completion_lock;
         } ____cacheline_aligned_in_smp;
  
@@ -243,6 +250,11 @@ static void aio_free_ring(struct kioctx *ctx)
  {
         int i;
  
+       /* Disconnect the kioctx from the ring file.  This prevents future
+        * accesses to the kioctx from page migration.
+        */
+       put_aio_ring_file(ctx);
+
         for (i = 0; i < ctx->nr_pages; i++) {
                 struct page *page;
                 pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -254,8 +266,6 @@ static void aio_free_ring(struct kioctx *ctx)
                 put_page(page);
         }
  
-       put_aio_ring_file(ctx);
-
         if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
                 kfree(ctx->ring_pages);
                 ctx->ring_pages = NULL;
@@ -283,29 +293,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
  {
         struct kioctx *ctx;
         unsigned long flags;
+       pgoff_t idx;
         int rc;
  
         rc = 0;
  
-       /* Make sure the old page hasn't already been changed */
+       /* mapping->private_lock here protects against the kioctx teardown.  */
         spin_lock(&mapping->private_lock);
         ctx = mapping->private_data;
-       if (ctx) {
-               pgoff_t idx;
-               spin_lock_irqsave(&ctx->completion_lock, flags);
-               idx = old->index;
-               if (idx < (pgoff_t)ctx->nr_pages) {
-                       if (ctx->ring_pages[idx] != old)
-                               rc = -EAGAIN;
-               } else
-                       rc = -EINVAL;
-               spin_unlock_irqrestore(&ctx->completion_lock, flags);
+       if (!ctx) {
+               rc = -EINVAL;
+               goto out;
+       }
+
+       /* Try to take the ring_lock mutex.  This prevents aio_read_events()
+        * from writing to the ring's head, and prevents page migration from
+        * mucking in a partially initialized kioctx.
+        */
+       if (!mutex_trylock(&ctx->ring_lock)) {
+               rc = -EAGAIN;
+               goto out;
+       }
+
+       idx = old->index;
+       if (idx < (pgoff_t)ctx->nr_pages) {
+               /* Make sure the old page hasn't already been changed */
+               if (ctx->ring_pages[idx] != old)
+                       rc = -EAGAIN;
         } else
                 rc = -EINVAL;
-       spin_unlock(&mapping->private_lock);
  
         if (rc != 0)
-               return rc;
+               goto out_unlock;
  
         /* Writeback must be complete */
         BUG_ON(PageWriteback(old));
@@ -314,38 +333,26 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
         rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
         if (rc != MIGRATEPAGE_SUCCESS) {
                 put_page(new);
-               return rc;
+               goto out_unlock;
         }
  
-       /* We can potentially race against kioctx teardown here.  Use the
-        * address_space's private data lock to protect the mapping's
-        * private_data.
+       /* Take completion_lock to prevent other writes to the ring buffer
+        * while the old page is copied to the new.  This prevents new
+        * events from being lost.
          */
-       spin_lock(&mapping->private_lock);
-       ctx = mapping->private_data;
-       if (ctx) {
-               pgoff_t idx;
-               spin_lock_irqsave(&ctx->completion_lock, flags);
-               migrate_page_copy(new, old);
-               idx = old->index;
-               if (idx < (pgoff_t)ctx->nr_pages) {
-                       /* And only do the move if things haven't changed */
-                       if (ctx->ring_pages[idx] == old)
-                               ctx->ring_pages[idx] = new;
-                       else
-                               rc = -EAGAIN;
-               } else
-                       rc = -EINVAL;
-               spin_unlock_irqrestore(&ctx->completion_lock, flags);
-       } else
-               rc = -EBUSY;
-       spin_unlock(&mapping->private_lock);
+       spin_lock_irqsave(&ctx->completion_lock, flags);
+       migrate_page_copy(new, old);
+       BUG_ON(ctx->ring_pages[idx] != old);
+       ctx->ring_pages[idx] = new;
+       spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
-       if (rc == MIGRATEPAGE_SUCCESS)
-               put_page(old);
-       else
-               put_page(new);
+       /* The old page is no longer accessible. */
+       put_page(old);
  
+out_unlock:
+       mutex_unlock(&ctx->ring_lock);
+out:
+       spin_unlock(&mapping->private_lock);
         return rc;
  }
  #endif
@@ -380,7 +387,7 @@ static int aio_setup_ring(struct kioctx *ctx)
         file = aio_private_file(ctx, nr_pages);
         if (IS_ERR(file)) {
                 ctx->aio_ring_file = NULL;
-               return -EAGAIN;
+               return -ENOMEM;
         }
  
         ctx->aio_ring_file = file;
@@ -415,7 +422,7 @@ static int aio_setup_ring(struct kioctx *ctx)
  
         if (unlikely(i != nr_pages)) {
                 aio_free_ring(ctx);
-               return -EAGAIN;
+               return -ENOMEM;
         }
  
         ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -429,7 +436,7 @@ static int aio_setup_ring(struct kioctx *ctx)
         if (IS_ERR((void *)ctx->mmap_base)) {
                 ctx->mmap_size = 0;
                 aio_free_ring(ctx);
-               return -EAGAIN;
+               return -ENOMEM;
         }
  
         pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
@@ -507,6 +514,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
  {
         struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
  
+       /* At this point we know that there are no in-flight requests */
+       if (ctx->requests_done)
+               complete(ctx->requests_done);
+
         INIT_WORK(&ctx->free_work, free_ioctx);
         schedule_work(&ctx->free_work);
  }
@@ -556,6 +567,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
                                         rcu_read_unlock();
                                         spin_unlock(&mm->ioctx_lock);
  
+                                       /* While kioctx setup is in progress,
+                                        * ->ring_lock protects us from page
+                                        * migration changing ring_pages.
+                                        */
                                         ring = kmap_atomic(ctx->ring_pages[0]);
                                         ring->id = ctx->id;
                                         kunmap_atomic(ring);
@@ -640,24 +655,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
  
         ctx->max_reqs = nr_events;
  
-       if (percpu_ref_init(&ctx->users, free_ioctx_users))
-               goto err;
-
-       if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
-               goto err;
-
         spin_lock_init(&ctx->ctx_lock);
         spin_lock_init(&ctx->completion_lock);
         mutex_init(&ctx->ring_lock);
+       /* Protect against page migration throughout kioctx setup by keeping
+        * the ring_lock mutex held until setup is complete. */
+       mutex_lock(&ctx->ring_lock);
         init_waitqueue_head(&ctx->wait);
  
         INIT_LIST_HEAD(&ctx->active_reqs);
  
+       if (percpu_ref_init(&ctx->users, free_ioctx_users))
+               goto err;
+
+       if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+               goto err;
+
         ctx->cpu = alloc_percpu(struct kioctx_cpu);
         if (!ctx->cpu)
                 goto err;
  
-       if (aio_setup_ring(ctx) < 0)
+       err = aio_setup_ring(ctx);
+       if (err < 0)
                 goto err;
  
         atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
@@ -683,6 +702,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
         if (err)
                 goto err_cleanup;
  
+       /* Release the ring_lock mutex now that all setup is complete. */
+       mutex_unlock(&ctx->ring_lock);
+
         pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
                  ctx, ctx->user_id, mm, ctx->nr_events);
         return ctx;
@@ -692,6 +714,7 @@ err_cleanup:
  err_ctx:
         aio_free_ring(ctx);
  err:
+       mutex_unlock(&ctx->ring_lock);
         free_percpu(ctx->cpu);
         free_percpu(ctx->reqs.pcpu_count);
         free_percpu(ctx->users.pcpu_count);
@@ -705,7 +728,8 @@ err:
   *     when the processes owning a context have all exited to encourage
   *     the rapid destruction of the kioctx.
   */
-static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+               struct completion *requests_done)
  {
         if (!atomic_xchg(&ctx->dead, 1)) {
                 struct kioctx_table *table;
@@ -734,7 +758,11 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
                 if (ctx->mmap_size)
                         vm_munmap(ctx->mmap_base, ctx->mmap_size);
  
+               ctx->requests_done = requests_done;
                 percpu_ref_kill(&ctx->users);
+       } else {
+               if (requests_done)
+                       complete(requests_done);
         }
  }
  
@@ -796,23 +824,27 @@ void exit_aio(struct mm_struct *mm)
                  */
                 ctx->mmap_size = 0;
  
-               kill_ioctx(mm, ctx);
+               kill_ioctx(mm, ctx, NULL);
         }
  }
  
  static void put_reqs_available(struct kioctx *ctx, unsigned nr)
  {
         struct kioctx_cpu *kcpu;
+       unsigned long flags;
  
         preempt_disable();
         kcpu = this_cpu_ptr(ctx->cpu);
  
+       local_irq_save(flags);  /* NOTE(review): presumably because aio_complete() can reach here from IRQ context - confirm */
         kcpu->reqs_available += nr;
+       /* Move surplus to the shared ctx->reqs_available one req_batch at a time. */
         while (kcpu->reqs_available >= ctx->req_batch * 2) {
                 kcpu->reqs_available -= ctx->req_batch;
                 atomic_add(ctx->req_batch, &ctx->reqs_available);
         }
  
+       local_irq_restore(flags);
         preempt_enable();
  }
  
@@ -820,10 +852,12 @@ static bool get_reqs_available(struct kioctx *ctx)
  {
         struct kioctx_cpu *kcpu;
         bool ret = false;
+       unsigned long flags;
  
         preempt_disable();
         kcpu = this_cpu_ptr(ctx->cpu);
  
+       local_irq_save(flags);
         if (!kcpu->reqs_available) {
                 int old, avail = atomic_read(&ctx->reqs_available);
  
@@ -842,10 +876,73 @@ static bool get_reqs_available(struct kioctx *ctx)
         ret = true;
         kcpu->reqs_available--;
  out:
+       local_irq_restore(flags);
         preempt_enable();
         return ret;
  }
  
+/* refill_reqs_available
+ *     Updates the reqs_available reference counts used for tracking the
+ *     number of free slots in the completion ring.  This can be called
+ *     from aio_complete() (to optimistically update reqs_available) or,
+ *     via user_refill_reqs_available(), from aio_get_req() when it is out
+ *     of events.  It must be called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+                                  unsigned tail)
+{
+       unsigned events_in_ring, completed;
+
+       /* Clamp head since userland can write to it. */
+       head %= ctx->nr_events;
+       if (head <= tail)
+               events_in_ring = tail - head;
+       else
+               events_in_ring = ctx->nr_events - (head - tail);
+       /* Only completions beyond the events still in the ring are returned. */
+       completed = ctx->completed_events;
+       if (events_in_ring < completed)
+               completed -= events_in_ring;
+       else
+               completed = 0;
+
+       if (!completed)
+               return;
+
+       ctx->completed_events -= completed;
+       put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *     Called to refill reqs_available when aio_get_req() finds no request
+ *     slot available.  Takes ctx->completion_lock itself.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+       spin_lock_irq(&ctx->completion_lock);
+       if (ctx->completed_events) {
+               struct aio_ring *ring;
+               unsigned head;
+
+               /* Access of ring->head may race with aio_read_events_ring()
+                * here, but that's okay: whether we read the old version or
+                * the new version, either will be valid.  The important
+                * part is that head cannot pass tail since we prevent
+                * aio_complete() from updating tail by holding
+                * ctx->completion_lock.  Even if head is invalid, the check
+                * against ctx->completed_events below will make sure we do the
+                * safe/right thing.
+                */
+               ring = kmap_atomic(ctx->ring_pages[0]);
+               head = ring->head;
+               kunmap_atomic(ring);
+
+               refill_reqs_available(ctx, head, ctx->tail);
+       }
+
+       spin_unlock_irq(&ctx->completion_lock);
+}
+
  /* aio_get_req
   *     Allocate a slot for an aio request.
   * Returns NULL if no requests are free.
@@ -854,8 +951,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
  {
         struct kiocb *req;
  
-       if (!get_reqs_available(ctx))
-               return NULL;
+       if (!get_reqs_available(ctx)) {
+               user_refill_reqs_available(ctx);
+               if (!get_reqs_available(ctx))
+                       return NULL;
+       }
  
         req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
         if (unlikely(!req))
@@ -914,8 +1014,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
         struct kioctx   *ctx = iocb->ki_ctx;
         struct aio_ring *ring;
         struct io_event *ev_page, *event;
+       unsigned tail, pos, head;
         unsigned long   flags;
-       unsigned tail, pos;
  
         /*
          * Special case handling for sync iocbs:
@@ -976,10 +1076,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
         ctx->tail = tail;
  
         ring = kmap_atomic(ctx->ring_pages[0]);
+       head = ring->head;
         ring->tail = tail;
         kunmap_atomic(ring);
         flush_dcache_page(ctx->ring_pages[0]);
  
+       ctx->completed_events++;
+       if (ctx->completed_events > 1)
+               refill_reqs_available(ctx, head, tail);
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
         pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1024,16 +1128,26 @@ static long aio_read_events_ring(struct kioctx *ctx,
  
         mutex_lock(&ctx->ring_lock);
  
+       /* Access to ->ring_pages here is protected by ctx->ring_lock. */
         ring = kmap_atomic(ctx->ring_pages[0]);
         head = ring->head;
         tail = ring->tail;
         kunmap_atomic(ring);
  
+       /*
+        * Ensure that once we've read the current tail pointer, that
+        * we also see the events that were stored up to the tail.
+        */
+       smp_rmb();
+
         pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
  
         if (head == tail)
                 goto out;
  
+       head %= ctx->nr_events;
+       tail %= ctx->nr_events;
+
         while (ret < nr) {
                 long avail;
                 struct io_event *ev;
@@ -1072,8 +1186,6 @@ static long aio_read_events_ring(struct kioctx *ctx,
         flush_dcache_page(ctx->ring_pages[0]);
  
         pr_debug("%li  h%u t%u\n", ret, head, tail);
-
-       put_reqs_available(ctx, ret);
  out:
         mutex_unlock(&ctx->ring_lock);
  
@@ -1171,7 +1283,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
         if (!IS_ERR(ioctx)) {
                 ret = put_user(ioctx->user_id, ctxp);
                 if (ret)
-                       kill_ioctx(current->mm, ioctx);
+                       kill_ioctx(current->mm, ioctx, NULL);
                 percpu_ref_put(&ioctx->users);
         }
  
@@ -1189,8 +1301,22 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
  {
         struct kioctx *ioctx = lookup_ioctx(ctx);
         if (likely(NULL != ioctx)) {
-               kill_ioctx(current->mm, ioctx);
+               struct completion requests_done =
+                       COMPLETION_INITIALIZER_ONSTACK(requests_done);
+
+               /* Pass requests_done to kill_ioctx() where it can be set
+                * in a thread-safe way. If we try to set it here then we have
+                * a race condition if two io_destroy() calls run simultaneously.
+                */
+               kill_ioctx(current->mm, ioctx, &requests_done);
                 percpu_ref_put(&ioctx->users);
+
+               /* Wait until all IO for the context is done. Otherwise the
+                * kernel keeps using user-space buffers even though the user
+                * thinks the context is destroyed.
+                */
+               wait_for_completion(&requests_done);
+
                 return 0;
         }
         pr_debug("EINVAL: io_destroy: invalid context id\n");
@@ -1285,10 +1411,8 @@ rw_common:
                                                 &iovec, compat)
                         : aio_setup_single_vector(req, rw, buf, &nr_segs,
                                                   iovec);
-               if (ret)
-                       return ret;
-
-               ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+               if (!ret)
+                       ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
                 if (ret < 0) {
                         if (iovec != &inline_vec)
                                 kfree(iovec);