struct aio_ring {
unsigned id; /* kernel internal index number */
unsigned nr; /* number of io_events */
- unsigned head;
+ unsigned head; /* Written to by userland or under ring_lock
+ * mutex by aio_read_events_ring(). */
unsigned tail;
unsigned magic;
struct work_struct free_work;
+ /*
+ * signals when all in-flight requests are done
+ */
+ struct completion *requests_done;
+
struct {
/*
* This counts the number of available slots in the ringbuffer,
struct {
unsigned tail;
+ unsigned completed_events;
spinlock_t completion_lock;
} ____cacheline_aligned_in_smp;
{
int i;
+ /* Disconnect the kiotx from the ring file. This prevents future
+ * accesses to the kioctx from page migration.
+ */
+ put_aio_ring_file(ctx);
+
for (i = 0; i < ctx->nr_pages; i++) {
struct page *page;
pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
put_page(page);
}
- put_aio_ring_file(ctx);
-
if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
kfree(ctx->ring_pages);
ctx->ring_pages = NULL;
{
struct kioctx *ctx;
unsigned long flags;
+ pgoff_t idx;
int rc;
rc = 0;
- /* Make sure the old page hasn't already been changed */
+ /* mapping->private_lock here protects against the kioctx teardown. */
spin_lock(&mapping->private_lock);
ctx = mapping->private_data;
- if (ctx) {
- pgoff_t idx;
- spin_lock_irqsave(&ctx->completion_lock, flags);
- idx = old->index;
- if (idx < (pgoff_t)ctx->nr_pages) {
- if (ctx->ring_pages[idx] != old)
- rc = -EAGAIN;
- } else
- rc = -EINVAL;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ if (!ctx) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* The ring_lock mutex. The prevents aio_read_events() from writing
+ * to the ring's head, and prevents page migration from mucking in
+ * a partially initialized kiotx.
+ */
+ if (!mutex_trylock(&ctx->ring_lock)) {
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ idx = old->index;
+ if (idx < (pgoff_t)ctx->nr_pages) {
+ /* Make sure the old page hasn't already been changed */
+ if (ctx->ring_pages[idx] != old)
+ rc = -EAGAIN;
} else
rc = -EINVAL;
- spin_unlock(&mapping->private_lock);
if (rc != 0)
- return rc;
+ goto out_unlock;
/* Writeback must be complete */
BUG_ON(PageWriteback(old));
rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
if (rc != MIGRATEPAGE_SUCCESS) {
put_page(new);
- return rc;
+ goto out_unlock;
}
- /* We can potentially race against kioctx teardown here. Use the
- * address_space's private data lock to protect the mapping's
- * private_data.
+ /* Take completion_lock to prevent other writes to the ring buffer
+ * while the old page is copied to the new. This prevents new
+ * events from being lost.
*/
- spin_lock(&mapping->private_lock);
- ctx = mapping->private_data;
- if (ctx) {
- pgoff_t idx;
- spin_lock_irqsave(&ctx->completion_lock, flags);
- migrate_page_copy(new, old);
- idx = old->index;
- if (idx < (pgoff_t)ctx->nr_pages) {
- /* And only do the move if things haven't changed */
- if (ctx->ring_pages[idx] == old)
- ctx->ring_pages[idx] = new;
- else
- rc = -EAGAIN;
- } else
- rc = -EINVAL;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- } else
- rc = -EBUSY;
- spin_unlock(&mapping->private_lock);
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ migrate_page_copy(new, old);
+ BUG_ON(ctx->ring_pages[idx] != old);
+ ctx->ring_pages[idx] = new;
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
- if (rc == MIGRATEPAGE_SUCCESS)
- put_page(old);
- else
- put_page(new);
+ /* The old page is no longer accessible. */
+ put_page(old);
+out_unlock:
+ mutex_unlock(&ctx->ring_lock);
+out:
+ spin_unlock(&mapping->private_lock);
return rc;
}
#endif
file = aio_private_file(ctx, nr_pages);
if (IS_ERR(file)) {
ctx->aio_ring_file = NULL;
- return -EAGAIN;
+ return -ENOMEM;
}
ctx->aio_ring_file = file;
if (unlikely(i != nr_pages)) {
aio_free_ring(ctx);
- return -EAGAIN;
+ return -ENOMEM;
}
ctx->mmap_size = nr_pages * PAGE_SIZE;
if (IS_ERR((void *)ctx->mmap_base)) {
ctx->mmap_size = 0;
aio_free_ring(ctx);
- return -EAGAIN;
+ return -ENOMEM;
}
pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
{
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
+ /* At this point we know that there are no any in-flight requests */
+ if (ctx->requests_done)
+ complete(ctx->requests_done);
+
INIT_WORK(&ctx->free_work, free_ioctx);
schedule_work(&ctx->free_work);
}
rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
+ /* While kioctx setup is in progress,
+ * we are protected from page migration
+ * changes ring_pages by ->ring_lock.
+ */
ring = kmap_atomic(ctx->ring_pages[0]);
ring->id = ctx->id;
kunmap_atomic(ring);
ctx->max_reqs = nr_events;
- if (percpu_ref_init(&ctx->users, free_ioctx_users))
- goto err;
-
- if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
- goto err;
-
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->completion_lock);
mutex_init(&ctx->ring_lock);
+ /* Protect against page migration throughout kiotx setup by keeping
+ * the ring_lock mutex held until setup is complete. */
+ mutex_lock(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
+ if (percpu_ref_init(&ctx->users, free_ioctx_users))
+ goto err;
+
+ if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+ goto err;
+
ctx->cpu = alloc_percpu(struct kioctx_cpu);
if (!ctx->cpu)
goto err;
- if (aio_setup_ring(ctx) < 0)
+ err = aio_setup_ring(ctx);
+ if (err < 0)
goto err;
atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
if (err)
goto err_cleanup;
+ /* Release the ring_lock mutex now that all setup is complete. */
+ mutex_unlock(&ctx->ring_lock);
+
pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, mm, ctx->nr_events);
return ctx;
err_ctx:
aio_free_ring(ctx);
err:
+ mutex_unlock(&ctx->ring_lock);
free_percpu(ctx->cpu);
free_percpu(ctx->reqs.pcpu_count);
free_percpu(ctx->users.pcpu_count);
* when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+ struct completion *requests_done)
{
if (!atomic_xchg(&ctx->dead, 1)) {
struct kioctx_table *table;
if (ctx->mmap_size)
vm_munmap(ctx->mmap_base, ctx->mmap_size);
+ ctx->requests_done = requests_done;
percpu_ref_kill(&ctx->users);
+ } else {
+ if (requests_done)
+ complete(requests_done);
}
}
*/
ctx->mmap_size = 0;
- kill_ioctx(mm, ctx);
+ kill_ioctx(mm, ctx, NULL);
}
}
static void put_reqs_available(struct kioctx *ctx, unsigned nr)
{
struct kioctx_cpu *kcpu;
+ unsigned long flags;
preempt_disable();
kcpu = this_cpu_ptr(ctx->cpu);
+ local_irq_save(flags);
kcpu->reqs_available += nr;
+
while (kcpu->reqs_available >= ctx->req_batch * 2) {
kcpu->reqs_available -= ctx->req_batch;
atomic_add(ctx->req_batch, &ctx->reqs_available);
}
+ local_irq_restore(flags);
preempt_enable();
}
{
struct kioctx_cpu *kcpu;
bool ret = false;
+ unsigned long flags;
preempt_disable();
kcpu = this_cpu_ptr(ctx->cpu);
+ local_irq_save(flags);
if (!kcpu->reqs_available) {
int old, avail = atomic_read(&ctx->reqs_available);
ret = true;
kcpu->reqs_available--;
out:
+ local_irq_restore(flags);
preempt_enable();
return ret;
}
+/* refill_reqs_available
+ * Updates the reqs_available reference counts used for tracking the
+ * number of free slots in the completion ring. This can be called
+ * from aio_complete() (to optimistically update reqs_available) or
+ * from aio_get_req() (the we're out of events case). It must be
+ * called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+ unsigned tail)
+{
+ unsigned events_in_ring, completed;
+
+ /* Clamp head since userland can write to it. */
+ head %= ctx->nr_events;
+ if (head <= tail)
+ events_in_ring = tail - head;
+ else
+ events_in_ring = ctx->nr_events - (head - tail);
+
+ completed = ctx->completed_events;
+ if (events_in_ring < completed)
+ completed -= events_in_ring;
+ else
+ completed = 0;
+
+ if (!completed)
+ return;
+
+ ctx->completed_events -= completed;
+ put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ * Called to refill reqs_available when aio_get_req() encounters an
+ * out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+ spin_lock_irq(&ctx->completion_lock);
+ if (ctx->completed_events) {
+ struct aio_ring *ring;
+ unsigned head;
+
+ /* Access of ring->head may race with aio_read_events_ring()
+ * here, but that's okay since whether we read the old version
+ * or the new version, and either will be valid. The important
+ * part is that head cannot pass tail since we prevent
+ * aio_complete() from updating tail by holding
+ * ctx->completion_lock. Even if head is invalid, the check
+ * against ctx->completed_events below will make sure we do the
+ * safe/right thing.
+ */
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
+ kunmap_atomic(ring);
+
+ refill_reqs_available(ctx, head, ctx->tail);
+ }
+
+ spin_unlock_irq(&ctx->completion_lock);
+}
+
/* aio_get_req
* Allocate a slot for an aio request.
* Returns NULL if no requests are free.
{
struct kiocb *req;
- if (!get_reqs_available(ctx))
- return NULL;
+ if (!get_reqs_available(ctx)) {
+ user_refill_reqs_available(ctx);
+ if (!get_reqs_available(ctx))
+ return NULL;
+ }
req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
if (unlikely(!req))
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
+ unsigned tail, pos, head;
unsigned long flags;
- unsigned tail, pos;
/*
* Special case handling for sync iocbs:
ctx->tail = tail;
ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
ring->tail = tail;
kunmap_atomic(ring);
flush_dcache_page(ctx->ring_pages[0]);
+ ctx->completed_events++;
+ if (ctx->completed_events > 1)
+ refill_reqs_available(ctx, head, tail);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
pr_debug("added to ring %p at [%u]\n", iocb, tail);
mutex_lock(&ctx->ring_lock);
+ /* Access to ->ring_pages here is protected by ctx->ring_lock. */
ring = kmap_atomic(ctx->ring_pages[0]);
head = ring->head;
tail = ring->tail;
kunmap_atomic(ring);
+ /*
+ * Ensure that once we've read the current tail pointer, that
+ * we also see the events that were stored up to the tail.
+ */
+ smp_rmb();
+
pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
if (head == tail)
goto out;
+ head %= ctx->nr_events;
+ tail %= ctx->nr_events;
+
while (ret < nr) {
long avail;
struct io_event *ev;
flush_dcache_page(ctx->ring_pages[0]);
pr_debug("%li h%u t%u\n", ret, head, tail);
-
- put_reqs_available(ctx, ret);
out:
mutex_unlock(&ctx->ring_lock);
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
- kill_ioctx(current->mm, ioctx);
+ kill_ioctx(current->mm, ioctx, NULL);
percpu_ref_put(&ioctx->users);
}
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- kill_ioctx(current->mm, ioctx);
+ struct completion requests_done =
+ COMPLETION_INITIALIZER_ONSTACK(requests_done);
+
+ /* Pass requests_done to kill_ioctx() where it can be set
+ * in a thread-safe way. If we try to set it here then we have
+ * a race condition if two io_destroy() called simultaneously.
+ */
+ kill_ioctx(current->mm, ioctx, &requests_done);
percpu_ref_put(&ioctx->users);
+
+ /* Wait until all IO for the context are done. Otherwise kernel
+ * keep using user-space buffers even if user thinks the context
+ * is destroyed.
+ */
+ wait_for_completion(&requests_done);
+
return 0;
}
pr_debug("EINVAL: io_destroy: invalid context id\n");
&iovec, compat)
: aio_setup_single_vector(req, rw, buf, &nr_segs,
iovec);
- if (ret)
- return ret;
-
- ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ if (!ret)
+ ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
if (ret < 0) {
if (iovec != &inline_vec)
kfree(iovec);