int ops = atomic_xchg(&ev_fd->ops, 0);
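+ /*
+ * EPOLL_URING_WAKE marks this as an io_uring-originated wakeup, so
+ * eventfd/poll can detect and break recursive wakeup loops (e.g. an
+ * io_uring instance that polls the very eventfd it signals).
+ */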
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
- eventfd_signal(ev_fd->cq_ev_fd, 1);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
* it anyway.
*/
goto out;
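+ /* signal inline if allowed from this context, else defer to an RCU callback */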
if (likely(eventfd_signal_allowed())) {
- eventfd_signal(ev_fd->cq_ev_fd, 1);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
} else {
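+ /*
+ * Defer the signal to an RCU callback: take a ref for the callback
+ * and only queue it if the SIGNAL bit wasn't already pending.
+ */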
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
call_rcu(&ev_fd->rcu, io_eventfd_ops);
else
atomic_dec(&ev_fd->refs);
}
-static void __io_req_complete_put(struct io_kiocb *req)
+void io_req_complete_post(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
+
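+ /* post the CQE and drop our request reference under one CQ lock section */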
+ io_cq_lock(ctx);
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(ctx, req);
+
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
*/
if (req_ref_put_and_test(req)) {
- struct io_ring_ctx *ctx = req->ctx;
-
if (req->flags & IO_REQ_LINK_FLAGS) {
if (req->flags & IO_DISARM_MASK)
io_disarm_next(req);
wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
}
-}
-
-void __io_req_complete_post(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req->ctx, req);
- __io_req_complete_put(req);
-}
-
-void io_req_complete_post(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- io_cq_lock(ctx);
- __io_req_complete_post(req);
io_cq_unlock_post(ctx);
}
}
void io_req_complete_failed(struct io_kiocb *req, s32 res)
+ __must_hold(&ctx->uring_lock)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
+ lockdep_assert_held(&req->ctx->uring_lock);
+
req_set_fail(req);
io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
if (def->fail)
/* if not contended, grab and improve batching */
*locked = mutex_trylock(&(*ctx)->uring_lock);
percpu_ref_get(&(*ctx)->refs);
- }
+ } else if (!*locked)
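+ /* an earlier trylock may have failed; the lock can be free by now */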
+ *locked = mutex_trylock(&(*ctx)->uring_lock);
req->io_task_work.func(req, locked);
node = next;
count++;
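+ /*
+ * If we've been running for a while, flush pending completions and
+ * drop the ctx so other tasks can get CPU time.
+ */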
+ if (unlikely(need_resched())) {
+ ctx_flush_and_put(*ctx, locked);
+ *ctx = NULL;
+ cond_resched();
+ }
}
return count;
{
struct io_ring_ctx *ctx = req->ctx;
- if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
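+ /*
+ * The request can be executed and the ring freed as soon as it is on
+ * the llist, so pin the ctx for the wakeup below.
+ */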
+ percpu_ref_get(&ctx->refs);
+
+ if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
+ percpu_ref_put(&ctx->refs);
return;
+ }
/* need it for the following io_cqring_wake() */
smp_mb__after_atomic();
if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
io_move_task_work_from_local(ctx);
+ percpu_ref_put(&ctx->refs);
return;
}
if (ctx->has_evfd)
io_eventfd_signal(ctx);
__io_cqring_wake(ctx);
+ percpu_ref_put(&ctx->refs);
}
-static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
+void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
{
struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
}
}
-void io_req_task_work_add(struct io_kiocb *req)
-{
- __io_req_task_work_add(req, true);
-}
-
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{
struct llist_node *node;
return ret;
}
-static void io_req_tw_post(struct io_kiocb *req, bool *locked)
-{
- io_req_complete_post(req);
-}
-
-void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
-{
- io_req_set_res(req, res, cflags);
- req->io_task_work.func = io_req_tw_post;
- io_req_task_work_add(req);
-}
-
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
/* not needed for normal modes, but SQPOLL depends on it */
const struct io_op_def *def = &io_op_defs[req->opcode];
/* assign early for deferred execution for non-fixed file */
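+ /* prep may run more than once for a request; only grab the file once */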
- if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
req->file = io_file_get_normal(req, req->cqe.fd);
if (!def->prep_async)
return 0;
}
static __cold void io_drain_req(struct io_kiocb *req)
+ __must_hold(&ctx->uring_lock)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
}
spin_unlock(&ctx->completion_lock);
- ret = io_req_prep_async(req);
- if (ret) {
-fail:
- io_req_complete_failed(req, ret);
- return;
- }
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
ret = -ENOMEM;
- goto fail;
+ io_req_complete_failed(req, ret);
+ return;
}
spin_lock(&ctx->completion_lock);
return ret;
/* If the op doesn't have a pollable file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
io_iopoll_req_issued(req, issue_flags);
return 0;
req->flags &= ~REQ_F_HARDLINK;
req->flags |= REQ_F_LINK;
io_req_complete_failed(req, req->cqe.res);
- } else if (unlikely(req->ctx->drain_active)) {
- io_drain_req(req);
} else {
int ret = io_req_prep_async(req);
- if (unlikely(ret))
+ if (unlikely(ret)) {
io_req_complete_failed(req, ret);
+ return;
+ }
+
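+ /* the req is fully prepped at this point, so it is safe to defer */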
+ if (unlikely(req->ctx->drain_active))
+ io_drain_req(req);
else
io_queue_iowq(req, NULL);
}
/* when it returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
- ktime_t timeout)
+ ktime_t *timeout)
{
- int ret;
+ int token, ret;
unsigned long check_cq;
/* make sure we run task_work before checking for signals */
if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
return -EBADR;
}
- if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
- return -ETIME;
- return 1;
+
+ /*
+ * Use io_schedule_prepare/finish, so cpufreq can take into account
+ * that the task is waiting for IO - turns out to be important for low
+ * QD IO.
+ */
+ token = io_schedule_prepare();
+ ret = 1;
+ if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
+ ret = -ETIME;
+ io_schedule_finish(token);
+ return ret;
}
/*
}
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
+ ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
cond_resched();
} while (ret > 0);
io_eventfd_unregister(ctx);
io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
- mutex_unlock(&ctx->uring_lock);
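+ /* buffer teardown touches state protected by ->uring_lock, keep it held */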
io_destroy_buffers(ctx);
+ mutex_unlock(&ctx->uring_lock);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
if (ctx->submitter_task)
* pushes them to do the flush.
*/
- if (io_cqring_events(ctx) || io_has_work(ctx))
+ if (__io_cqring_events_user(ctx) || io_has_work(ctx))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
/*
* When @in_idle, we're in cancellation and it's racy to remove the
* node. It'll be removed by the end of cancellation, just ignore it.
+ * tctx can be NULL if the queueing of this task_work raced with
+ * work cancellation off the exec path.
*/
- if (!atomic_read(&tctx->in_idle))
+ if (tctx && !atomic_read(&tctx->in_idle))
io_uring_del_tctx_node((unsigned long)work->ctx);
complete(&work->completion);
}
/* there is little hope left, don't run it too often */
interval = HZ * 60;
}
- } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
+ /*
+ * This is really an uninterruptible wait, as it has to be
+ * complete. But it's also run from a kworker, which doesn't
+ * take signals, so it's fine to make it interruptible. This
+ * avoids scenarios where we knowingly can wait much longer
+ * on completions, for example if someone does a SIGSTOP on
+ * a task that needs to finish task_work to make this loop
+ * complete. That's a synthetic situation that should not
+ * cause a stuck task backtrace, and hence a potential panic
+ * on stuck tasks if that is enabled.
+ */
+ } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
init_completion(&exit.completion);
init_task_work(&exit.task_work, io_tctx_exit_cb);
continue;
mutex_unlock(&ctx->uring_lock);
- wait_for_completion(&exit.completion);
+ /*
+ * See comment above for
+ * wait_for_completion_interruptible_timeout() on why this
+ * wait is marked as interruptible.
+ */
+ wait_for_completion_interruptible(&exit.completion);
mutex_lock(&ctx->uring_lock);
}
mutex_unlock(&ctx->uring_lock);
while (!list_empty(&list)) {
de = list_first_entry(&list, struct io_defer_entry, list);
list_del_init(&de->list);
- io_req_complete_failed(de->req, -ECANCELED);
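+ /* failing a request now requires ->uring_lock, so punt it to task_work */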
+ io_req_task_queue_fail(de->req, -ECANCELED);
kfree(de);
}
return true;
while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
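+ /* reaping can take a while; don't hog the CPU */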
+ cond_resched();
}
}
return -EEXIST;
if (ctx->restricted) {
- if (opcode >= IORING_REGISTER_LAST)
- return -EINVAL;
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
if (!test_bit(opcode, ctx->restrictions.register_op))
return -EACCES;
long ret = -EBADF;
struct fd f;
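+ /* validate the opcode before any use, not just for restricted rings */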
+ if (opcode >= IORING_REGISTER_LAST)
+ return -EINVAL;
+
f = fdget(fd);
if (!f.file)
return -EBADF;