#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"

#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#endif

enum {
	IOU_OK			= 0,
	IOU_ISSUE_SKIP_COMPLETE	= -EIOCBQUEUED,

	/*
	 * Intended for use only when IO_URING_F_MULTISHOT is passed, to
	 * indicate to the poll runner that multishot should be removed;
	 * the result is set on req->cqe.res.
	 */
	IOU_STOP_MULTISHOT	= -ECANCELED,
};
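
/*
 * Example (illustrative sketch, not an in-tree handler) of how an opcode
 * issue function reports its outcome through these codes; do_foo() is
 * hypothetical:
 *
 *	static int io_foo(struct io_kiocb *req, unsigned int issue_flags)
 *	{
 *		int ret = do_foo(req);
 *
 *		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
 *			return -EAGAIN;
 *		io_req_set_res(req, ret, 0);
 *		return IOU_OK;
 *	}
 *
 * Returning -EAGAIN from a nonblocking attempt lets the core punt the
 * request to io-wq for a blocking retry; IOU_OK tells it to post the
 * CQE set via io_req_set_res().
 */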
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow);
bool io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked);
int io_run_local_work(struct io_ring_ctx *ctx);
void io_req_complete_failed(struct io_kiocb *req, s32 res);
void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
void io_req_complete_post(struct io_kiocb *req);
void __io_req_complete_post(struct io_kiocb *req);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
		     bool allow_overflow);
bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
		     bool allow_overflow);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);

struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);

struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
			       unsigned issue_flags);
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
	return req->flags & REQ_F_FIXED_FILE;
}
bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_work_add(struct io_kiocb *req);
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
void io_req_task_complete(struct io_kiocb *req, bool *locked);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, bool *locked);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
				struct io_ring_ctx *ctx);

int io_poll_issue(struct io_kiocb *req, bool *locked);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
int io_req_prep_async(struct io_kiocb *req);
struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
void io_wq_submit_work(struct io_wq_work *work);

void io_free_req(struct io_kiocb *req);
void io_queue_next(struct io_kiocb *req);
void __io_put_task(struct task_struct *task, int nr);
void io_task_refs_refill(struct io_uring_task *tctx);
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);

bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all);
#define io_for_each_link(pos, head) \
	for (pos = (head); pos; pos = pos->link)
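
/*
 * Example (illustrative only): counting the requests in a link chain,
 * assuming 'head' points at a valid io_kiocb:
 *
 *	struct io_kiocb *pos;
 *	unsigned int nr = 0;
 *
 *	io_for_each_link(pos, head)
 *		nr++;
 */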
static inline void io_cq_lock(struct io_ring_ctx *ctx)
	__acquires(ctx->completion_lock)
{
	spin_lock(&ctx->completion_lock);
}

void io_cq_unlock_post(struct io_ring_ctx *ctx);
static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
						       bool overflow)
{
	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
		struct io_uring_cqe *cqe = ctx->cqe_cached;

		ctx->cached_cq_tail++;
		ctx->cqe_cached++;
		/* a CQE32 entry consumes two CQE slots */
		if (ctx->flags & IORING_SETUP_CQE32)
			ctx->cqe_cached++;
		return cqe;
	}

	return __io_get_cqe(ctx, overflow);
}

static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
	return io_get_cqe_overflow(ctx, false);
}
static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
				     struct io_kiocb *req)
{
	struct io_uring_cqe *cqe;

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	cqe = io_get_cqe(ctx);
	if (unlikely(!cqe))
		return io_req_cqe_overflow(req);

	trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
				req->cqe.res, req->cqe.flags,
				(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0,
				(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0);

	memcpy(cqe, &req->cqe, sizeof(*cqe));

	if (ctx->flags & IORING_SETUP_CQE32) {
		u64 extra1 = 0, extra2 = 0;

		if (req->flags & REQ_F_CQE32_INIT) {
			extra1 = req->extra1;
			extra2 = req->extra2;
		}

		WRITE_ONCE(cqe->big_cqe[0], extra1);
		WRITE_ONCE(cqe->big_cqe[1], extra2);
	}
	return true;
}
static inline void req_set_fail(struct io_kiocb *req)
{
	req->flags |= REQ_F_FAIL;
	if (req->flags & REQ_F_CQE_SKIP) {
		req->flags &= ~REQ_F_CQE_SKIP;
		req->flags |= REQ_F_SKIP_LINK_CQES;
	}
}
static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
{
	req->cqe.res = res;
	req->cqe.flags = cflags;
}
static inline bool req_has_async_data(struct io_kiocb *req)
{
	return req->flags & REQ_F_ASYNC_DATA;
}
static inline void io_put_file(struct file *file)
{
	if (file)
		fput(file);
}
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
					 unsigned issue_flags)
{
	lockdep_assert_held(&ctx->uring_lock);
	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_unlock(&ctx->uring_lock);
}
static inline void io_ring_submit_lock(struct io_ring_ctx *ctx,
				       unsigned issue_flags)
{
	/*
	 * "Normal" inline submissions always hold the uring_lock, since we
	 * grab it from the system call. Same is true for the SQPOLL offload.
	 * The only exception is when we've detached the request and issue it
	 * from an async worker thread; grab the lock in that case.
	 */
	if (issue_flags & IO_URING_F_UNLOCKED)
		mutex_lock(&ctx->uring_lock);
	lockdep_assert_held(&ctx->uring_lock);
}
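
/*
 * Example (illustrative sketch): helpers that may run either inline or
 * from an io-wq worker bracket ctx state access with this pair, passing
 * through the issue_flags they were given:
 *
 *	io_ring_submit_lock(ctx, issue_flags);
 *	... touch state protected by ctx->uring_lock ...
 *	io_ring_submit_unlock(ctx, issue_flags);
 */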
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
	/* order cqe stores with ring update */
	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
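
/*
 * The release store above pairs with an acquire load of cq.tail on the
 * userspace side. A minimal reader sketch (not liburing itself; 'cq' is
 * an assumed mapping of the CQ ring):
 *
 *	unsigned head = *cq->khead;
 *
 *	while (head != smp_load_acquire(cq->ktail)) {
 *		struct io_uring_cqe *cqe = &cq->cqes[head & *cq->kring_mask];
 *		... consume cqe ...
 *		head++;
 *	}
 *	smp_store_release(cq->khead, head);
 */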
/* requires smp_mb() prior, see wq_has_sleeper() */
static inline void __io_cqring_wake(struct io_ring_ctx *ctx)
{
	/*
	 * Trigger waitqueue handler on all waiters on our waitqueue. This
	 * won't necessarily wake up all the tasks; io_should_wake() will make
	 * that decision.
	 *
	 * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter
	 * set in the mask so that if we recurse back into our own poll
	 * waitqueue handlers, we know we have a dependency between eventfd or
	 * epoll and should terminate multishot poll at that point.
	 */
	if (waitqueue_active(&ctx->cq_wait))
		__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
				poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
	/* provide the smp_mb() that __io_cqring_wake() requires */
	smp_mb();
	__io_cqring_wake(ctx);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
}
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/* make sure SQ entry isn't read before tail */
	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
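
/*
 * Example (illustrative sketch of the SQPOLL-style pattern): drain all
 * SQ entries currently visible to the kernel:
 *
 *	unsigned int to_submit = io_sqring_entries(ctx);
 *
 *	if (to_submit)
 *		io_submit_sqes(ctx, to_submit);
 */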
static inline int io_run_task_work(void)
{
	/*
	 * Always check-and-clear the task_work notification signal. With how
	 * signaling works for task_work, we can find it set with nothing to
	 * run. We need to clear it for that case, like get_signal() does.
	 */
	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
		clear_notify_signal();
	if (task_work_pending(current)) {
		__set_current_state(TASK_RUNNING);
		task_work_run();
		return 1;
	}

	return 0;
}
static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
{
	return test_thread_flag(TIF_NOTIFY_SIGNAL) ||
		!wq_list_empty(&ctx->work_llist);
}
static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx)
{
	int ret = 0;
	int ret2;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
		ret = io_run_local_work(ctx);

	/* want to run this after in case more is added */
	ret2 = io_run_task_work();

	/*
	 * Try to propagate an error over the count of work items run,
	 * but still make sure to run them if requested.
	 */
	if (ret >= 0)
		ret += ret2;

	return ret;
}
static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
{
	bool locked;
	int ret;

	if (llist_empty(&ctx->work_llist))
		return 0;

	locked = true;
	ret = __io_run_local_work(ctx, &locked);
	/* shouldn't happen! */
	if (WARN_ON_ONCE(!locked))
		mutex_lock(&ctx->uring_lock);
	return ret;
}
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
	if (!*locked) {
		mutex_lock(&ctx->uring_lock);
		*locked = true;
	}
}
/*
 * Don't complete immediately but use deferred completion infrastructure.
 * Protected by ->uring_lock and can only be used either with
 * IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex.
 */
static inline void io_req_complete_defer(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	struct io_submit_state *state = &req->ctx->submit_state;

	lockdep_assert_held(&req->ctx->uring_lock);

	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}
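
/*
 * Example (illustrative sketch): a task_work handler that has taken the
 * mutex via io_tw_lock() may defer the CQE posting. io_foo_tw_complete()
 * is hypothetical; io_req_task_complete() implements the real logic.
 *
 *	static void io_foo_tw_complete(struct io_kiocb *req, bool *locked)
 *	{
 *		io_tw_lock(req->ctx, locked);
 *		io_req_set_res(req, 0, 0);
 *		io_req_complete_defer(req);
 *	}
 */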
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd))
		__io_commit_cqring_flush(ctx);
}
/* must be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
	if (likely(task == current))
		task->io_uring->cached_refs += nr;
	else
		__io_put_task(task, nr);
}
static inline void io_get_task_refs(int nr)
{
	struct io_uring_task *tctx = current->io_uring;

	tctx->cached_refs -= nr;
	if (unlikely(tctx->cached_refs < 0))
		io_task_refs_refill(tctx);
}
static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
{
	return !ctx->submit_state.free_list.next;
}
static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
{
	if (unlikely(io_req_cache_empty(ctx)))
		return __io_alloc_req_refill(ctx);
	return true;
}
static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
{
	struct io_wq_work_node *node;

	node = wq_stack_extract(&ctx->submit_state.free_list);
	return container_of(node, struct io_kiocb, comp_list);
}
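
/*
 * Example (illustrative sketch): io_alloc_req() assumes the request
 * cache is non-empty, so callers refill first:
 *
 *	if (unlikely(!io_alloc_req_refill(ctx)))
 *		return -ENOMEM;
 *	req = io_alloc_req(ctx);
 */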
static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
{
	return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) ||
		      ctx->submitter_task == current);
}
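
/*
 * Example (illustrative sketch): callers gate task work processing on
 * this check, since IORING_SETUP_DEFER_TASKRUN rings must only be driven
 * by the submitter task:
 *
 *	if (io_allowed_run_tw(ctx))
 *		io_run_task_work_ctx(ctx);
 */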