io_uring/filetable: fix file reference underflow

[platform/kernel/linux-starfive.git] / io_uring / poll.c
diff --git a/io_uring/poll.c b/io_uring/poll.c

index 0d9f49c..b444b7d 100644 (file)
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -40,7 +40,14 @@ struct io_poll_table {
  };
  
  #define IO_POLL_CANCEL_FLAG    BIT(31)
-#define IO_POLL_REF_MASK       GENMASK(30, 0)
+#define IO_POLL_RETRY_FLAG     BIT(30)
+#define IO_POLL_REF_MASK       GENMASK(29, 0)
+
+/*
+ * We usually have 1-2 refs taken, 128 is more than enough and we want to
+ * maximise the margin between this amount and the moment when it overflows.
+ */
+#define IO_POLL_REF_BIAS       128
  
  #define IO_WQE_F_DOUBLE                1
  
@@ -58,6 +65,21 @@ static inline bool wqe_is_double(struct wait_queue_entry *wqe)
         return priv & IO_WQE_F_DOUBLE;
  }
  
+static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
+{
+       int v;
+
+       /*
+        * poll_refs are already elevated and we don't have much hope for
+        * grabbing the ownership. Instead of incrementing set a retry flag
+        * to notify the loop that there might have been some change.
+        */
+       v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
+       if (v & IO_POLL_REF_MASK)
+               return false;
+       return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
+}
+
  /*
   * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
   * bump it and acquire ownership. It's disallowed to modify requests while not
@@ -66,6 +88,8 @@ static inline bool wqe_is_double(struct wait_queue_entry *wqe)
   */
  static inline bool io_poll_get_ownership(struct io_kiocb *req)
  {
+       if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
+               return io_poll_get_ownership_slowpath(req);
         return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
  }
  
@@ -116,6 +140,8 @@ static void io_poll_req_insert_locked(struct io_kiocb *req)
         struct io_hash_table *table = &req->ctx->cancel_table_locked;
         u32 index = hash_long(req->cqe.user_data, table->hash_bits);
  
+       lockdep_assert_held(&req->ctx->uring_lock);
+
         hlist_add_head(&req->hash_node, &table->hbs[index].list);
  }
  
@@ -226,6 +252,23 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
                         return IOU_POLL_DONE;
                 if (v & IO_POLL_CANCEL_FLAG)
                         return -ECANCELED;
+               /*
+                * cqe.res contains only events of the first wake up
+                * and all others are be lost. Redo vfs_poll() to get
+                * up to date state.
+                */
+               if ((v & IO_POLL_REF_MASK) != 1)
+                       req->cqe.res = 0;
+               if (v & IO_POLL_RETRY_FLAG) {
+                       req->cqe.res = 0;
+                       /*
+                        * We won't find new events that came in between
+                        * vfs_poll and the ref put unless we clear the flag
+                        * in advance.
+                        */
+                       atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
+                       v &= ~IO_POLL_RETRY_FLAG;
+               }
  
                 /* the mask was stashed in __io_poll_execute */
                 if (!req->cqe.res) {
@@ -237,6 +280,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
                         continue;
                 if (req->apoll_events & EPOLLONESHOT)
                         return IOU_POLL_DONE;
+               if (io_is_uring_fops(req->file))
+                       return IOU_POLL_DONE;
  
                 /* multishot, just fill a CQE and proceed */
                 if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
@@ -256,6 +301,9 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
                                 return ret;
                 }
  
+               /* force the next iteration to vfs_poll() */
+               req->cqe.res = 0;
+
                 /*
                  * Release all references, retry if someone tried to restart
                  * task_work while we were executing it.
@@ -394,7 +442,8 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
         return 1;
  }
  
-static void io_poll_double_prepare(struct io_kiocb *req)
+/* fails only when polling is already completing by the first entry */
+static bool io_poll_double_prepare(struct io_kiocb *req)
  {
         struct wait_queue_head *head;
         struct io_poll *poll = io_poll_get_single(req);
@@ -403,20 +452,20 @@ static void io_poll_double_prepare(struct io_kiocb *req)
         rcu_read_lock();
         head = smp_load_acquire(&poll->head);
         /*
-        * poll arm may not hold ownership and so race with
-        * io_poll_wake() by modifying req->flags. There is only one
-        * poll entry queued, serialise with it by taking its head lock.
+        * poll arm might not hold ownership and so race for req->flags with
+        * io_poll_wake(). There is only one poll entry queued, serialise with
+        * it by taking its head lock. As we're still arming the tw hanlder
+        * is not going to be run, so there are no races with it.
          */
-       if (head)
+       if (head) {
                 spin_lock_irq(&head->lock);
-
-       req->flags |= REQ_F_DOUBLE_POLL;
-       if (req->opcode == IORING_OP_POLL_ADD)
-               req->flags |= REQ_F_ASYNC_DATA;
-
-       if (head)
+               req->flags |= REQ_F_DOUBLE_POLL;
+               if (req->opcode == IORING_OP_POLL_ADD)
+                       req->flags |= REQ_F_ASYNC_DATA;
                 spin_unlock_irq(&head->lock);
+       }
         rcu_read_unlock();
+       return !!head;
  }
  
  static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
@@ -454,7 +503,11 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
                 /* mark as double wq entry */
                 wqe_private |= IO_WQE_F_DOUBLE;
                 io_init_poll_iocb(poll, first->events, first->wait.func);
-               io_poll_double_prepare(req);
+               if (!io_poll_double_prepare(req)) {
+                       /* the request is completing, just back off */
+                       kfree(poll);
+                       return;
+               }
                 *poll_ptr = poll;
         } else {
                 /* fine to modify, there is no poll queued to race with us */
@@ -499,7 +552,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
                                  unsigned issue_flags)
  {
         struct io_ring_ctx *ctx = req->ctx;
-       int v;
  
         INIT_HLIST_NODE(&req->hash_node);
         req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
@@ -567,11 +619,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
  
         if (ipt->owning) {
                 /*
-                * Release ownership. If someone tried to queue a tw while it was
-                * locked, kick it off for them.
+                * Try to release ownership. If we see a change of state, e.g.
+                * poll was waken up, queue up a tw, it'll deal with it.
                  */
-               v = atomic_dec_return(&req->poll_refs);
-               if (unlikely(v & IO_POLL_REF_MASK))
+               if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
                         __io_poll_execute(req, 0);
         }
         return 0;