task_work_run: don't take ->pi_lock unconditionally
author Oleg Nesterov <oleg@redhat.com>
Tue, 18 Feb 2020 15:50:18 +0000 (16:50 +0100)
committer Jens Axboe <axboe@kernel.dk>
Mon, 2 Mar 2020 21:06:33 +0000 (14:06 -0700)
As Peter pointed out, task_work_run() can avoid ->pi_lock and cmpxchg()
if task->task_works == NULL && !PF_EXITING.

And in fact the only reason why task_work_run() needs ->pi_lock is
the possible race with task_work_cancel(), so we can optimize this code
and make the locking clearer.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
kernel/task_work.c

index 0fef395..825f282 100644 (file)
@@ -97,16 +97,26 @@ void task_work_run(void)
                 * work->func() can do task_work_add(), do not set
                 * work_exited unless the list is empty.
                 */
-               raw_spin_lock_irq(&task->pi_lock);
                do {
+                       head = NULL;
                        work = READ_ONCE(task->task_works);
-                       head = !work && (task->flags & PF_EXITING) ?
-                               &work_exited : NULL;
+                       if (!work) {
+                               if (task->flags & PF_EXITING)
+                                       head = &work_exited;
+                               else
+                                       break;
+                       }
                } while (cmpxchg(&task->task_works, work, head) != work);
-               raw_spin_unlock_irq(&task->pi_lock);
 
                if (!work)
                        break;
+               /*
+                * Synchronize with task_work_cancel(). It can not remove
+                * the first entry == work, cmpxchg(task_works) must fail.
+                * But it can remove another entry from the ->next list.
+                */
+               raw_spin_lock_irq(&task->pi_lock);
+               raw_spin_unlock_irq(&task->pi_lock);
 
                do {
                        next = work->next;