kernel/fork: group allocation/free of per-cpu counters for mm struct
author: Mateusz Guzik <mjguzik@gmail.com>
Wed, 23 Aug 2023 05:06:09 +0000 (07:06 +0200)
committer: Dennis Zhou <dennis@kernel.org>
Fri, 25 Aug 2023 15:10:35 +0000 (08:10 -0700)
A trivial execve scalability test which tries to be very friendly
(statically linked binaries, all separate) is predominantly bottlenecked
by back-to-back per-cpu counter allocations which serialize on global
locks.

Ease the pain by allocating and freeing them in one go.

Bench can be found here:
http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -static -O2 -o static-doexec doexec.c
$ ./static-doexec $(nproc)

Even at a very modest scale of 26 cores (ops/s):
before: 133543.63
after: 186061.81 (+39%)

While with the patch these allocations remain a significant problem,
the primary bottleneck shifts to page release handling.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20230823050609.2228718-3-mjguzik@gmail.com
[Dennis: reflowed 1 line]
Signed-off-by: Dennis Zhou <dennis@kernel.org>
kernel/fork.c

index d2e12b6..afd198b 100644 (file)
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
-       int i;
-
        BUG_ON(mm == &init_mm);
        WARN_ON_ONCE(mm == current->mm);
 
@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
        put_user_ns(mm->user_ns);
        mm_pasid_drop(mm);
        mm_destroy_cid(mm);
+       percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
-       for (i = 0; i < NR_MM_COUNTERS; i++)
-               percpu_counter_destroy(&mm->rss_stat[i]);
        free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1252,8 +1249,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        struct user_namespace *user_ns)
 {
-       int i;
-
        mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
        mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
        atomic_set(&mm->mm_users, 1);
@@ -1301,17 +1296,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        if (mm_alloc_cid(mm))
                goto fail_cid;
 
-       for (i = 0; i < NR_MM_COUNTERS; i++)
-               if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
-                       goto fail_pcpu;
+       if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+                                    NR_MM_COUNTERS))
+               goto fail_pcpu;
 
        mm->user_ns = get_user_ns(user_ns);
        lru_gen_init_mm(mm);
        return mm;
 
 fail_pcpu:
-       while (i > 0)
-               percpu_counter_destroy(&mm->rss_stat[--i]);
        mm_destroy_cid(mm);
 fail_cid:
        destroy_context(mm);