Merge tag 'linux-kselftest-next-5.11-rc1' of git://git.kernel.org/pub/scm/linux/kerne...
[platform/kernel/linux-starfive.git] / tools / testing / selftests / vm / userfaultfd.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stress userfaultfd syscall.
4  *
5  *  Copyright (C) 2015  Red Hat, Inc.
6  *
7  * This test allocates two virtual areas and bounces the physical
8  * memory across the two virtual areas (from area_src to area_dst)
9  * using userfaultfd.
10  *
11  * There are three threads running per CPU:
12  *
13  * 1) one per-CPU thread takes a per-page pthread_mutex in a random
14  *    page of the area_dst (while the physical page may still be in
15  *    area_src), and increments a per-page counter in the same page,
16  *    and checks its value against a verification region.
17  *
18  * 2) another per-CPU thread handles the userfaults generated by
19  *    thread 1 above. userfaultfd blocking reads or poll() modes are
20  *    exercised interleaved.
21  *
22  * 3) one last per-CPU thread transfers the memory in the background
23  *    at maximum bandwidth (if not already transferred by thread
24  *    2). Each cpu thread takes care of transferring a portion of the
25  *    area.
26  *
27  * When all threads of type 3 completed the transfer, one bounce is
28  * complete. area_src and area_dst are then swapped. All threads are
29  * respawned and so the bounce is immediately restarted in the
30  * opposite direction.
31  *
32  * per-CPU threads 1 by triggering userfaults inside
33  * pthread_mutex_lock will also verify the atomicity of the memory
34  * transfer (UFFDIO_COPY).
35  */
36
37 #define _GNU_SOURCE
38 #include <stdio.h>
39 #include <errno.h>
40 #include <unistd.h>
41 #include <stdlib.h>
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <fcntl.h>
45 #include <time.h>
46 #include <signal.h>
47 #include <poll.h>
48 #include <string.h>
49 #include <sys/mman.h>
50 #include <sys/syscall.h>
51 #include <sys/ioctl.h>
52 #include <sys/wait.h>
53 #include <pthread.h>
54 #include <linux/userfaultfd.h>
55 #include <setjmp.h>
56 #include <stdbool.h>
57 #include <assert.h>
58 #include <inttypes.h>
59 #include <stdint.h>
60
61 #include "../kselftest.h"
62
63 #ifdef __NR_userfaultfd
64
65 static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
66
67 #define BOUNCE_RANDOM           (1<<0)
68 #define BOUNCE_RACINGFAULTS     (1<<1)
69 #define BOUNCE_VERIFY           (1<<2)
70 #define BOUNCE_POLL             (1<<3)
71 static int bounces;
72
73 #define TEST_ANON       1
74 #define TEST_HUGETLB    2
75 #define TEST_SHMEM      3
76 static int test_type;
77
78 /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
79 #define ALARM_INTERVAL_SECS 10
80 static volatile bool test_uffdio_copy_eexist = true;
81 static volatile bool test_uffdio_zeropage_eexist = true;
82 /* Whether to test uffd write-protection */
83 static bool test_uffdio_wp = false;
84
85 static bool map_shared;
86 static int huge_fd;
87 static char *huge_fd_off0;
88 static unsigned long long *count_verify;
89 static int uffd, uffd_flags, finished, *pipefd;
90 static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
91 static char *zeropage;
92 pthread_attr_t attr;
93
94 /* Userfaultfd test statistics */
95 struct uffd_stats {
96         int cpu;
97         unsigned long missing_faults;
98         unsigned long wp_faults;
99 };
100
101 /* pthread_mutex_t starts at page offset 0 */
102 #define area_mutex(___area, ___nr)                                      \
103         ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
104 /*
105  * count is placed in the page after pthread_mutex_t naturally aligned
106  * to avoid non alignment faults on non-x86 archs.
107  */
108 #define area_count(___area, ___nr)                                      \
109         ((volatile unsigned long long *) ((unsigned long)               \
110                                  ((___area) + (___nr)*page_size +       \
111                                   sizeof(pthread_mutex_t) +             \
112                                   sizeof(unsigned long long) - 1) &     \
113                                  ~(unsigned long)(sizeof(unsigned long long) \
114                                                   -  1)))
115
116 const char *examples =
117     "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
118     "./userfaultfd anon 100 99999\n\n"
119     "# Run share memory test on 1GiB region with 99 bounces:\n"
120     "./userfaultfd shmem 1000 99\n\n"
121     "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
122     "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
123     "# Run the same hugetlb test but using shmem:\n"
124     "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
125     "# 10MiB-~6GiB 999 bounces anonymous test, "
126     "continue forever unless an error triggers\n"
127     "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
128
129 static void usage(void)
130 {
131         fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
132                 "[hugetlbfs_file]\n\n");
133         fprintf(stderr, "Supported <test type>: anon, hugetlb, "
134                 "hugetlb_shared, shmem\n\n");
135         fprintf(stderr, "Examples:\n\n");
136         fprintf(stderr, "%s", examples);
137         exit(1);
138 }
139
140 #define uffd_error(code, fmt, ...)                                             \
141         do {                                                                   \
142                 fprintf(stderr, fmt, ##__VA_ARGS__);                           \
143                 fprintf(stderr, ": %" PRId64 "\n", (int64_t)(code));           \
144                 exit(1);                                                       \
145         } while (0)
146
147 static void uffd_stats_reset(struct uffd_stats *uffd_stats,
148                              unsigned long n_cpus)
149 {
150         int i;
151
152         for (i = 0; i < n_cpus; i++) {
153                 uffd_stats[i].cpu = i;
154                 uffd_stats[i].missing_faults = 0;
155                 uffd_stats[i].wp_faults = 0;
156         }
157 }
158
159 static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
160 {
161         int i;
162         unsigned long long miss_total = 0, wp_total = 0;
163
164         for (i = 0; i < n_cpus; i++) {
165                 miss_total += stats[i].missing_faults;
166                 wp_total += stats[i].wp_faults;
167         }
168
169         printf("userfaults: %llu missing (", miss_total);
170         for (i = 0; i < n_cpus; i++)
171                 printf("%lu+", stats[i].missing_faults);
172         printf("\b), %llu wp (", wp_total);
173         for (i = 0; i < n_cpus; i++)
174                 printf("%lu+", stats[i].wp_faults);
175         printf("\b)\n");
176 }
177
178 static int anon_release_pages(char *rel_area)
179 {
180         int ret = 0;
181
182         if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
183                 perror("madvise");
184                 ret = 1;
185         }
186
187         return ret;
188 }
189
190 static void anon_allocate_area(void **alloc_area)
191 {
192         if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
193                 fprintf(stderr, "out of memory\n");
194                 *alloc_area = NULL;
195         }
196 }
197
198 static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
199 {
200 }
201
202 /* HugeTLB memory */
203 static int hugetlb_release_pages(char *rel_area)
204 {
205         int ret = 0;
206
207         if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
208                                 rel_area == huge_fd_off0 ? 0 :
209                                 nr_pages * page_size,
210                                 nr_pages * page_size)) {
211                 perror("fallocate");
212                 ret = 1;
213         }
214
215         return ret;
216 }
217
218 static void hugetlb_allocate_area(void **alloc_area)
219 {
220         void *area_alias = NULL;
221         char **alloc_area_alias;
222
223         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
224                            (map_shared ? MAP_SHARED : MAP_PRIVATE) |
225                            MAP_HUGETLB,
226                            huge_fd, *alloc_area == area_src ? 0 :
227                            nr_pages * page_size);
228         if (*alloc_area == MAP_FAILED) {
229                 perror("mmap of hugetlbfs file failed");
230                 goto fail;
231         }
232
233         if (map_shared) {
234                 area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
235                                   MAP_SHARED | MAP_HUGETLB,
236                                   huge_fd, *alloc_area == area_src ? 0 :
237                                   nr_pages * page_size);
238                 if (area_alias == MAP_FAILED) {
239                         perror("mmap of hugetlb file alias failed");
240                         goto fail_munmap;
241                 }
242         }
243
244         if (*alloc_area == area_src) {
245                 huge_fd_off0 = *alloc_area;
246                 alloc_area_alias = &area_src_alias;
247         } else {
248                 alloc_area_alias = &area_dst_alias;
249         }
250         if (area_alias)
251                 *alloc_area_alias = area_alias;
252
253         return;
254
255 fail_munmap:
256         if (munmap(*alloc_area, nr_pages * page_size) < 0) {
257                 perror("hugetlb munmap");
258                 exit(1);
259         }
260 fail:
261         *alloc_area = NULL;
262 }
263
264 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
265 {
266         if (!map_shared)
267                 return;
268         /*
269          * We can't zap just the pagetable with hugetlbfs because
270          * MADV_DONTEED won't work. So exercise -EEXIST on a alias
271          * mapping where the pagetables are not established initially,
272          * this way we'll exercise the -EEXEC at the fs level.
273          */
274         *start = (unsigned long) area_dst_alias + offset;
275 }
276
277 /* Shared memory */
278 static int shmem_release_pages(char *rel_area)
279 {
280         int ret = 0;
281
282         if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
283                 perror("madvise");
284                 ret = 1;
285         }
286
287         return ret;
288 }
289
290 static void shmem_allocate_area(void **alloc_area)
291 {
292         *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
293                            MAP_ANONYMOUS | MAP_SHARED, -1, 0);
294         if (*alloc_area == MAP_FAILED) {
295                 fprintf(stderr, "shared memory mmap failed\n");
296                 *alloc_area = NULL;
297         }
298 }
299
300 struct uffd_test_ops {
301         unsigned long expected_ioctls;
302         void (*allocate_area)(void **alloc_area);
303         int (*release_pages)(char *rel_area);
304         void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
305 };
306
307 #define SHMEM_EXPECTED_IOCTLS           ((1 << _UFFDIO_WAKE) | \
308                                          (1 << _UFFDIO_COPY) | \
309                                          (1 << _UFFDIO_ZEROPAGE))
310
311 #define ANON_EXPECTED_IOCTLS            ((1 << _UFFDIO_WAKE) | \
312                                          (1 << _UFFDIO_COPY) | \
313                                          (1 << _UFFDIO_ZEROPAGE) | \
314                                          (1 << _UFFDIO_WRITEPROTECT))
315
316 static struct uffd_test_ops anon_uffd_test_ops = {
317         .expected_ioctls = ANON_EXPECTED_IOCTLS,
318         .allocate_area  = anon_allocate_area,
319         .release_pages  = anon_release_pages,
320         .alias_mapping = noop_alias_mapping,
321 };
322
323 static struct uffd_test_ops shmem_uffd_test_ops = {
324         .expected_ioctls = SHMEM_EXPECTED_IOCTLS,
325         .allocate_area  = shmem_allocate_area,
326         .release_pages  = shmem_release_pages,
327         .alias_mapping = noop_alias_mapping,
328 };
329
330 static struct uffd_test_ops hugetlb_uffd_test_ops = {
331         .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
332         .allocate_area  = hugetlb_allocate_area,
333         .release_pages  = hugetlb_release_pages,
334         .alias_mapping = hugetlb_alias_mapping,
335 };
336
337 static struct uffd_test_ops *uffd_test_ops;
338
339 static int my_bcmp(char *str1, char *str2, size_t n)
340 {
341         unsigned long i;
342         for (i = 0; i < n; i++)
343                 if (str1[i] != str2[i])
344                         return 1;
345         return 0;
346 }
347
348 static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
349 {
350         struct uffdio_writeprotect prms;
351
352         /* Write protection page faults */
353         prms.range.start = start;
354         prms.range.len = len;
355         /* Undo write-protect, do wakeup after that */
356         prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
357
358         if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) {
359                 fprintf(stderr, "clear WP failed for address 0x%" PRIx64 "\n",
360                         (uint64_t)start);
361                 exit(1);
362         }
363 }
364
365 static void *locking_thread(void *arg)
366 {
367         unsigned long cpu = (unsigned long) arg;
368         struct random_data rand;
369         unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
370         int32_t rand_nr;
371         unsigned long long count;
372         char randstate[64];
373         unsigned int seed;
374         time_t start;
375
376         if (bounces & BOUNCE_RANDOM) {
377                 seed = (unsigned int) time(NULL) - bounces;
378                 if (!(bounces & BOUNCE_RACINGFAULTS))
379                         seed += cpu;
380                 bzero(&rand, sizeof(rand));
381                 bzero(&randstate, sizeof(randstate));
382                 if (initstate_r(seed, randstate, sizeof(randstate), &rand)) {
383                         fprintf(stderr, "srandom_r error\n");
384                         exit(1);
385                 }
386         } else {
387                 page_nr = -bounces;
388                 if (!(bounces & BOUNCE_RACINGFAULTS))
389                         page_nr += cpu * nr_pages_per_cpu;
390         }
391
392         while (!finished) {
393                 if (bounces & BOUNCE_RANDOM) {
394                         if (random_r(&rand, &rand_nr)) {
395                                 fprintf(stderr, "random_r 1 error\n");
396                                 exit(1);
397                         }
398                         page_nr = rand_nr;
399                         if (sizeof(page_nr) > sizeof(rand_nr)) {
400                                 if (random_r(&rand, &rand_nr)) {
401                                         fprintf(stderr, "random_r 2 error\n");
402                                         exit(1);
403                                 }
404                                 page_nr |= (((unsigned long) rand_nr) << 16) <<
405                                            16;
406                         }
407                 } else
408                         page_nr += 1;
409                 page_nr %= nr_pages;
410
411                 start = time(NULL);
412                 if (bounces & BOUNCE_VERIFY) {
413                         count = *area_count(area_dst, page_nr);
414                         if (!count) {
415                                 fprintf(stderr,
416                                         "page_nr %lu wrong count %Lu %Lu\n",
417                                         page_nr, count,
418                                         count_verify[page_nr]);
419                                 exit(1);
420                         }
421
422
423                         /*
424                          * We can't use bcmp (or memcmp) because that
425                          * returns 0 erroneously if the memory is
426                          * changing under it (even if the end of the
427                          * page is never changing and always
428                          * different).
429                          */
430 #if 1
431                         if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
432                                      page_size)) {
433                                 fprintf(stderr,
434                                         "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
435                                         page_nr, count, count_verify[page_nr]);
436                                 exit(1);
437                         }
438 #else
439                         unsigned long loops;
440
441                         loops = 0;
442                         /* uncomment the below line to test with mutex */
443                         /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
444                         while (!bcmp(area_dst + page_nr * page_size, zeropage,
445                                      page_size)) {
446                                 loops += 1;
447                                 if (loops > 10)
448                                         break;
449                         }
450                         /* uncomment below line to test with mutex */
451                         /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
452                         if (loops) {
453                                 fprintf(stderr,
454                                         "page_nr %lu all zero thread %lu %p %lu\n",
455                                         page_nr, cpu, area_dst + page_nr * page_size,
456                                         loops);
457                                 if (loops > 10)
458                                         exit(1);
459                         }
460 #endif
461                 }
462
463                 pthread_mutex_lock(area_mutex(area_dst, page_nr));
464                 count = *area_count(area_dst, page_nr);
465                 if (count != count_verify[page_nr]) {
466                         fprintf(stderr,
467                                 "page_nr %lu memory corruption %Lu %Lu\n",
468                                 page_nr, count,
469                                 count_verify[page_nr]); exit(1);
470                 }
471                 count++;
472                 *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
473                 pthread_mutex_unlock(area_mutex(area_dst, page_nr));
474
475                 if (time(NULL) - start > 1)
476                         fprintf(stderr,
477                                 "userfault too slow %ld "
478                                 "possible false positive with overcommit\n",
479                                 time(NULL) - start);
480         }
481
482         return NULL;
483 }
484
485 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
486                             unsigned long offset)
487 {
488         uffd_test_ops->alias_mapping(&uffdio_copy->dst,
489                                      uffdio_copy->len,
490                                      offset);
491         if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
492                 /* real retval in ufdio_copy.copy */
493                 if (uffdio_copy->copy != -EEXIST) {
494                         uffd_error(uffdio_copy->copy,
495                                    "UFFDIO_COPY retry error");
496                 }
497         } else
498                 uffd_error(uffdio_copy->copy, "UFFDIO_COPY retry unexpected");
499 }
500
501 static int __copy_page(int ufd, unsigned long offset, bool retry)
502 {
503         struct uffdio_copy uffdio_copy;
504
505         if (offset >= nr_pages * page_size) {
506                 fprintf(stderr, "unexpected offset %lu\n", offset);
507                 exit(1);
508         }
509         uffdio_copy.dst = (unsigned long) area_dst + offset;
510         uffdio_copy.src = (unsigned long) area_src + offset;
511         uffdio_copy.len = page_size;
512         if (test_uffdio_wp)
513                 uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
514         else
515                 uffdio_copy.mode = 0;
516         uffdio_copy.copy = 0;
517         if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
518                 /* real retval in ufdio_copy.copy */
519                 if (uffdio_copy.copy != -EEXIST)
520                         uffd_error(uffdio_copy.copy, "UFFDIO_COPY error");
521         } else if (uffdio_copy.copy != page_size) {
522                 uffd_error(uffdio_copy.copy, "UFFDIO_COPY unexpected copy");
523         } else {
524                 if (test_uffdio_copy_eexist && retry) {
525                         test_uffdio_copy_eexist = false;
526                         retry_copy_page(ufd, &uffdio_copy, offset);
527                 }
528                 return 1;
529         }
530         return 0;
531 }
532
533 static int copy_page_retry(int ufd, unsigned long offset)
534 {
535         return __copy_page(ufd, offset, true);
536 }
537
538 static int copy_page(int ufd, unsigned long offset)
539 {
540         return __copy_page(ufd, offset, false);
541 }
542
543 static int uffd_read_msg(int ufd, struct uffd_msg *msg)
544 {
545         int ret = read(uffd, msg, sizeof(*msg));
546
547         if (ret != sizeof(*msg)) {
548                 if (ret < 0) {
549                         if (errno == EAGAIN)
550                                 return 1;
551                         perror("blocking read error");
552                 } else {
553                         fprintf(stderr, "short read\n");
554                 }
555                 exit(1);
556         }
557
558         return 0;
559 }
560
561 static void uffd_handle_page_fault(struct uffd_msg *msg,
562                                    struct uffd_stats *stats)
563 {
564         unsigned long offset;
565
566         if (msg->event != UFFD_EVENT_PAGEFAULT) {
567                 fprintf(stderr, "unexpected msg event %u\n", msg->event);
568                 exit(1);
569         }
570
571         if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
572                 wp_range(uffd, msg->arg.pagefault.address, page_size, false);
573                 stats->wp_faults++;
574         } else {
575                 /* Missing page faults */
576                 if (bounces & BOUNCE_VERIFY &&
577                     msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) {
578                         fprintf(stderr, "unexpected write fault\n");
579                         exit(1);
580                 }
581
582                 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
583                 offset &= ~(page_size-1);
584
585                 if (copy_page(uffd, offset))
586                         stats->missing_faults++;
587         }
588 }
589
590 static void *uffd_poll_thread(void *arg)
591 {
592         struct uffd_stats *stats = (struct uffd_stats *)arg;
593         unsigned long cpu = stats->cpu;
594         struct pollfd pollfd[2];
595         struct uffd_msg msg;
596         struct uffdio_register uffd_reg;
597         int ret;
598         char tmp_chr;
599
600         pollfd[0].fd = uffd;
601         pollfd[0].events = POLLIN;
602         pollfd[1].fd = pipefd[cpu*2];
603         pollfd[1].events = POLLIN;
604
605         for (;;) {
606                 ret = poll(pollfd, 2, -1);
607                 if (!ret) {
608                         fprintf(stderr, "poll error %d\n", ret);
609                         exit(1);
610                 }
611                 if (ret < 0) {
612                         perror("poll");
613                         exit(1);
614                 }
615                 if (pollfd[1].revents & POLLIN) {
616                         if (read(pollfd[1].fd, &tmp_chr, 1) != 1) {
617                                 fprintf(stderr, "read pipefd error\n");
618                                 exit(1);
619                         }
620                         break;
621                 }
622                 if (!(pollfd[0].revents & POLLIN)) {
623                         fprintf(stderr, "pollfd[0].revents %d\n",
624                                 pollfd[0].revents);
625                         exit(1);
626                 }
627                 if (uffd_read_msg(uffd, &msg))
628                         continue;
629                 switch (msg.event) {
630                 default:
631                         fprintf(stderr, "unexpected msg event %u\n",
632                                 msg.event); exit(1);
633                         break;
634                 case UFFD_EVENT_PAGEFAULT:
635                         uffd_handle_page_fault(&msg, stats);
636                         break;
637                 case UFFD_EVENT_FORK:
638                         close(uffd);
639                         uffd = msg.arg.fork.ufd;
640                         pollfd[0].fd = uffd;
641                         break;
642                 case UFFD_EVENT_REMOVE:
643                         uffd_reg.range.start = msg.arg.remove.start;
644                         uffd_reg.range.len = msg.arg.remove.end -
645                                 msg.arg.remove.start;
646                         if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) {
647                                 fprintf(stderr, "remove failure\n");
648                                 exit(1);
649                         }
650                         break;
651                 case UFFD_EVENT_REMAP:
652                         area_dst = (char *)(unsigned long)msg.arg.remap.to;
653                         break;
654                 }
655         }
656
657         return NULL;
658 }
659
660 pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
661
662 static void *uffd_read_thread(void *arg)
663 {
664         struct uffd_stats *stats = (struct uffd_stats *)arg;
665         struct uffd_msg msg;
666
667         pthread_mutex_unlock(&uffd_read_mutex);
668         /* from here cancellation is ok */
669
670         for (;;) {
671                 if (uffd_read_msg(uffd, &msg))
672                         continue;
673                 uffd_handle_page_fault(&msg, stats);
674         }
675
676         return NULL;
677 }
678
679 static void *background_thread(void *arg)
680 {
681         unsigned long cpu = (unsigned long) arg;
682         unsigned long page_nr, start_nr, mid_nr, end_nr;
683
684         start_nr = cpu * nr_pages_per_cpu;
685         end_nr = (cpu+1) * nr_pages_per_cpu;
686         mid_nr = (start_nr + end_nr) / 2;
687
688         /* Copy the first half of the pages */
689         for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
690                 copy_page_retry(uffd, page_nr * page_size);
691
692         /*
693          * If we need to test uffd-wp, set it up now.  Then we'll have
694          * at least the first half of the pages mapped already which
695          * can be write-protected for testing
696          */
697         if (test_uffdio_wp)
698                 wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
699                         nr_pages_per_cpu * page_size, true);
700
701         /*
702          * Continue the 2nd half of the page copying, handling write
703          * protection faults if any
704          */
705         for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
706                 copy_page_retry(uffd, page_nr * page_size);
707
708         return NULL;
709 }
710
711 static int stress(struct uffd_stats *uffd_stats)
712 {
713         unsigned long cpu;
714         pthread_t locking_threads[nr_cpus];
715         pthread_t uffd_threads[nr_cpus];
716         pthread_t background_threads[nr_cpus];
717
718         finished = 0;
719         for (cpu = 0; cpu < nr_cpus; cpu++) {
720                 if (pthread_create(&locking_threads[cpu], &attr,
721                                    locking_thread, (void *)cpu))
722                         return 1;
723                 if (bounces & BOUNCE_POLL) {
724                         if (pthread_create(&uffd_threads[cpu], &attr,
725                                            uffd_poll_thread,
726                                            (void *)&uffd_stats[cpu]))
727                                 return 1;
728                 } else {
729                         if (pthread_create(&uffd_threads[cpu], &attr,
730                                            uffd_read_thread,
731                                            (void *)&uffd_stats[cpu]))
732                                 return 1;
733                         pthread_mutex_lock(&uffd_read_mutex);
734                 }
735                 if (pthread_create(&background_threads[cpu], &attr,
736                                    background_thread, (void *)cpu))
737                         return 1;
738         }
739         for (cpu = 0; cpu < nr_cpus; cpu++)
740                 if (pthread_join(background_threads[cpu], NULL))
741                         return 1;
742
743         /*
744          * Be strict and immediately zap area_src, the whole area has
745          * been transferred already by the background treads. The
746          * area_src could then be faulted in in a racy way by still
747          * running uffdio_threads reading zeropages after we zapped
748          * area_src (but they're guaranteed to get -EEXIST from
749          * UFFDIO_COPY without writing zero pages into area_dst
750          * because the background threads already completed).
751          */
752         if (uffd_test_ops->release_pages(area_src))
753                 return 1;
754
755
756         finished = 1;
757         for (cpu = 0; cpu < nr_cpus; cpu++)
758                 if (pthread_join(locking_threads[cpu], NULL))
759                         return 1;
760
761         for (cpu = 0; cpu < nr_cpus; cpu++) {
762                 char c;
763                 if (bounces & BOUNCE_POLL) {
764                         if (write(pipefd[cpu*2+1], &c, 1) != 1) {
765                                 fprintf(stderr, "pipefd write error\n");
766                                 return 1;
767                         }
768                         if (pthread_join(uffd_threads[cpu],
769                                          (void *)&uffd_stats[cpu]))
770                                 return 1;
771                 } else {
772                         if (pthread_cancel(uffd_threads[cpu]))
773                                 return 1;
774                         if (pthread_join(uffd_threads[cpu], NULL))
775                                 return 1;
776                 }
777         }
778
779         return 0;
780 }
781
782 static int userfaultfd_open(int features)
783 {
784         struct uffdio_api uffdio_api;
785
786         uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
787         if (uffd < 0) {
788                 fprintf(stderr,
789                         "userfaultfd syscall not available in this kernel\n");
790                 return 1;
791         }
792         uffd_flags = fcntl(uffd, F_GETFD, NULL);
793
794         uffdio_api.api = UFFD_API;
795         uffdio_api.features = features;
796         if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
797                 fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to "
798                         "run with either root or ptrace capability.\n");
799                 return 1;
800         }
801         if (uffdio_api.api != UFFD_API) {
802                 fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n",
803                         (uint64_t)uffdio_api.api);
804                 return 1;
805         }
806
807         return 0;
808 }
809
810 sigjmp_buf jbuf, *sigbuf;
811
812 static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
813 {
814         if (sig == SIGBUS) {
815                 if (sigbuf)
816                         siglongjmp(*sigbuf, 1);
817                 abort();
818         }
819 }
820
821 /*
822  * For non-cooperative userfaultfd test we fork() a process that will
823  * generate pagefaults, will mremap the area monitored by the
824  * userfaultfd and at last this process will release the monitored
825  * area.
826  * For the anonymous and shared memory the area is divided into two
827  * parts, the first part is accessed before mremap, and the second
828  * part is accessed after mremap. Since hugetlbfs does not support
829  * mremap, the entire monitored area is accessed in a single pass for
830  * HUGETLB_TEST.
831  * The release of the pages currently generates event for shmem and
832  * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
833  * for hugetlb.
834  * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
835  * monitored area, generate pagefaults and test that signal is delivered.
836  * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
837  * test robustness use case - we release monitored area, fork a process
838  * that will generate pagefaults and verify signal is generated.
839  * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
840  * feature. Using monitor thread, verify no userfault events are generated.
841  */
842 static int faulting_process(int signal_test)
843 {
844         unsigned long nr;
845         unsigned long long count;
846         unsigned long split_nr_pages;
847         unsigned long lastnr;
848         struct sigaction act;
849         unsigned long signalled = 0;
850
851         if (test_type != TEST_HUGETLB)
852                 split_nr_pages = (nr_pages + 1) / 2;
853         else
854                 split_nr_pages = nr_pages;
855
856         if (signal_test) {
857                 sigbuf = &jbuf;
858                 memset(&act, 0, sizeof(act));
859                 act.sa_sigaction = sighndl;
860                 act.sa_flags = SA_SIGINFO;
861                 if (sigaction(SIGBUS, &act, 0)) {
862                         perror("sigaction");
863                         return 1;
864                 }
865                 lastnr = (unsigned long)-1;
866         }
867
868         for (nr = 0; nr < split_nr_pages; nr++) {
869                 int steps = 1;
870                 unsigned long offset = nr * page_size;
871
872                 if (signal_test) {
873                         if (sigsetjmp(*sigbuf, 1) != 0) {
874                                 if (steps == 1 && nr == lastnr) {
875                                         fprintf(stderr, "Signal repeated\n");
876                                         return 1;
877                                 }
878
879                                 lastnr = nr;
880                                 if (signal_test == 1) {
881                                         if (steps == 1) {
882                                                 /* This is a MISSING request */
883                                                 steps++;
884                                                 if (copy_page(uffd, offset))
885                                                         signalled++;
886                                         } else {
887                                                 /* This is a WP request */
888                                                 assert(steps == 2);
889                                                 wp_range(uffd,
890                                                          (__u64)area_dst +
891                                                          offset,
892                                                          page_size, false);
893                                         }
894                                 } else {
895                                         signalled++;
896                                         continue;
897                                 }
898                         }
899                 }
900
901                 count = *area_count(area_dst, nr);
902                 if (count != count_verify[nr]) {
903                         fprintf(stderr,
904                                 "nr %lu memory corruption %Lu %Lu\n",
905                                 nr, count,
906                                 count_verify[nr]);
907                 }
908                 /*
909                  * Trigger write protection if there is by writing
910                  * the same value back.
911                  */
912                 *area_count(area_dst, nr) = count;
913         }
914
915         if (signal_test)
916                 return signalled != split_nr_pages;
917
918         if (test_type == TEST_HUGETLB)
919                 return 0;
920
921         area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
922                           MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
923         if (area_dst == MAP_FAILED) {
924                 perror("mremap");
925                 exit(1);
926         }
927
928         for (; nr < nr_pages; nr++) {
929                 count = *area_count(area_dst, nr);
930                 if (count != count_verify[nr]) {
931                         fprintf(stderr,
932                                 "nr %lu memory corruption %Lu %Lu\n",
933                                 nr, count,
934                                 count_verify[nr]); exit(1);
935                 }
936                 /*
937                  * Trigger write protection if there is by writing
938                  * the same value back.
939                  */
940                 *area_count(area_dst, nr) = count;
941         }
942
943         if (uffd_test_ops->release_pages(area_dst))
944                 return 1;
945
946         for (nr = 0; nr < nr_pages; nr++) {
947                 if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) {
948                         fprintf(stderr, "nr %lu is not zero\n", nr);
949                         exit(1);
950                 }
951         }
952
953         return 0;
954 }
955
956 static void retry_uffdio_zeropage(int ufd,
957                                   struct uffdio_zeropage *uffdio_zeropage,
958                                   unsigned long offset)
959 {
960         uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
961                                      uffdio_zeropage->range.len,
962                                      offset);
963         if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
964                 if (uffdio_zeropage->zeropage != -EEXIST) {
965                         uffd_error(uffdio_zeropage->zeropage,
966                                    "UFFDIO_ZEROPAGE retry error");
967                 }
968         } else {
969                 uffd_error(uffdio_zeropage->zeropage,
970                            "UFFDIO_ZEROPAGE retry unexpected");
971         }
972 }
973
974 static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
975 {
976         struct uffdio_zeropage uffdio_zeropage;
977         int ret;
978         unsigned long has_zeropage;
979         __s64 res;
980
981         has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
982
983         if (offset >= nr_pages * page_size) {
984                 fprintf(stderr, "unexpected offset %lu\n", offset);
985                 exit(1);
986         }
987         uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
988         uffdio_zeropage.range.len = page_size;
989         uffdio_zeropage.mode = 0;
990         ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
991         res = uffdio_zeropage.zeropage;
992         if (ret) {
993                 /* real retval in ufdio_zeropage.zeropage */
994                 if (has_zeropage) {
995                         uffd_error(res, "UFFDIO_ZEROPAGE %s",
996                                    res == -EEXIST ? "-EEXIST" : "error");
997                 } else if (res != -EINVAL)
998                         uffd_error(res, "UFFDIO_ZEROPAGE not -EINVAL");
999         } else if (has_zeropage) {
1000                 if (res != page_size) {
1001                         uffd_error(res, "UFFDIO_ZEROPAGE unexpected");
1002                 } else {
1003                         if (test_uffdio_zeropage_eexist && retry) {
1004                                 test_uffdio_zeropage_eexist = false;
1005                                 retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1006                                                       offset);
1007                         }
1008                         return 1;
1009                 }
1010         } else
1011                 uffd_error(res, "UFFDIO_ZEROPAGE succeeded");
1012
1013         return 0;
1014 }
1015
/*
 * Convenience wrapper around __uffdio_zeropage() with the retry step
 * disabled. Returns whatever __uffdio_zeropage() returns.
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	const bool with_retry = false;

	return __uffdio_zeropage(ufd, offset, with_retry);
}
1020
1021 /* exercise UFFDIO_ZEROPAGE */
1022 static int userfaultfd_zeropage_test(void)
1023 {
1024         struct uffdio_register uffdio_register;
1025         unsigned long expected_ioctls;
1026
1027         printf("testing UFFDIO_ZEROPAGE: ");
1028         fflush(stdout);
1029
1030         if (uffd_test_ops->release_pages(area_dst))
1031                 return 1;
1032
1033         if (userfaultfd_open(0))
1034                 return 1;
1035         uffdio_register.range.start = (unsigned long) area_dst;
1036         uffdio_register.range.len = nr_pages * page_size;
1037         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1038         if (test_uffdio_wp)
1039                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1040         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1041                 fprintf(stderr, "register failure\n");
1042                 exit(1);
1043         }
1044
1045         expected_ioctls = uffd_test_ops->expected_ioctls;
1046         if ((uffdio_register.ioctls & expected_ioctls) !=
1047             expected_ioctls) {
1048                 fprintf(stderr,
1049                         "unexpected missing ioctl for anon memory\n");
1050                 exit(1);
1051         }
1052
1053         if (uffdio_zeropage(uffd, 0)) {
1054                 if (my_bcmp(area_dst, zeropage, page_size)) {
1055                         fprintf(stderr, "zeropage is not zero\n");
1056                         exit(1);
1057                 }
1058         }
1059
1060         close(uffd);
1061         printf("done.\n");
1062         return 0;
1063 }
1064
1065 static int userfaultfd_events_test(void)
1066 {
1067         struct uffdio_register uffdio_register;
1068         unsigned long expected_ioctls;
1069         pthread_t uffd_mon;
1070         int err, features;
1071         pid_t pid;
1072         char c;
1073         struct uffd_stats stats = { 0 };
1074
1075         printf("testing events (fork, remap, remove): ");
1076         fflush(stdout);
1077
1078         if (uffd_test_ops->release_pages(area_dst))
1079                 return 1;
1080
1081         features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1082                 UFFD_FEATURE_EVENT_REMOVE;
1083         if (userfaultfd_open(features))
1084                 return 1;
1085         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1086
1087         uffdio_register.range.start = (unsigned long) area_dst;
1088         uffdio_register.range.len = nr_pages * page_size;
1089         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1090         if (test_uffdio_wp)
1091                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1092         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1093                 fprintf(stderr, "register failure\n");
1094                 exit(1);
1095         }
1096
1097         expected_ioctls = uffd_test_ops->expected_ioctls;
1098         if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1099                 fprintf(stderr, "unexpected missing ioctl for anon memory\n");
1100                 exit(1);
1101         }
1102
1103         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1104                 perror("uffd_poll_thread create");
1105                 exit(1);
1106         }
1107
1108         pid = fork();
1109         if (pid < 0) {
1110                 perror("fork");
1111                 exit(1);
1112         }
1113
1114         if (!pid)
1115                 return faulting_process(0);
1116
1117         waitpid(pid, &err, 0);
1118         if (err) {
1119                 fprintf(stderr, "faulting process failed\n");
1120                 exit(1);
1121         }
1122
1123         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1124                 perror("pipe write");
1125                 exit(1);
1126         }
1127         if (pthread_join(uffd_mon, NULL))
1128                 return 1;
1129
1130         close(uffd);
1131
1132         uffd_stats_report(&stats, 1);
1133
1134         return stats.missing_faults != nr_pages;
1135 }
1136
1137 static int userfaultfd_sig_test(void)
1138 {
1139         struct uffdio_register uffdio_register;
1140         unsigned long expected_ioctls;
1141         unsigned long userfaults;
1142         pthread_t uffd_mon;
1143         int err, features;
1144         pid_t pid;
1145         char c;
1146         struct uffd_stats stats = { 0 };
1147
1148         printf("testing signal delivery: ");
1149         fflush(stdout);
1150
1151         if (uffd_test_ops->release_pages(area_dst))
1152                 return 1;
1153
1154         features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1155         if (userfaultfd_open(features))
1156                 return 1;
1157         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1158
1159         uffdio_register.range.start = (unsigned long) area_dst;
1160         uffdio_register.range.len = nr_pages * page_size;
1161         uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1162         if (test_uffdio_wp)
1163                 uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1164         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1165                 fprintf(stderr, "register failure\n");
1166                 exit(1);
1167         }
1168
1169         expected_ioctls = uffd_test_ops->expected_ioctls;
1170         if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1171                 fprintf(stderr, "unexpected missing ioctl for anon memory\n");
1172                 exit(1);
1173         }
1174
1175         if (faulting_process(1)) {
1176                 fprintf(stderr, "faulting process failed\n");
1177                 exit(1);
1178         }
1179
1180         if (uffd_test_ops->release_pages(area_dst))
1181                 return 1;
1182
1183         if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1184                 perror("uffd_poll_thread create");
1185                 exit(1);
1186         }
1187
1188         pid = fork();
1189         if (pid < 0) {
1190                 perror("fork");
1191                 exit(1);
1192         }
1193
1194         if (!pid)
1195                 exit(faulting_process(2));
1196
1197         waitpid(pid, &err, 0);
1198         if (err) {
1199                 fprintf(stderr, "faulting process failed\n");
1200                 exit(1);
1201         }
1202
1203         if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1204                 perror("pipe write");
1205                 exit(1);
1206         }
1207         if (pthread_join(uffd_mon, (void **)&userfaults))
1208                 return 1;
1209
1210         printf("done.\n");
1211         if (userfaults)
1212                 fprintf(stderr, "Signal test failed, userfaults: %ld\n",
1213                         userfaults);
1214         close(uffd);
1215         return userfaults != 0;
1216 }
1217
1218 static int userfaultfd_stress(void)
1219 {
1220         void *area;
1221         char *tmp_area;
1222         unsigned long nr;
1223         struct uffdio_register uffdio_register;
1224         unsigned long cpu;
1225         int err;
1226         struct uffd_stats uffd_stats[nr_cpus];
1227
1228         uffd_test_ops->allocate_area((void **)&area_src);
1229         if (!area_src)
1230                 return 1;
1231         uffd_test_ops->allocate_area((void **)&area_dst);
1232         if (!area_dst)
1233                 return 1;
1234
1235         if (userfaultfd_open(0))
1236                 return 1;
1237
1238         count_verify = malloc(nr_pages * sizeof(unsigned long long));
1239         if (!count_verify) {
1240                 perror("count_verify");
1241                 return 1;
1242         }
1243
1244         for (nr = 0; nr < nr_pages; nr++) {
1245                 *area_mutex(area_src, nr) = (pthread_mutex_t)
1246                         PTHREAD_MUTEX_INITIALIZER;
1247                 count_verify[nr] = *area_count(area_src, nr) = 1;
1248                 /*
1249                  * In the transition between 255 to 256, powerpc will
1250                  * read out of order in my_bcmp and see both bytes as
1251                  * zero, so leave a placeholder below always non-zero
1252                  * after the count, to avoid my_bcmp to trigger false
1253                  * positives.
1254                  */
1255                 *(area_count(area_src, nr) + 1) = 1;
1256         }
1257
1258         pipefd = malloc(sizeof(int) * nr_cpus * 2);
1259         if (!pipefd) {
1260                 perror("pipefd");
1261                 return 1;
1262         }
1263         for (cpu = 0; cpu < nr_cpus; cpu++) {
1264                 if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
1265                         perror("pipe");
1266                         return 1;
1267                 }
1268         }
1269
1270         if (posix_memalign(&area, page_size, page_size)) {
1271                 fprintf(stderr, "out of memory\n");
1272                 return 1;
1273         }
1274         zeropage = area;
1275         bzero(zeropage, page_size);
1276
1277         pthread_mutex_lock(&uffd_read_mutex);
1278
1279         pthread_attr_init(&attr);
1280         pthread_attr_setstacksize(&attr, 16*1024*1024);
1281
1282         err = 0;
1283         while (bounces--) {
1284                 unsigned long expected_ioctls;
1285
1286                 printf("bounces: %d, mode:", bounces);
1287                 if (bounces & BOUNCE_RANDOM)
1288                         printf(" rnd");
1289                 if (bounces & BOUNCE_RACINGFAULTS)
1290                         printf(" racing");
1291                 if (bounces & BOUNCE_VERIFY)
1292                         printf(" ver");
1293                 if (bounces & BOUNCE_POLL)
1294                         printf(" poll");
1295                 else
1296                         printf(" read");
1297                 printf(", ");
1298                 fflush(stdout);
1299
1300                 if (bounces & BOUNCE_POLL)
1301                         fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1302                 else
1303                         fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1304
1305                 /* register */
1306                 uffdio_register.range.start = (unsigned long) area_dst;
1307                 uffdio_register.range.len = nr_pages * page_size;
1308                 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1309                 if (test_uffdio_wp)
1310                         uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1311                 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1312                         fprintf(stderr, "register failure\n");
1313                         return 1;
1314                 }
1315                 expected_ioctls = uffd_test_ops->expected_ioctls;
1316                 if ((uffdio_register.ioctls & expected_ioctls) !=
1317                     expected_ioctls) {
1318                         fprintf(stderr,
1319                                 "unexpected missing ioctl for anon memory\n");
1320                         return 1;
1321                 }
1322
1323                 if (area_dst_alias) {
1324                         uffdio_register.range.start = (unsigned long)
1325                                 area_dst_alias;
1326                         if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1327                                 fprintf(stderr, "register failure alias\n");
1328                                 return 1;
1329                         }
1330                 }
1331
1332                 /*
1333                  * The madvise done previously isn't enough: some
1334                  * uffd_thread could have read userfaults (one of
1335                  * those already resolved by the background thread)
1336                  * and it may be in the process of calling
1337                  * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1338                  * area_src and it would map a zero page in it (of
1339                  * course such a UFFDIO_COPY is perfectly safe as it'd
1340                  * return -EEXIST). The problem comes at the next
1341                  * bounce though: that racing UFFDIO_COPY would
1342                  * generate zeropages in the area_src, so invalidating
1343                  * the previous MADV_DONTNEED. Without this additional
1344                  * MADV_DONTNEED those zeropages leftovers in the
1345                  * area_src would lead to -EEXIST failure during the
1346                  * next bounce, effectively leaving a zeropage in the
1347                  * area_dst.
1348                  *
1349                  * Try to comment this out madvise to see the memory
1350                  * corruption being caught pretty quick.
1351                  *
1352                  * khugepaged is also inhibited to collapse THP after
1353                  * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1354                  * required to MADV_DONTNEED here.
1355                  */
1356                 if (uffd_test_ops->release_pages(area_dst))
1357                         return 1;
1358
1359                 uffd_stats_reset(uffd_stats, nr_cpus);
1360
1361                 /* bounce pass */
1362                 if (stress(uffd_stats))
1363                         return 1;
1364
1365                 /* Clear all the write protections if there is any */
1366                 if (test_uffdio_wp)
1367                         wp_range(uffd, (unsigned long)area_dst,
1368                                  nr_pages * page_size, false);
1369
1370                 /* unregister */
1371                 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
1372                         fprintf(stderr, "unregister failure\n");
1373                         return 1;
1374                 }
1375                 if (area_dst_alias) {
1376                         uffdio_register.range.start = (unsigned long) area_dst;
1377                         if (ioctl(uffd, UFFDIO_UNREGISTER,
1378                                   &uffdio_register.range)) {
1379                                 fprintf(stderr, "unregister failure alias\n");
1380                                 return 1;
1381                         }
1382                 }
1383
1384                 /* verification */
1385                 if (bounces & BOUNCE_VERIFY) {
1386                         for (nr = 0; nr < nr_pages; nr++) {
1387                                 if (*area_count(area_dst, nr) != count_verify[nr]) {
1388                                         fprintf(stderr,
1389                                                 "error area_count %Lu %Lu %lu\n",
1390                                                 *area_count(area_src, nr),
1391                                                 count_verify[nr],
1392                                                 nr);
1393                                         err = 1;
1394                                         bounces = 0;
1395                                 }
1396                         }
1397                 }
1398
1399                 /* prepare next bounce */
1400                 tmp_area = area_src;
1401                 area_src = area_dst;
1402                 area_dst = tmp_area;
1403
1404                 tmp_area = area_src_alias;
1405                 area_src_alias = area_dst_alias;
1406                 area_dst_alias = tmp_area;
1407
1408                 uffd_stats_report(uffd_stats, nr_cpus);
1409         }
1410
1411         if (err)
1412                 return err;
1413
1414         close(uffd);
1415         return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1416                 || userfaultfd_events_test();
1417 }
1418
/*
 * Copied from mlock2-tests.c
 *
 * Scan /proc/meminfo for the "Hugepagesize:" entry and return its value
 * converted from kB to bytes. Returns 0 when the file cannot be opened
 * or no such entry is found.
 */
unsigned long default_huge_page_size(void)
{
	unsigned long size_bytes = 0;
	size_t cap = 0;
	char *buf = NULL;
	FILE *meminfo;

	meminfo = fopen("/proc/meminfo", "r");
	if (meminfo == NULL)
		return 0;

	for (;;) {
		if (getline(&buf, &cap, meminfo) <= 0)
			break;
		if (sscanf(buf, "Hugepagesize:       %lu kB",
			   &size_bytes) != 1)
			continue;
		/* The value is reported in kB; turn it into bytes. */
		size_bytes <<= 10;
		break;
	}

	free(buf);
	fclose(meminfo);
	return size_bytes;
}
1442
1443 static void set_test_type(const char *type)
1444 {
1445         if (!strcmp(type, "anon")) {
1446                 test_type = TEST_ANON;
1447                 uffd_test_ops = &anon_uffd_test_ops;
1448                 /* Only enable write-protect test for anonymous test */
1449                 test_uffdio_wp = true;
1450         } else if (!strcmp(type, "hugetlb")) {
1451                 test_type = TEST_HUGETLB;
1452                 uffd_test_ops = &hugetlb_uffd_test_ops;
1453         } else if (!strcmp(type, "hugetlb_shared")) {
1454                 map_shared = true;
1455                 test_type = TEST_HUGETLB;
1456                 uffd_test_ops = &hugetlb_uffd_test_ops;
1457         } else if (!strcmp(type, "shmem")) {
1458                 map_shared = true;
1459                 test_type = TEST_SHMEM;
1460                 uffd_test_ops = &shmem_uffd_test_ops;
1461         } else {
1462                 fprintf(stderr, "Unknown test type: %s\n", type); exit(1);
1463         }
1464
1465         if (test_type == TEST_HUGETLB)
1466                 page_size = default_huge_page_size();
1467         else
1468                 page_size = sysconf(_SC_PAGE_SIZE);
1469
1470         if (!page_size) {
1471                 fprintf(stderr, "Unable to determine page size\n");
1472                 exit(2);
1473         }
1474         if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1475             > page_size) {
1476                 fprintf(stderr, "Impossible to run this test\n");
1477                 exit(2);
1478         }
1479 }
1480
1481 static void sigalrm(int sig)
1482 {
1483         if (sig != SIGALRM)
1484                 abort();
1485         test_uffdio_copy_eexist = true;
1486         test_uffdio_zeropage_eexist = true;
1487         alarm(ALARM_INTERVAL_SECS);
1488 }
1489
1490 int main(int argc, char **argv)
1491 {
1492         if (argc < 4)
1493                 usage();
1494
1495         if (signal(SIGALRM, sigalrm) == SIG_ERR) {
1496                 fprintf(stderr, "failed to arm SIGALRM");
1497                 exit(1);
1498         }
1499         alarm(ALARM_INTERVAL_SECS);
1500
1501         set_test_type(argv[1]);
1502
1503         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1504         nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1505                 nr_cpus;
1506         if (!nr_pages_per_cpu) {
1507                 fprintf(stderr, "invalid MiB\n");
1508                 usage();
1509         }
1510
1511         bounces = atoi(argv[3]);
1512         if (bounces <= 0) {
1513                 fprintf(stderr, "invalid bounces\n");
1514                 usage();
1515         }
1516         nr_pages = nr_pages_per_cpu * nr_cpus;
1517
1518         if (test_type == TEST_HUGETLB) {
1519                 if (argc < 5)
1520                         usage();
1521                 huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1522                 if (huge_fd < 0) {
1523                         fprintf(stderr, "Open of %s failed", argv[3]);
1524                         perror("open");
1525                         exit(1);
1526                 }
1527                 if (ftruncate(huge_fd, 0)) {
1528                         fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
1529                         perror("ftruncate");
1530                         exit(1);
1531                 }
1532         }
1533         printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1534                nr_pages, nr_pages_per_cpu);
1535         return userfaultfd_stress();
1536 }
1537
1538 #else /* __NR_userfaultfd */
1539
1540 #warning "missing __NR_userfaultfd definition"
1541
1542 int main(void)
1543 {
1544         printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1545         return KSFT_SKIP;
1546 }
1547
1548 #endif /* __NR_userfaultfd */