1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_LINUX_BTRFS_H
48 #include <linux/btrfs.h>
51 #ifdef HAVE_FANOTIFY_INIT
52 #include <sys/fanotify.h>
55 #include "systemd/sd-daemon.h"
61 #include "readahead-common.h"
66 * - detect ssd on btrfs/lvm...
67 * - read ahead directories
70 * - handle files where nothing is in mincore
71 * - does ioprio_set work with fadvise()?
/* Shared-memory segment used to coordinate with the readahead replay
 * process: we publish our own PID into it (see main_collect()) and skip
 * fanotify events whose pid matches shared->replay. */
static ReadaheadShared *shared = NULL;

/* Monotonic timestamp taken at the start of collection; per-file access
 * times are binned relative to this (see entry->bin in collect()). */
static usec_t starttime;

/* Avoid collisions with the NULL pointer */
#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referred to by fd (BTRFS_IOC_DEFRAG),
 * so that its extents end up contiguous on disk. Returns the raw ioctl()
 * result: 0 on success, -1 with errno set on failure.
 *
 * NOTE(review): the closing brace of this function is elided in this
 * excerpt. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data = { .fd = fd };

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Append one file's readahead record to the pack stream: its inode number
 * followed by the list of page ranges currently resident in the page cache,
 * as reported by mincore(), so the replayer can fault in exactly those
 * pages later.
 *
 * NOTE(review): this excerpt is missing intermediate lines (declarations of
 * st/l/pages/vec/inode/b/c/mapped, error-handling branches, the on_btrfs
 * defrag call, and cleanup); comments below describe only what is visible.
 * Confirm details against the full source. */
static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
        void *start = MAP_FAILED;
        int r = 0, fd = -1, k;

        /* O_NOFOLLOW: refuse symlinks; O_NOATIME: don't perturb atime */
        fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
        /* Permission errors are treated as non-fatal skip cases */
        if (errno == EPERM || errno == EACCES)
        log_warning("open(%s) failed: %m", fn);

        k = file_verify(fd, fn, arg_file_size_max, &st);

        /* Map the file (size rounded up to page granularity) read-only so
         * we can query per-page residency with mincore() */
        l = PAGE_ALIGN(st.st_size);
        start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
        if (start == MAP_FAILED) {
                log_warning("mmap(%s) failed: %m", fn);

        pages = l / page_size();
        vec = alloca0(pages);
        if (mincore(start, l, vec) < 0) {
                log_warning("mincore(%s) failed: %m", fn);

        /* Store the inode, so that we notice when the file is deleted */
        inode = (uint64_t) st.st_ino;
        fwrite(&inode, sizeof(inode), 1, pack);

        /* Walk the mincore() vector and emit [b, c) runs of resident pages */
        for (c = 0; c < pages; c++) {
                bool new_mapped = !!(vec[c] & 1);        /* low bit = resident */

                if (!mapped && new_mapped)
                else if (mapped && !new_mapped) {
                        fwrite(&b, sizeof(b), 1, pack);
                        fwrite(&c, sizeof(c), 1, pack);

                        log_debug("%s: page %u to %u", fn, b, c);

        /* We don't write any range data if we should read the entire file */
        if (mapped && b > 0) {
                fwrite(&b, sizeof(b), 1, pack);
                fwrite(&c, sizeof(c), 1, pack);

                log_debug("%s: page %u to %u", fn, b, c);

        /* Two identical writes of b — presumably an end-of-ranges marker
         * (the assignment giving b its terminal value is elided here);
         * TODO confirm against the full source */
        fwrite(&b, sizeof(b), 1, pack);
        fwrite(&b, sizeof(b), 1, pack);

        /* Cleanup path: only unmap if the mapping actually succeeded */
        if (start != MAP_FAILED)
/* Return the physical byte offset of the file's first extent, obtained via
 * the FS_IOC_FIEMAP ioctl, so callers can sort files by on-disk location.
 * The early-return branches (bodies elided here) bail out when the ioctl
 * fails, no extents are mapped, or the extent location is unknown.
 *
 * NOTE(review): the declarations below look garbled in this excerpt — the
 * designated initializers suggest a local struct `data` wrapping a fiemap
 * header plus a single fiemap_extent; confirm against the full source. */
static unsigned long fd_first_block(int fd) {
        struct fiemap fiemap;
        struct fiemap_extent extent;
                .fiemap.fm_length = ~0ULL,        /* map the whole file */
                .fiemap.fm_extent_count = 1,      /* only the first extent is needed */

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)

        if (data.fiemap.fm_mapped_extents <= 0)

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: orders primarily by access-time bin,
 * then by first physical block (ascending), and finally by path so that the
 * ordering is total and deterministic.
 *
 * NOTE(review): the bin comparison and the assignments of i/j from a/b are
 * elided in this excerpt. */
static int qsort_compare(const void *a, const void *b) {
        const struct item *i, *j;

        /* sort by bin first */

        /* then sort by sector */
        if (i->block < j->block)
        if (i->block > j->block)

        /* fall back to the path for a stable, total order */
        return strcmp(i->path, j->path);
/* Core collection pass: watch all file opens below `root` through a
 * mount-wide fanotify mark for a bounded time window, record each accessed
 * file together with its access-time bin and first physical block, then
 * write everything out as a ".readahead" pack file (via pack_file()) for
 * the replay step. Termination is driven by poll() over three fds: the
 * fanotify fd, a signalfd for SIGINT/SIGTERM, and an inotify fd watching
 * for "cancel"/"done" marker files in /run/systemd/readahead.
 *
 * NOTE(review): this excerpt is heavily elided — enum/struct wrappers,
 * many declarations (mask, not_after, pack, sfs, my_pid, etc.), goto
 * labels, loop headers and closing braces are missing. Comments below
 * describe only the visible lines. */
static int collect(const char *root) {
        FD_FANOTIFY, /* Get the actual fs events */
        FD_INOTIFY, /* We get notifications to quit early via this fd */
        struct pollfd pollfd[_FD_MAX] = {};
        int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
        Hashmap *files = NULL;
        char *pack_fn_new = NULL, *pack_fn = NULL;
        bool on_ssd, on_btrfs;
        uint64_t previous_block_readahead;
        bool previous_block_readahead_set = false;

        /* Two asprintf() variants for the pack path — presumably the two
         * arms of an elided preprocessor conditional on arg_savedir;
         * TODO confirm against the full source */
        if (asprintf(&pack_fn, "%s/.readahead", arg_savedir ? arg_savedir : root) < 0) {
        if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {

        starttime = now(CLOCK_MONOTONIC);

        /* If there's no pack file yet we lower the kernel readahead
         * so that mincore() is accurate. If there is a pack file
         * already we assume it is accurate enough so that kernel
         * readahead is never triggered. */
        previous_block_readahead_set =
                access(pack_fn, F_OK) < 0 &&
                block_get_readahead(root, &previous_block_readahead) >= 0 &&
                block_set_readahead(root, 8*1024) >= 0;

        /* Run with idle I/O priority so collection doesn't slow down boot */
        if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
                log_warning("Failed to set IDLE IO priority class: %m");

        /* Block SIGINT/SIGTERM and take them via signalfd so they can be
         * multiplexed in the poll() loop below */
        assert_se(sigemptyset(&mask) == 0);
        sigset_add_many(&mask, SIGINT, SIGTERM, -1);
        assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);

        if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");

        files = hashmap_new(string_hash_func, string_compare_func);
                log_error("Failed to allocate set.");

        /* Watch every open() on the entire mount containing root */
        fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
        if (fanotify_fd < 0) {
                log_error("Failed to create fanotify object: %m");

        if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
                log_error("Failed to mark %s: %m", root);

        inotify_fd = open_inotify();
        if (inotify_fd < 0) {

        /* Hard deadline for the whole collection phase */
        not_after = now(CLOCK_MONOTONIC) + arg_timeout;

        pollfd[FD_FANOTIFY].fd = fanotify_fd;
        pollfd[FD_FANOTIFY].events = POLLIN;
        pollfd[FD_SIGNAL].fd = signal_fd;
        pollfd[FD_SIGNAL].events = POLLIN;
        pollfd[FD_INOTIFY].fd = inotify_fd;
        pollfd[FD_INOTIFY].events = POLLIN;

        /* Trailing argument of an elided sd_notify() status update */
        "STATUS=Collecting readahead data");

        log_debug("Collecting...");

        /* Out-of-band stop requests are plain marker files in /run */
        if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
                log_debug("Collection canceled");

        if (access("/run/systemd/readahead/done", F_OK) >= 0) {
                log_debug("Got termination request");

        struct fanotify_event_metadata metadata;
        struct fanotify_event_metadata *m;

        /* Stop once enough distinct files have been recorded */
        if (hashmap_size(files) > arg_files_max) {
                log_debug("Reached maximum number of read ahead files, ending collection.");

        t = now(CLOCK_MONOTONIC);
        if (t >= not_after) {
                log_debug("Reached maximum collection time, ending collection.");

        /* Wait for events, but never past the deadline */
        if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
                log_error("poll(): %m");

                log_debug("Reached maximum collection time, ending collection.");

        if (pollfd[FD_SIGNAL].revents) {
                log_debug("Got signal.");

        if (pollfd[FD_INOTIFY].revents) {
                uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
                struct inotify_event *e;

                if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
                        if (errno == EINTR || errno == EAGAIN)

                        log_error("Failed to read inotify event: %m");

                e = (struct inotify_event*) inotify_buffer;

                /* The same cancel/done markers, delivered as IN_CREATE events */
                if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
                        log_debug("Collection canceled");

                if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
                        log_debug("Got termination request");

                /* Advance to the next variable-length event in the buffer */
                step = sizeof(struct inotify_event) + e->len;
                assert(step <= (size_t) n);

                e = (struct inotify_event*) ((uint8_t*) e + step);

        n = read(fanotify_fd, &data, sizeof(data));
                if (errno == EINTR || errno == EAGAIN)

                /* fanotify sometimes returns EACCES on read()
                 * where it shouldn't. For now let's just
                 * ignore it here (which is safe), but
                 * eventually this should be
                 * dropped when the kernel is fixed.
                 *
                 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */

                log_error("Failed to read event: %m");

        /* Iterate over all fanotify events packed into this read() */
        for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
                char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];

                /* Ignore accesses made by this process itself... */
                if (m->pid == my_pid)

                /* ...and by the readahead replay process (barrier pairs
                 * with the writer of shared->replay) */
                __sync_synchronize();
                if (m->pid == shared->replay)

                /* Resolve the event's fd back to a path name via /proc */
                snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
                k = readlink_malloc(fn, &p);
                        /* Skip temp files, deleted files, and duplicates */
                        if (startswith(p, "/tmp") ||
                            endswith(p, " (deleted)") ||
                            hashmap_get(files, p))
                                /* Not interesting, or

                        entry = new0(struct item, 1);

                        ul = fd_first_block(m->fd);

                        entrytime = now(CLOCK_MONOTONIC);

                        entry->path = strdup(p);

                        /* Bin accesses into 2-second buckets since start */
                        entry->bin = (entrytime - starttime) / 2000000;

                        k = hashmap_put(files, p, entry);
                                log_warning("hashmap_put() failed: %s", strerror(-k));

                        log_warning("readlink(%s) failed: %s", fn, strerror(-k));

        /* Collection finished: stop listening before writing the pack */
        fanotify_fd = safe_close(fanotify_fd);

        log_debug("Writing Pack File...");

        on_ssd = fs_on_ssd(root) > 0;
        log_debug("On SSD: %s", yes_no(on_ssd));

        on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
        log_debug("On btrfs: %s", yes_no(on_btrfs));

        /* Same duplicated-asprintf pattern as for pack_fn above —
         * presumably elided preprocessor arms; TODO confirm */
        if (asprintf(&pack_fn_new, "%s/.readahead.new", arg_savedir ? arg_savedir : root) < 0) {
        if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {

        /* Write into a ".new" temporary, renamed into place at the end */
        pack = fopen(pack_fn_new, "we");
                log_error("Failed to open pack file: %m");

        /* Header: host/version magic plus an 'S'(SSD)/'R'(rotating) flag */
        fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
        putc(on_ssd ? 'S' : 'R', pack);

        if (on_ssd || on_btrfs) {

                /* On SSD or on btrfs, just write things out in the
                 * order the files were accessed. */

                HASHMAP_FOREACH_KEY(q, p, files, i)
                        pack_file(pack, p, on_btrfs);

                /* On rotating media, order things by the block
                 * (remainder of this comment elided in the excerpt) */

                log_debug("Ordering...");

                n = hashmap_size(files);
                        _cleanup_free_ struct item *ordered;

                        ordered = new(struct item, n);

                        /* Flatten the hashmap into a contiguous array for qsort() */
                        HASHMAP_FOREACH_KEY(q, p, files, i) {
                                memcpy(j, q, sizeof(struct item));

                        assert(ordered + n == j);

                        qsort(ordered, n, sizeof(struct item), qsort_compare);

                        for (k = 0; k < n; k++)
                                pack_file(pack, ordered[k].path, on_btrfs);

                        log_warning("No pack files");

        log_debug("Finalizing...");

                log_error("Failed to write pack file.");

        /* Atomically replace any previous pack file */
        if (rename(pack_fn_new, pack_fn) < 0) {
                log_error("Failed to rename readahead file: %m");

        /* Common cleanup path */
        safe_close(fanotify_fd);
        safe_close(signal_fd);
        safe_close(inotify_fd);

        /* Drain and free remaining hashmap keys */
        while ((p = hashmap_steal_first_key(files)))

        if (previous_block_readahead_set) {

                /* Restore the original kernel readahead setting if we
                 * changed it, and nobody has overwritten it since */
                if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
                        block_set_readahead(root, previous_block_readahead);
627 int main_collect(const char *root) {
632 /* Skip this step on read-only media. Note that we check the
 * underlying block device here, not the read-only flag of the
634 * file system on top, since that one is most likely mounted
635 * read-only anyway at boot, even if the underlying block
636 * device is theoretically writable. */
637 if (fs_on_read_only(root) > 0) {
638 log_info("Disabling readahead collector due to read-only media.");
643 log_info("Disabling readahead collector due to low memory.");
647 shared = shared_get();
651 shared->collect = getpid();
652 __sync_synchronize();
654 if (collect(root) < 0)