1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
47 #ifdef HAVE_LINUX_BTRFS_H
48 #include <linux/btrfs.h>
51 #ifdef HAVE_FANOTIFY_INIT
52 #include <sys/fanotify.h>
55 #include "systemd/sd-daemon.h"
61 #include "readahead-common.h"
66 * - detect ssd on btrfs/lvm...
67 * - read ahead directories
70 * - handle files where nothing is in mincore
71 * - does ioprio_set work with fadvise()?
/* Shared-memory segment used to coordinate with the readahead replay
 * process: we publish our own PID into it (see main_collect()) and skip
 * fanotify events whose pid matches shared->replay. */
static ReadaheadShared *shared = NULL;

/* Monotonic timestamp taken at the start of collection; per-file access
 * times are binned relative to this (see entry->bin in collect()). */
static usec_t starttime;

/* Avoid collisions with the NULL pointer */
#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referred to by fd (BTRFS_IOC_DEFRAG),
 * so that its extents end up contiguous on disk. Returns the raw ioctl()
 * result: 0 on success, -1 with errno set on failure.
 *
 * NOTE(review): the closing brace of this function is elided in this
 * excerpt. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data = { .fd = fd };

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
/* Append one file's readahead record to the pack stream: its inode number
 * followed by the list of page ranges currently resident in the page cache,
 * as reported by mincore(), so the replayer can fault in exactly those
 * pages later.
 *
 * NOTE(review): this excerpt is missing intermediate lines (declarations of
 * st/l/pages/vec/inode/b/c/mapped, error-handling branches, the on_btrfs
 * defrag call, and cleanup); comments below describe only what is visible.
 * Confirm details against the full source. */
static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
        void *start = MAP_FAILED;
        int r = 0, fd = -1, k;

        /* O_NOFOLLOW: refuse symlinks; O_NOATIME: don't perturb atime */
        fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
        /* Permission errors are treated as non-fatal skip cases */
        if (errno == EPERM || errno == EACCES)
        log_warning("open(%s) failed: %m", fn);

        k = file_verify(fd, fn, arg_file_size_max, &st);

        /* Map the file (size rounded up to page granularity) read-only so
         * we can query per-page residency with mincore() */
        l = PAGE_ALIGN(st.st_size);
        start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
        if (start == MAP_FAILED) {
                log_warning("mmap(%s) failed: %m", fn);

        pages = l / page_size();
        vec = alloca0(pages);
        if (mincore(start, l, vec) < 0) {
                log_warning("mincore(%s) failed: %m", fn);

        /* Store the inode, so that we notice when the file is deleted */
        inode = (uint64_t) st.st_ino;
        fwrite(&inode, sizeof(inode), 1, pack);

        /* Walk the mincore() vector and emit [b, c) runs of resident pages */
        for (c = 0; c < pages; c++) {
                bool new_mapped = !!(vec[c] & 1);        /* low bit = resident */

                if (!mapped && new_mapped)
                else if (mapped && !new_mapped) {
                        fwrite(&b, sizeof(b), 1, pack);
                        fwrite(&c, sizeof(c), 1, pack);

                        log_debug("%s: page %u to %u", fn, b, c);

        /* We don't write any range data if we should read the entire file */
        if (mapped && b > 0) {
                fwrite(&b, sizeof(b), 1, pack);
                fwrite(&c, sizeof(c), 1, pack);

                log_debug("%s: page %u to %u", fn, b, c);

        /* Two identical writes of b — presumably an end-of-ranges marker
         * (the assignment giving b its terminal value is elided here);
         * TODO confirm against the full source */
        fwrite(&b, sizeof(b), 1, pack);
        fwrite(&b, sizeof(b), 1, pack);

        /* Cleanup path: only unmap if the mapping actually succeeded */
        if (start != MAP_FAILED)
/* Return the physical byte offset of the file's first extent, obtained via
 * the FS_IOC_FIEMAP ioctl, so callers can sort files by on-disk location.
 * The early-return branches (bodies elided here) bail out when the ioctl
 * fails, no extents are mapped, or the extent location is unknown.
 *
 * NOTE(review): the declarations below look garbled in this excerpt — the
 * designated initializers suggest a local struct `data` wrapping a fiemap
 * header plus a single fiemap_extent; confirm against the full source. */
static unsigned long fd_first_block(int fd) {
        struct fiemap fiemap;
        struct fiemap_extent extent;
                .fiemap.fm_length = ~0ULL,        /* map the whole file */
                .fiemap.fm_extent_count = 1,      /* only the first extent is needed */

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)

        if (data.fiemap.fm_mapped_extents <= 0)

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
/* qsort() comparator for struct item: orders primarily by access-time bin,
 * then by first physical block (ascending), and finally by path so that the
 * ordering is total and deterministic.
 *
 * NOTE(review): the bin comparison and the assignments of i/j from a/b are
 * elided in this excerpt. */
static int qsort_compare(const void *a, const void *b) {
        const struct item *i, *j;

        /* sort by bin first */

        /* then sort by sector */
        if (i->block < j->block)
        if (i->block > j->block)

        /* fall back to the path for a stable, total order */
        return strcmp(i->path, j->path);
/* Core collection pass: watch all file opens below `root` through a
 * mount-wide fanotify mark for a bounded time window, record each accessed
 * file together with its access-time bin and first physical block, then
 * write everything out as a ".readahead" pack file (via pack_file()) for
 * the replay step. Termination is driven by poll() over three fds: the
 * fanotify fd, a signalfd for SIGINT/SIGTERM, and an inotify fd watching
 * for "cancel"/"done" marker files in /run/systemd/readahead.
 *
 * NOTE(review): this excerpt is heavily elided — enum/struct wrappers,
 * many declarations (mask, not_after, pack, sfs, my_pid, etc.), goto
 * labels, loop headers and closing braces are missing. Comments below
 * describe only the visible lines. */
static int collect(const char *root) {
        FD_FANOTIFY, /* Get the actual fs events */
        FD_INOTIFY, /* We get notifications to quit early via this fd */
        struct pollfd pollfd[_FD_MAX] = {};
        int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
        Hashmap *files = NULL;
        char *pack_fn_new = NULL, *pack_fn = NULL;
        bool on_ssd, on_btrfs;
        uint64_t previous_block_readahead;
        bool previous_block_readahead_set = false;

        /* Two asprintf() variants for the pack path — presumably the two
         * arms of an elided preprocessor conditional on arg_savedir;
         * TODO confirm against the full source */
        if (asprintf(&pack_fn, "%s/.readahead", arg_savedir ? arg_savedir : root) < 0) {
        if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {

        starttime = now(CLOCK_MONOTONIC);

        /* If there's no pack file yet we lower the kernel readahead
         * so that mincore() is accurate. If there is a pack file
         * already we assume it is accurate enough so that kernel
         * readahead is never triggered. */
        previous_block_readahead_set =
                access(pack_fn, F_OK) < 0 &&
                block_get_readahead(root, &previous_block_readahead) >= 0 &&
                block_set_readahead(root, 8*1024) >= 0;

        /* Run with idle I/O priority so collection doesn't slow down boot */
        if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
                log_warning("Failed to set IDLE IO priority class: %m");

        /* Block SIGINT/SIGTERM and take them via signalfd so they can be
         * multiplexed in the poll() loop below */
        assert_se(sigemptyset(&mask) == 0);
        sigset_add_many(&mask, SIGINT, SIGTERM, -1);
        assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);

        if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");

        files = hashmap_new(string_hash_func, string_compare_func);
                log_error("Failed to allocate set.");

        /* Watch every open() on the entire mount containing root */
        fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
        if (fanotify_fd < 0) {
                log_error("Failed to create fanotify object: %m");

        if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
                log_error("Failed to mark %s: %m", root);

        inotify_fd = open_inotify();
        if (inotify_fd < 0) {

        /* Hard deadline for the whole collection phase */
        not_after = now(CLOCK_MONOTONIC) + arg_timeout;

        pollfd[FD_FANOTIFY].fd = fanotify_fd;
        pollfd[FD_FANOTIFY].events = POLLIN;
        pollfd[FD_SIGNAL].fd = signal_fd;
        pollfd[FD_SIGNAL].events = POLLIN;
        pollfd[FD_INOTIFY].fd = inotify_fd;
        pollfd[FD_INOTIFY].events = POLLIN;

        /* Trailing argument of an elided sd_notify() status update */
        "STATUS=Collecting readahead data");

        log_debug("Collecting...");

        /* Out-of-band stop requests are plain marker files in /run */
        if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
                log_debug("Collection canceled");

        if (access("/run/systemd/readahead/done", F_OK) >= 0) {
                log_debug("Got termination request");

        struct fanotify_event_metadata metadata;
        struct fanotify_event_metadata *m;

        /* Stop once enough distinct files have been recorded */
        if (hashmap_size(files) > arg_files_max) {
                log_debug("Reached maximum number of read ahead files, ending collection.");

        t = now(CLOCK_MONOTONIC);
        if (t >= not_after) {
                log_debug("Reached maximum collection time, ending collection.");

        /* Wait for events, but never past the deadline */
        if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
                log_error("poll(): %m");

                log_debug("Reached maximum collection time, ending collection.");

        if (pollfd[FD_SIGNAL].revents) {
                log_debug("Got signal.");

        if (pollfd[FD_INOTIFY].revents) {
                uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
                struct inotify_event *e;

                if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
                        if (errno == EINTR || errno == EAGAIN)

                        log_error("Failed to read inotify event: %m");

                e = (struct inotify_event*) inotify_buffer;

                /* The same cancel/done markers, delivered as IN_CREATE events */
                if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
                        log_debug("Collection canceled");

                if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
                        log_debug("Got termination request");

                /* Advance to the next variable-length event in the buffer */
                step = sizeof(struct inotify_event) + e->len;
                assert(step <= (size_t) n);

                e = (struct inotify_event*) ((uint8_t*) e + step);

        n = read(fanotify_fd, &data, sizeof(data));
                if (errno == EINTR || errno == EAGAIN)

                /* fanotify sometimes returns EACCES on read()
                 * where it shouldn't. For now let's just
                 * ignore it here (which is safe), but
                 * eventually this should be
                 * dropped when the kernel is fixed.
                 *
                 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */

                log_error("Failed to read event: %m");

        /* Iterate over all fanotify events packed into this read() */
        for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
                char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];

                /* Ignore accesses made by this process itself... */
                if (m->pid == my_pid)

                /* ...and by the readahead replay process (barrier pairs
                 * with the writer of shared->replay) */
                __sync_synchronize();
                if (m->pid == shared->replay)

                /* Resolve the event's fd back to a path name via /proc */
                snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
                k = readlink_malloc(fn, &p);
                        /* Skip temp files, deleted files, and duplicates */
                        if (startswith(p, "/tmp") ||
                            endswith(p, " (deleted)") ||
                            hashmap_get(files, p))
                                /* Not interesting, or

                        entry = new0(struct item, 1);

                        ul = fd_first_block(m->fd);

                        entrytime = now(CLOCK_MONOTONIC);

                        entry->path = strdup(p);

                        /* Bin accesses into 2-second buckets since start */
                        entry->bin = (entrytime - starttime) / 2000000;

                        k = hashmap_put(files, p, entry);
                                log_warning("hashmap_put() failed: %s", strerror(-k));

                        log_warning("readlink(%s) failed: %s", fn, strerror(-k));

        /* Collection finished: stop listening before writing the pack */
        fanotify_fd = safe_close(fanotify_fd);

        log_debug("Writing Pack File...");

        on_ssd = fs_on_ssd(root) > 0;
        log_debug("On SSD: %s", yes_no(on_ssd));

        on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
        log_debug("On btrfs: %s", yes_no(on_btrfs));

        /* Same duplicated-asprintf pattern as for pack_fn above —
         * presumably elided preprocessor arms; TODO confirm */
        if (asprintf(&pack_fn_new, "%s/.readahead.new", arg_savedir ? arg_savedir : root) < 0) {
        if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {

        /* Write into a ".new" temporary, renamed into place at the end */
        pack = fopen(pack_fn_new, "we");
                log_error("Failed to open pack file: %m");

        /* Header: host/version magic plus an 'S'(SSD)/'R'(rotating) flag */
        fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
        putc(on_ssd ? 'S' : 'R', pack);

        if (on_ssd || on_btrfs) {

                /* On SSD or on btrfs, just write things out in the
                 * order the files were accessed. */

                HASHMAP_FOREACH_KEY(q, p, files, i)
                        pack_file(pack, p, on_btrfs);

                /* On rotating media, order things by the block
                 * (remainder of this comment elided in the excerpt) */

                log_debug("Ordering...");

                n = hashmap_size(files);
                        _cleanup_free_ struct item *ordered;

                        ordered = new(struct item, n);

                        /* Flatten the hashmap into a contiguous array for qsort() */
                        HASHMAP_FOREACH_KEY(q, p, files, i) {
                                memcpy(j, q, sizeof(struct item));

                        assert(ordered + n == j);

                        qsort(ordered, n, sizeof(struct item), qsort_compare);

                        for (k = 0; k < n; k++)
                                pack_file(pack, ordered[k].path, on_btrfs);

                        log_warning("No pack files");

        log_debug("Finalizing...");

                log_error("Failed to write pack file.");

        /* Atomically replace any previous pack file */
        if (rename(pack_fn_new, pack_fn) < 0) {
                log_error("Failed to rename readahead file: %m");

        /* Common cleanup path */
        safe_close(fanotify_fd);
        safe_close(signal_fd);
        safe_close(inotify_fd);

        /* Drain and free remaining hashmap keys */
        while ((p = hashmap_steal_first_key(files)))

        if (previous_block_readahead_set) {

                /* Restore the original kernel readahead setting if we
                 * changed it, and nobody has overwritten it since */
                if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
                        block_set_readahead(root, previous_block_readahead);
627 int main_collect(const char *root) {
632 /* Skip this step on read-only media. Note that we check the
 * underlying block device here, not the read-only flag of the
634 * file system on top, since that one is most likely mounted
635 * read-only anyway at boot, even if the underlying block
636 * device is theoretically writable. */
637 if (fs_on_read_only(root) > 0) {
638 log_info("Disabling readahead collector due to read-only media.");
643 log_info("Disabling readahead collector due to low memory.");
647 shared = shared_get();
651 shared->collect = getpid();
652 __sync_synchronize();
654 if (collect(root) < 0)