1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
29 #include <sys/xattr.h>
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
/* Default sizes, in bytes, of the data and field hash tables: room for
 * 2047 and 333 HashItem slots respectively. */
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Data payloads must be at least this many bytes long before
 * compression is attempted when appending a data object. */
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
/* NOTE(review): JOURNAL_FILE_SIZE_MIN is defined twice back to back
 * with different values; presumably a preprocessor conditional selects
 * one of them -- confirm against the full file. */
45 #define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
50 /* These are the lower and upper bounds if we deduce the max_use value
51 * from the file system size */
52 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
53 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
55 /* This is the upper bound if we deduce max_size from max_use */
56 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
58 /* This is the upper bound if we deduce the keep_free value from the file system size */
60 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
62 /* This is the keep_free value when we can't determine the file system size */
64 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
66 /* n_data was the first entry we added after the initial file format design */
/* Smallest header size accepted when verifying a file: everything up
 * to (but not including) n_data, rounded up by ALIGN64. */
67 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
69 /* How many entries to keep in the entry array chain cache at max */
70 #define CHAIN_CACHE_MAX 20
72 /* How much to increase the journal file size at once each time we allocate something new. */
/* NOTE(review): FILE_SIZE_INCREASE is defined twice back to back;
 * presumably a preprocessor conditional selects one of them -- confirm
 * against the full file. */
74 #define FILE_SIZE_INCREASE (JOURNAL_FILE_SIZE_MIN)
76 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Marks the journal file as being written to by storing STATE_ONLINE
 * in the header. The file must have a valid fd and a header; what is
 * done depends on the state currently recorded in the header. */
79 static int journal_file_set_online(JournalFile *f) {
85 if (!(f->fd >= 0 && f->header))
88 switch(f->header->state) {
93 f->header->state = STATE_ONLINE;
/* Counterpart of journal_file_set_online(): checks that the file has a
 * valid fd and a header and that the header state is STATE_ONLINE, then
 * stores STATE_OFFLINE in the header. */
102 int journal_file_set_offline(JournalFile *f) {
108 if (!(f->fd >= 0 && f->header))
111 if (f->header->state != STATE_ONLINE)
116 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile. In order: appends a final tag if the file
 * is sealed and writable, drops the fd from the mmap cache, marks the
 * file offline, unmaps the header, releases the mmap cache reference
 * and the chain cache, frees the compression buffer (only built with
 * XZ or LZ4 support), releases the sealing state (either the mapped
 * fss_file or the allocated fsprg_state) and closes the HMAC handle. */
123 void journal_file_close(JournalFile *f) {
127 /* Write the final tag */
128 if (f->seal && f->writable)
129 journal_file_append_tag(f);
132 /* Sync everything to disk, before we mark the file offline */
133 if (f->mmap && f->fd >= 0)
134 mmap_cache_close_fd(f->mmap, f->fd);
136 journal_file_set_offline(f);
139 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
145 mmap_cache_unref(f->mmap);
147 hashmap_free_free(f->chain_cache);
149 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
150 free(f->compress_buffer);
155 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
156 else if (f->fsprg_state)
157 free(f->fsprg_state);
162 gcry_md_close(f->hmac);
/* Builds a fresh Header in a local variable and writes it to offset 0
 * of the file with pwrite(). The header gets the signature, its own
 * 64-bit-aligned size, incompatible flags derived from the compress_xz
 * and compress_lz4 settings, the SEALED compatible flag derived from
 * the seal setting, and a random file_id. The seqnum_id (and
 * tail_entry_seqnum) are either copied from the template file's header
 * or the seqnum_id is set to the new file_id. */
168 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
175 memcpy(h.signature, HEADER_SIGNATURE, 8);
176 h.header_size = htole64(ALIGN64(sizeof(h)));
178 h.incompatible_flags |= htole32(
179 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
180 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
182 h.compatible_flags = htole32(
183 f->seal * HEADER_COMPATIBLE_SEALED);
185 r = sd_id128_randomize(&h.file_id);
190 h.seqnum_id = template->header->seqnum_id;
191 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
193 h.seqnum_id = h.file_id;
195 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Brings the header up to date for the current machine and boot: the
 * machine ID is written straight into the header, and the current boot
 * ID is fetched and stored. If the boot ID already in the header equals
 * the current one, the tail entry's monotonic timestamp is flagged as
 * valid. Afterwards the file is switched online. */
205 static int journal_file_refresh_header(JournalFile *f) {
211 r = sd_id128_get_machine(&f->header->machine_id);
215 r = sd_id128_get_boot(&boot_id);
219 if (sd_id128_equal(boot_id, f->header->boot_id))
220 f->tail_entry_monotonic_valid = true;
222 f->header->boot_id = boot_id;
224 journal_file_set_online(f);
226 /* Sync the online state to disk */
/* Sanity-checks the header of an existing journal file:
 *  - the signature is compared against HEADER_SIGNATURE;
 *  - incompatible flags outside HEADER_INCOMPATIBLE_SUPPORTED are
 *    refused with -EPROTONOSUPPORT, after logging whether they are
 *    unknown or merely disabled at compilation time;
 *  - for writable files the same is done for compatible flags outside
 *    HEADER_COMPATIBLE_SUPPORTED;
 *  - the state is checked against _STATE_MAX and header_size against
 *    HEADER_SIZE_MIN;
 *  - a sealed header is required to contain n_entry_arrays;
 *  - header_size + arena_size is checked against the file size in
 *    last_stat, and tail_object_offset against header_size + arena_size;
 *  - the hash table, tail object and entry array offsets are checked
 *    with VALID64;
 *  - the header's machine ID is compared with the local machine ID and
 *    the state is examined (online, archived, or anything other than
 *    offline).
 * Finally the compress_xz, compress_lz4 and seal settings of f are
 * taken over from the header flags. */
232 static int journal_file_verify_header(JournalFile *f) {
237 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
240 /* In both read and write mode we refuse to open files with
241 * incompatible flags we don't know */
242 flags = le32toh(f->header->incompatible_flags);
243 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
244 if (flags & ~HEADER_INCOMPATIBLE_ANY)
245 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
246 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
247 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
249 log_debug("Journal file %s uses incompatible flags %"PRIx32
250 " disabled at compilation time.", f->path, flags);
251 return -EPROTONOSUPPORT;
254 /* When open for writing we refuse to open files with
255 * compatible flags, too */
256 flags = le32toh(f->header->compatible_flags);
257 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
258 if (flags & ~HEADER_COMPATIBLE_ANY)
259 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
260 f->path, flags & ~HEADER_COMPATIBLE_ANY);
261 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
263 log_debug("Journal file %s uses compatible flags %"PRIx32
264 " disabled at compilation time.", f->path, flags);
265 return -EPROTONOSUPPORT;
268 if (f->header->state >= _STATE_MAX)
271 /* The first addition was n_data, so check that we are at least this large */
272 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
275 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
278 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
281 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
284 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
285 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
286 !VALID64(le64toh(f->header->tail_object_offset)) ||
287 !VALID64(le64toh(f->header->entry_array_offset)))
292 sd_id128_t machine_id;
295 r = sd_id128_get_machine(&machine_id);
299 if (!sd_id128_equal(machine_id, f->header->machine_id))
302 state = f->header->state;
304 if (state == STATE_ONLINE) {
305 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
307 } else if (state == STATE_ARCHIVED)
309 else if (state != STATE_OFFLINE) {
310 log_debug("Journal file %s has unknown state %u.", f->path, state);
315 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
316 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
318 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Makes sure the file is large enough to hold size bytes at offset.
 * The current size is taken to be header_size + arena_size; the
 * required size is offset + size rounded up to a page, but never less
 * than header_size. Growth is checked against metrics.max_size (when
 * non-zero) and, for sizes above metrics.min_size with a non-zero
 * metrics.keep_free, against the free space reported by fstatvfs()
 * after subtracting keep_free. The new size is then rounded up to a
 * multiple of FILE_SIZE_INCREASE (capped at max_size), the extra range
 * is allocated with posix_fallocate(), and last_stat and the header's
 * arena_size are refreshed.
 * NOTE(review): offset + size is not guarded against 64-bit wraparound
 * in the lines shown -- confirm that callers bound both values. */
323 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
324 uint64_t old_size, new_size;
329 /* We assume that this file is not sparse, and we know that
330 * for sure, since we always call posix_fallocate()
334 le64toh(f->header->header_size) +
335 le64toh(f->header->arena_size);
337 new_size = PAGE_ALIGN(offset + size);
338 if (new_size < le64toh(f->header->header_size))
339 new_size = le64toh(f->header->header_size);
341 if (new_size <= old_size)
344 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
347 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
350 if (fstatvfs(f->fd, &svfs) >= 0) {
353 available = svfs.f_bfree * svfs.f_bsize;
355 if (available >= f->metrics.keep_free)
356 available -= f->metrics.keep_free;
360 if (new_size - old_size > available)
365 /* Increase by larger blocks at once */
366 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
367 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
368 new_size = f->metrics.max_size;
370 /* Note that the glibc fallocate() fallback is very
371 inefficient, hence we try to minimize the allocation area
373 r = posix_fallocate(f->fd, old_size, new_size - old_size);
377 if (fstat(f->fd, &f->last_stat) < 0)
380 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps size bytes at offset of the file through the mmap cache and
 * returns the mapping in ret. Before mapping, the range is checked
 * against the cached file size in last_stat; if it appears to be out of
 * range the stat data is refreshed once with fstat() and the check is
 * repeated, returning -EADDRNOTAVAIL if fstat() fails or the range is
 * still beyond the end of the file. */
385 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
392 /* Avoid SIGBUS on invalid accesses */
393 if (offset + size > (uint64_t) f->last_stat.st_size) {
394 /* Hmm, out of range? Let's refresh the fstat() data
395 * first, before we trust that check. */
397 if (fstat(f->fd, &f->last_stat) < 0 ||
398 offset + size > (uint64_t) f->last_stat.st_size)
399 return -EADDRNOTAVAIL;
402 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest size an object of o's type may have, looked up
 * in a per-type table of structure sizes. Types beyond the end of the
 * table, or without a table entry, fall back to sizeof(ObjectHeader). */
405 static uint64_t minimum_header_size(Object *o) {
407 static const uint64_t table[] = {
408 [OBJECT_DATA] = sizeof(DataObject),
409 [OBJECT_FIELD] = sizeof(FieldObject),
410 [OBJECT_ENTRY] = sizeof(EntryObject),
411 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
412 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
413 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
414 [OBJECT_TAG] = sizeof(TagObject),
417 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
418 return sizeof(ObjectHeader);
420 return table[o->object.type];
/* Maps the object located at offset and validates its header. The
 * offset must pass VALID64. First only the ObjectHeader is mapped; the
 * object size is then checked against sizeof(ObjectHeader) and the
 * per-type minimum, the object type against OBJECT_UNUSED, and -- when
 * a positive type is requested -- against that type. Objects larger
 * than the bare header are mapped a second time in full. */
423 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
432 /* Objects may only be located at multiple of 64 bit */
433 if (!VALID64(offset))
437 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
442 s = le64toh(o->object.size);
444 if (s < sizeof(ObjectHeader))
447 if (o->object.type <= OBJECT_UNUSED)
450 if (s < minimum_header_size(o))
453 if (type > 0 && o->object.type != type)
456 if (s > sizeof(ObjectHeader)) {
/* NOTE(review): the first mapping passes type_to_context(type) as the
 * context while this one passes the raw object type -- confirm that
 * the difference is intended. */
457 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry: one more than the
 * header's tail_entry_seqnum, reconciled with the optional external
 * counter as described below. The result is stored as the new
 * tail_entry_seqnum, and also as head_entry_seqnum if that is still
 * zero. */
468 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
473 r = le64toh(f->header->tail_entry_seqnum) + 1;
476 /* If an external seqnum counter was passed, we update
477 * both the local and the external one, and set it to
478 * the maximum of both */
486 f->header->tail_entry_seqnum = htole64(r);
488 if (f->header->head_entry_seqnum == 0)
489 f->header->head_entry_seqnum = htole64(r);
/* Appends a new, empty object of the given type and size at the end of
 * the file. The type must be a valid object type and the size at least
 * sizeof(ObjectHeader). The file is switched online first. The new
 * object is placed either at header_size or directly behind the
 * current tail object (its offset plus its 64-bit-aligned size); space
 * is allocated, the range is mapped, type and size are filled in, and
 * the header's tail_object_offset and n_objects are updated. */
494 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
501 assert(type > 0 && type < _OBJECT_TYPE_MAX);
502 assert(size >= sizeof(ObjectHeader));
506 r = journal_file_set_online(f);
510 p = le64toh(f->header->tail_object_offset);
512 p = le64toh(f->header->header_size);
514 r = journal_file_move_to_object(f, -1, p, &tail);
518 p += ALIGN64(le64toh(tail->object.size));
521 r = journal_file_allocate(f, p, size);
525 r = journal_file_move_to(f, type, false, p, size, &t);
532 o->object.type = type;
533 o->object.size = htole64(size);
535 f->header->tail_object_offset = htole64(p);
536 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object. Its size is derived from
 * metrics.max_size as explained below, but never less than
 * DEFAULT_DATA_HASH_TABLE_SIZE. The table items are zeroed, and the
 * header records the offset of the items array (not of the object
 * itself) together with the table size in bytes. */
544 static int journal_file_setup_data_hash_table(JournalFile *f) {
551 /* We estimate that we need 1 hash table entry per 768 of
552 journal file and we want to make sure we never get beyond
553 75% fill level. Calculate the hash table size for the
554 maximum file size based on these metrics. */
556 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
557 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
558 s = DEFAULT_DATA_HASH_TABLE_SIZE;
560 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
562 r = journal_file_append_object(f,
563 OBJECT_DATA_HASH_TABLE,
564 offsetof(Object, hash_table.items) + s,
569 memzero(o->hash_table.items, s);
571 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
572 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object with the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE, zeroes its items, and records the
 * offset of the items array and the table size in the header. */
577 static int journal_file_setup_field_hash_table(JournalFile *f) {
584 /* We use a fixed size hash table for the fields as this
585 * number should grow very slowly only */
587 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
588 r = journal_file_append_object(f,
589 OBJECT_FIELD_HASH_TABLE,
590 offsetof(Object, hash_table.items) + s,
595 memzero(o->hash_table.items, s);
597 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
598 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, using the offset and size recorded in the
 * header, and stores the resulting pointer in f->data_hash_table. */
603 static int journal_file_map_data_hash_table(JournalFile *f) {
610 p = le64toh(f->header->data_hash_table_offset);
611 s = le64toh(f->header->data_hash_table_size);
613 r = journal_file_move_to(f,
614 OBJECT_DATA_HASH_TABLE,
621 f->data_hash_table = t;
/* Maps the field hash table, using the offset and size recorded in the
 * header, and stores the resulting pointer in f->field_hash_table. */
625 static int journal_file_map_field_hash_table(JournalFile *f) {
632 p = le64toh(f->header->field_hash_table_offset);
633 s = le64toh(f->header->field_hash_table_size);
635 r = journal_file_move_to(f,
636 OBJECT_FIELD_HASH_TABLE,
643 f->field_hash_table = t;
/* Hooks a field object into the field hash table. The object's type is
 * checked to be OBJECT_FIELD and its next_hash_offset and
 * head_data_offset are reset. The bucket is hash modulo the number of
 * HashItems in the table. The object either becomes the bucket's head
 * or is chained behind the object at the bucket's current tail offset;
 * in both cases it becomes the new tail. n_fields is incremented if
 * the header contains that counter. */
647 static int journal_file_link_field(
660 if (o->object.type != OBJECT_FIELD)
663 /* This might alter the window we are looking at */
665 o->field.next_hash_offset = o->field.head_data_offset = 0;
667 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
668 p = le64toh(f->field_hash_table[h].tail_hash_offset);
670 f->field_hash_table[h].head_hash_offset = htole64(offset);
672 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
676 o->field.next_hash_offset = htole64(offset);
679 f->field_hash_table[h].tail_hash_offset = htole64(offset);
681 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
682 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Hooks a data object into the data hash table. The object's type is
 * checked to be OBJECT_DATA and its chaining and entry bookkeeping
 * fields (next_hash_offset, next_field_offset, entry_offset,
 * entry_array_offset, n_entries) are reset. The bucket is hash modulo
 * the number of HashItems in the table. The object either becomes the
 * bucket's head or is chained behind the object at the bucket's
 * current tail offset; in both cases it becomes the new tail. n_data
 * is incremented if the header contains that counter. */
687 static int journal_file_link_data(
700 if (o->object.type != OBJECT_DATA)
703 /* This might alter the window we are looking at */
705 o->data.next_hash_offset = o->data.next_field_offset = 0;
706 o->data.entry_offset = o->data.entry_array_offset = 0;
707 o->data.n_entries = 0;
709 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
710 p = le64toh(f->data_hash_table[h].tail_hash_offset);
712 /* Only entry in the hash table is easy */
713 f->data_hash_table[h].head_hash_offset = htole64(offset);
715 /* Move back to the previous data object, to patch in
718 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
722 o->data.next_hash_offset = htole64(offset);
725 f->data_hash_table[h].tail_hash_offset = htole64(offset);
727 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
728 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up the field object whose payload equals the size bytes at
 * field, given the precomputed hash. field must be non-NULL and size
 * positive. A field hash table size of zero is special-cased before
 * the bucket (hash modulo the number of HashItems) is walked from its
 * head along next_hash_offset. An object matches when its stored hash,
 * its total object size and its payload bytes all agree. */
733 int journal_file_find_field_object_with_hash(
735 const void *field, uint64_t size, uint64_t hash,
736 Object **ret, uint64_t *offset) {
738 uint64_t p, osize, h;
742 assert(field && size > 0);
744 osize = offsetof(Object, field.payload) + size;
746 if (f->header->field_hash_table_size == 0)
749 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
750 p = le64toh(f->field_hash_table[h].head_hash_offset);
755 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
759 if (le64toh(o->field.hash) == hash &&
760 le64toh(o->object.size) == osize &&
761 memcmp(o->field.payload, field, size) == 0) {
771 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the field name with hash64() and
 * forwards to journal_file_find_field_object_with_hash(). */
777 int journal_file_find_field_object(
779 const void *field, uint64_t size,
780 Object **ret, uint64_t *offset) {
785 assert(field && size > 0);
787 hash = hash64(field, size);
789 return journal_file_find_field_object_with_hash(f,
/* Looks up the data object whose payload equals the size bytes at data,
 * given the precomputed hash. data may be NULL only if size is zero. A
 * data hash table size of zero is special-cased before the bucket (hash
 * modulo the number of HashItems) is walked from its head along
 * next_hash_offset. Each object's stored hash is compared with the
 * requested one first. Compressed objects are decompressed into
 * f->compress_buffer and that buffer is compared with data;
 * uncompressed objects are compared by total object size and payload
 * bytes. -EPROTONOSUPPORT is returned on the compressed path,
 * presumably when no compression support is compiled in. */
794 int journal_file_find_data_object_with_hash(
796 const void *data, uint64_t size, uint64_t hash,
797 Object **ret, uint64_t *offset) {
799 uint64_t p, osize, h;
803 assert(data || size == 0);
805 osize = offsetof(Object, data.payload) + size;
807 if (f->header->data_hash_table_size == 0)
810 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
811 p = le64toh(f->data_hash_table[h].head_hash_offset);
816 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
820 if (le64toh(o->data.hash) != hash)
823 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
824 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
828 l = le64toh(o->object.size);
829 if (l <= offsetof(Object, data.payload))
832 l -= offsetof(Object, data.payload);
834 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
835 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
840 memcmp(f->compress_buffer, data, size) == 0) {
851 return -EPROTONOSUPPORT;
853 } else if (le64toh(o->object.size) == osize &&
854 memcmp(o->data.payload, data, size) == 0) {
866 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the data with hash64() and forwards to
 * journal_file_find_data_object_with_hash(). */
872 int journal_file_find_data_object(
874 const void *data, uint64_t size,
875 Object **ret, uint64_t *offset) {
880 assert(data || size == 0);
882 hash = hash64(data, size);
884 return journal_file_find_data_object_with_hash(f,
/* Appends a field object for the given field name. The field hash
 * table is searched first using the name's hash64() value. A new
 * OBJECT_FIELD object is created with the hash and a copy of the name
 * as payload, linked into the field hash table, re-mapped (linking may
 * have moved the mapping window) and fed to the HMAC. */
889 static int journal_file_append_field(
891 const void *field, uint64_t size,
892 Object **ret, uint64_t *offset) {
900 assert(field && size > 0);
902 hash = hash64(field, size);
904 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
918 osize = offsetof(Object, field.payload) + size;
919 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
923 o->field.hash = htole64(hash);
924 memcpy(o->field.payload, field, size);
926 r = journal_file_link_field(f, o, p, hash);
930 /* The linking might have altered the window, so let's
931 * refresh our pointer */
932 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
937 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Appends a data object holding the size bytes at data. The data hash
 * table is searched first using the hash64() value of the payload. A
 * new OBJECT_DATA object is created and its hash stored. When built
 * with XZ or LZ4 support, payloads of at least
 * COMPRESSION_SIZE_THRESHOLD bytes are passed to compress_blob(); in
 * the compressed case the object size is shrunk to the compressed
 * length and the compression flag is set. Otherwise the payload is
 * copied verbatim. The object is linked into the data hash table and
 * re-mapped. If the payload contains a '=' that is not its first byte,
 * the part before it is appended as a field object and this data
 * object is pushed onto the head of that field's data list. Finally
 * the object is fed to the HMAC. */
951 static int journal_file_append_data(
953 const void *data, uint64_t size,
954 Object **ret, uint64_t *offset) {
959 int r, compression = 0;
963 assert(data || size == 0);
965 hash = hash64(data, size);
967 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
981 osize = offsetof(Object, data.payload) + size;
982 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
986 o->data.hash = htole64(hash);
988 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* NOTE(review): only compress_xz gates compression here although this
 * section is also built for LZ4 -- confirm whether compress_lz4 should
 * enable it as well. */
989 if (f->compress_xz &&
990 size >= COMPRESSION_SIZE_THRESHOLD) {
993 compression = compress_blob(data, size, o->data.payload, &rsize);
996 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
997 o->object.flags |= compression;
999 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1000 size, rsize, object_compressed_to_string(compression));
1005 if (!compression && size > 0)
1006 memcpy(o->data.payload, data, size);
1008 r = journal_file_link_data(f, o, p, hash);
1012 /* The linking might have altered the window, so let's
1013 * refresh our pointer */
1014 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1021 eq = memchr(data, '=', size);
1022 if (eq && eq > data) {
1026 /* Create field object ... */
1027 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1031 /* ... and link it in. */
1032 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): p is a host-order offset being stored into an on-disk
 * field, so htole64() looks like the intended conversion; le64toh()
 * performs the same byte operation, so behaviour is unaffected. */
1033 fo->field.head_data_offset = le64toh(p);
1037 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItems stored in an entry object, derived from the
 * object size. Objects that are not OBJECT_ENTRY are special-cased. */
1051 uint64_t journal_file_entry_n_items(Object *o) {
1054 if (o->object.type != OBJECT_ENTRY)
1057 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit offsets stored in an entry array object, derived
 * from the object size. Objects that are not OBJECT_ENTRY_ARRAY are
 * special-cased. */
1060 uint64_t journal_file_entry_array_n_items(Object *o) {
1063 if (o->object.type != OBJECT_ENTRY_ARRAY)
1066 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItems stored in a data or field hash table object,
 * derived from the object size. Objects of any other type are
 * special-cased. */
1069 uint64_t journal_file_hash_table_n_items(Object *o) {
1072 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1073 o->object.type != OBJECT_FIELD_HASH_TABLE)
1076 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset p at index *idx of the entry array chain that
 * starts at *first. The chain is walked via next_entry_array_offset;
 * if an existing array has room, the offset is written there and *idx
 * is incremented. Otherwise a new OBJECT_ENTRY_ARRAY is appended, fed
 * to the HMAC, given the offset, and linked in either as the chain's
 * first array (*first) or behind the previous array. n_entry_arrays is
 * incremented if the header contains that counter, and *idx is
 * incremented. *first and *idx are kept in little-endian form. */
1079 static int link_entry_into_array(JournalFile *f,
1084 uint64_t n = 0, ap = 0, q, i, a, hidx;
1092 a = le64toh(*first);
1093 i = hidx = le64toh(*idx);
1096 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1100 n = journal_file_entry_array_n_items(o);
1102 o->entry_array.items[i] = htole64(p);
1103 *idx = htole64(hidx + 1);
1109 a = le64toh(o->entry_array.next_entry_array_offset);
1120 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1121 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1127 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1132 o->entry_array.items[i] = htole64(p);
1135 *first = htole64(q);
1137 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1141 o->entry_array.next_entry_array_offset = htole64(q);
1144 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1145 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1147 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry
 * offset inline (*extra) in addition to the array chain. The offset is
 * either stored in *extra or linked into the chain at index *idx - 1;
 * *idx is incremented afterwards. */
1152 static int link_entry_into_array_plus_one(JournalFile *f,
1167 *extra = htole64(p);
1171 i = htole64(le64toh(*idx) - 1);
1172 r = link_entry_into_array(f, first, &i, p);
1177 *idx = htole64(le64toh(*idx) + 1);
/* Registers an entry with the data object referenced by item i of
 * entry o: the data object is mapped and the entry is added to that
 * data object's inline entry slot / entry array chain. */
1181 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1188 p = le64toh(o->entry.items[i].object_offset);
1192 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1196 return link_entry_into_array_plus_one(f,
1197 &o->data.entry_offset,
1198 &o->data.entry_array_offset,
/* Links a freshly written entry object into the file. The object type
 * is checked to be OBJECT_ENTRY and a full memory barrier is issued.
 * The entry is added to the global entry array chain tracked in the
 * header (entry_array_offset / n_entries); head_entry_realtime is set
 * if it was still zero, and the tail realtime and monotonic timestamps
 * are updated and the monotonic one marked valid. Then every item of
 * the entry is linked to its data object. */
1203 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1211 if (o->object.type != OBJECT_ENTRY)
1214 __sync_synchronize();
1216 /* Link up the entry itself */
1217 r = link_entry_into_array(f,
1218 &f->header->entry_array_offset,
1219 &f->header->n_entries,
1224 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1226 if (f->header->head_entry_realtime == 0)
1227 f->header->head_entry_realtime = o->entry.realtime;
1229 f->header->tail_entry_realtime = o->entry.realtime;
1230 f->header->tail_entry_monotonic = o->entry.monotonic;
1232 f->tail_entry_monotonic_valid = true;
1234 /* Link up the items */
1235 n = journal_file_entry_n_items(o);
1236 for (i = 0; i < n; i++) {
1237 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes an entry object for already-appended data items. The object
 * is sized for n_items EntryItems, receives the next sequence number,
 * a copy of the items, the realtime and monotonic timestamps from ts,
 * the xor_hash and the header's current boot ID, is fed to the HMAC,
 * and is then linked into the file with journal_file_link_entry(). */
1245 static int journal_file_append_entry_internal(
1247 const dual_timestamp *ts,
1249 const EntryItem items[], unsigned n_items,
1251 Object **ret, uint64_t *offset) {
1258 assert(items || n_items == 0);
1261 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1263 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1267 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1268 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1269 o->entry.realtime = htole64(ts->realtime);
1270 o->entry.monotonic = htole64(ts->monotonic);
1271 o->entry.xor_hash = htole64(xor_hash);
1272 o->entry.boot_id = f->header->boot_id;
1275 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1280 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the file changed: after a memory
 * barrier the file is truncated to the size recorded in last_stat,
 * i.e. to its own size. A failing ftruncate() is only logged. */
1293 void journal_file_post_change(JournalFile *f) {
1296 /* inotify() does not receive IN_MODIFY events from file
1297 * accesses done via mmap(). After each access we hence
1298 * trigger IN_MODIFY by truncating the journal file to its
1299 * current size which triggers IN_MODIFY. */
1301 __sync_synchronize();
1303 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1304 log_error("Failed to truncate file to its own size: %m");
/* Sort comparator for EntryItems: orders them by the host-order value
 * of their object_offset. */
1307 static int entry_item_cmp(const void *_a, const void *_b) {
1308 const EntryItem *a = _a, *b = _b;
1310 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1312 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends a complete entry made up of the n_iovec fields in iovec. A
 * local timestamp can be taken with dual_timestamp_get(), presumably
 * when the caller passes no ts. The monotonic timestamp is compared
 * against the tail entry's when that one is valid, and a tag is
 * appended first if one is due. Every iovec is appended as a data
 * object; its offset and hash are collected into an EntryItem array on
 * the stack and its hash is folded into xor_hash. The items are sorted
 * by offset, the entry object is written, and watchers are notified
 * through journal_file_post_change(). */
1317 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1321 uint64_t xor_hash = 0;
1322 struct dual_timestamp _ts;
1325 assert(iovec || n_iovec == 0);
1328 dual_timestamp_get(&_ts);
1332 if (f->tail_entry_monotonic_valid &&
1333 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1337 r = journal_file_maybe_append_tag(f, ts->realtime);
1342 /* alloca() can't take 0, hence let's allocate at least one */
1343 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1345 for (i = 0; i < n_iovec; i++) {
1349 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1353 xor_hash ^= le64toh(o->data.hash);
1354 items[i].object_offset = htole64(p);
1355 items[i].hash = o->data.hash;
1358 /* Order by the position on disk, in order to improve seek
1359 * times for rotating media. */
1360 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1362 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1364 journal_file_post_change(f);
/* One cached position inside an entry array chain, kept in
 * f->chain_cache and keyed by the offset of the chain's first array.
 * It lets lookups and bisections resume at a previously visited array
 * instead of walking the chain from its start. */
1369 typedef struct ChainCacheItem {
1370 uint64_t first; /* the array at the beginning of the chain */
1371 uint64_t array; /* the cached array */
1372 uint64_t begin; /* the first item in the cached array */
1373 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1374 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Creates or refreshes the chain cache item for a chain. When no item
 * exists yet, one is obtained either by stealing an existing item from
 * the hashmap (once it holds CHAIN_CACHE_MAX items) or by allocating a
 * new one, and is inserted under its first field as key. For an
 * existing item the key is asserted to match. The item's fields,
 * including last_index, are then updated. */
1377 static void chain_cache_put(
1384 uint64_t last_index) {
1387 /* If the chain item to cache for this chain is the
1388 * first one it's not worth caching anything */
1392 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1393 ci = hashmap_steal_first(h);
1395 ci = new(ChainCacheItem, 1);
1402 if (hashmap_put(h, &ci->first, ci) < 0) {
1407 assert(ci->first == first);
1412 ci->last_index = last_index;
/* Fetches the entry at index i of the entry array chain starting at
 * first. A chain cache hit is used when i lies beyond the number of
 * items preceding the cached array. The chain is then walked array by
 * array via next_entry_array_offset until the array containing the
 * index is reached; the position is cached for the next call and the
 * entry object at the stored offset is mapped. */
1415 static int generic_array_get(
1419 Object **ret, uint64_t *offset) {
1422 uint64_t p = 0, a, t = 0;
1430 /* Try the chain cache first */
1431 ci = hashmap_get(f->chain_cache, &first);
1432 if (ci && i > ci->total) {
1441 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1445 k = journal_file_entry_array_n_items(o);
1447 p = le64toh(o->entry_array.items[i]);
1453 a = le64toh(o->entry_array.next_entry_array_offset);
1459 /* Let's cache this item for the next invocation */
1460 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1462 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists with one inline entry
 * (extra) ahead of the array chain: the inline entry is mapped
 * directly, while other indices are looked up in the chain at i-1. */
1475 static int generic_array_get_plus_one(
1480 Object **ret, uint64_t *offset) {
1489 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1502 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects the entry array chain starting at first, which holds n items
 * in total, using test_object() to compare items against needle. A
 * TEST_FOUND result is treated as TEST_RIGHT for DIRECTION_DOWN and as
 * TEST_LEFT for DIRECTION_UP. The chain cache is consulted first so
 * that a previously visited array can be jumped to directly. Within an
 * array, the neighbours of the last index looked at are probed before
 * a regular bisection between left and right. When an array does not
 * settle the search, the walk continues with the next array in the
 * chain. The final position is cached, the selected entry object is
 * mapped, and the index is reported as t + i, minus one if the search
 * had to step back (subtract_one). */
1511 static int generic_array_bisect(
1516 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1517 direction_t direction,
1522 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1523 bool subtract_one = false;
1524 Object *o, *array = NULL;
1529 assert(test_object);
1531 /* Start with the first array in the chain */
1534 ci = hashmap_get(f->chain_cache, &first);
1535 if (ci && n > ci->total) {
1536 /* Ah, we have iterated this bisection array chain
1537 * previously! Let's see if we can skip ahead in the
1538 * chain, as far as the last time. But we can't jump
1539 * backwards in the chain, so let's check that
1542 r = test_object(f, ci->begin, needle);
1546 if (r == TEST_LEFT) {
1547 /* OK, what we are looking for is right of the
1548 * begin of this EntryArray, so let's jump
1549 * straight to previously cached array in the
1555 last_index = ci->last_index;
1560 uint64_t left, right, k, lp;
1562 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1566 k = journal_file_entry_array_n_items(array);
1572 lp = p = le64toh(array->entry_array.items[i]);
1576 r = test_object(f, p, needle);
1580 if (r == TEST_FOUND)
1581 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1583 if (r == TEST_RIGHT) {
1587 if (last_index != (uint64_t) -1) {
1588 assert(last_index <= right);
1590 /* If we cached the last index we
1591 * looked at, let's try to not to jump
1592 * too wildly around and see if we can
1593 * limit the range to look at early to
1594 * the immediate neighbors of the last
1595 * index we looked at. */
1597 if (last_index > 0) {
1598 uint64_t x = last_index - 1;
1600 p = le64toh(array->entry_array.items[x]);
1604 r = test_object(f, p, needle);
1608 if (r == TEST_FOUND)
1609 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1611 if (r == TEST_RIGHT)
1617 if (last_index < right) {
1618 uint64_t y = last_index + 1;
1620 p = le64toh(array->entry_array.items[y]);
1624 r = test_object(f, p, needle);
1628 if (r == TEST_FOUND)
1629 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1631 if (r == TEST_RIGHT)
1639 if (left == right) {
1640 if (direction == DIRECTION_UP)
1641 subtract_one = true;
1647 assert(left < right);
1648 i = (left + right) / 2;
1650 p = le64toh(array->entry_array.items[i]);
1654 r = test_object(f, p, needle);
1658 if (r == TEST_FOUND)
1659 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1661 if (r == TEST_RIGHT)
1669 if (direction == DIRECTION_UP) {
1671 subtract_one = true;
1682 last_index = (uint64_t) -1;
1683 a = le64toh(array->entry_array.next_entry_array_offset);
1689 if (subtract_one && t == 0 && i == 0)
1692 /* Let's cache this item for the next invocation */
1693 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1695 if (subtract_one && i == 0)
1697 else if (subtract_one)
1698 p = le64toh(array->entry_array.items[i-1]);
1700 p = le64toh(array->entry_array.items[i]);
1702 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1713 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists with one inline entry
 * (extra) ahead of the array chain. The inline entry is tested first,
 * with TEST_FOUND mapped to TEST_RIGHT or TEST_LEFT by direction as in
 * the plain bisection. For DIRECTION_UP, step_back records that the
 * inline entry is the fallback result if the chain yields no match.
 * The chain itself is bisected with n-1 items; the inline entry object
 * is mapped when it is the one to return. */
1719 static int generic_array_bisect_plus_one(
1725 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1726 direction_t direction,
1732 bool step_back = false;
1736 assert(test_object);
1741 /* This bisects the array in object 'first', but first checks
1743 r = test_object(f, extra, needle);
1747 if (r == TEST_FOUND)
1748 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1750 /* if we are looking with DIRECTION_UP then we need to first
1751 see if in the actual array there is a matching entry, and
1752 return the last one of that. But if there isn't any we need
1753 to return this one. Hence remember this, and return it
1756 step_back = direction == DIRECTION_UP;
1758 if (r == TEST_RIGHT) {
1759 if (direction == DIRECTION_DOWN)
1765 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1767 if (r == 0 && step_back)
1776 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection callback that compares the entry offset p itself with
 * needle; no object needs to be mapped for this. */
1792 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1798 else if (p < needle)
/* Seeks to an entry by file offset, bisecting the global entry array
 * chain described by the header (entry_array_offset, n_entries). */
1804 int journal_file_move_to_entry_by_offset(
1807 direction_t direction,
1811 return generic_array_bisect(f,
1812 le64toh(f->header->entry_array_offset),
1813 le64toh(f->header->n_entries),
/* Bisection callback that maps the entry at p and compares its
 * sequence number with needle. */
1821 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1828 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1832 if (le64toh(o->entry.seqnum) == needle)
1834 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by sequence number, bisecting the global entry
 * array chain described by the header. */
1840 int journal_file_move_to_entry_by_seqnum(
1843 direction_t direction,
1847 return generic_array_bisect(f,
1848 le64toh(f->header->entry_array_offset),
1849 le64toh(f->header->n_entries),
/* Bisection callback that maps the entry at p and compares its
 * realtime timestamp with needle. */
1856 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1863 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1867 if (le64toh(o->entry.realtime) == needle)
1869 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp, bisecting the global entry
 * array chain described by the header with test_object_realtime(). */
1875 int journal_file_move_to_entry_by_realtime(
1878 direction_t direction,
1882 return generic_array_bisect(f,
1883 le64toh(f->header->entry_array_offset),
1884 le64toh(f->header->n_entries),
1886 test_object_realtime,
/* Bisection callback that maps the entry at p and compares its
 * monotonic timestamp with needle. */
1891 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1898 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1902 if (le64toh(o->entry.monotonic) == needle)
1904 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object for "_BOOT_ID=<id>": the boot ID is
 * formatted as a string directly behind the 9-character "_BOOT_ID="
 * prefix in a buffer sized for the prefix, 32 further characters and
 * the terminating NUL, and the buffer (without the NUL) is passed to
 * journal_file_find_data_object(). */
1910 static inline int find_data_object_by_boot_id(
1915 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1917 sd_id128_to_string(boot_id, t + 9);
1918 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seek to an entry by monotonic timestamp within one boot.
 *
 * First finds the "_BOOT_ID=" data object for boot_id, then bisects the
 * entries linked from that data object (its inline entry_offset plus
 * its entry array, n_entries in total) with test_object_monotonic. */
1921 int journal_file_move_to_entry_by_monotonic(
1925 direction_t direction,
1934 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1940 return generic_array_bisect_plus_one(f,
1941 le64toh(o->data.entry_offset),
1942 le64toh(o->data.entry_array_offset),
1943 le64toh(o->data.n_entries),
1945 test_object_monotonic,
/* Step to the entry that follows (DIRECTION_DOWN) or precedes the
 * entry o located at offset p, in the file-wide entry array.
 *
 * o may be NULL; the assertion below only allows p == 0 in that case.
 * The code picks index 0 or n - 1 depending on direction as a starting
 * point, and otherwise checks that o is an OBJECT_ENTRY and bisects the
 * entry array to find its index. The target entry is then fetched with
 * generic_array_get(), and an offset that does not move in the
 * requested direction relative to p is reported as a corrupted entry
 * array. */
1950 int journal_file_next_entry(
1952 Object *o, uint64_t p,
1953 direction_t direction,
1954 Object **ret, uint64_t *offset) {
1960 assert(p > 0 || !o);
1962 n = le64toh(f->header->n_entries);
1967 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1969 if (o->object.type != OBJECT_ENTRY)
1972 r = generic_array_bisect(f,
1973 le64toh(f->header->entry_array_offset),
1974 le64toh(f->header->n_entries),
1983 if (direction == DIRECTION_DOWN) {
1996 /* And jump to it */
1997 r = generic_array_get(f,
1998 le64toh(f->header->entry_array_offset),
2005 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2006 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Move a signed number of entries (skip) away from the entry o at
 * offset p, in the file-wide entry array.
 *
 * o must be an OBJECT_ENTRY; its index i is found by bisecting the
 * entry array. A negative skip is applied as a subtraction on the
 * unsigned index, after first comparing its magnitude against i, and a
 * positive skip is added. The entry at the resulting index is fetched
 * with generic_array_get(). */
2017 int journal_file_skip_entry(
2019 Object *o, uint64_t p,
2021 Object **ret, uint64_t *offset) {
2030 if (o->object.type != OBJECT_ENTRY)
2033 r = generic_array_bisect(f,
2034 le64toh(f->header->entry_array_offset),
2035 le64toh(f->header->n_entries),
2044 /* Calculate new index */
2046 if ((uint64_t) -skip >= i)
2049 i = i - (uint64_t) -skip;
2051 i += (uint64_t) skip;
2053 n = le64toh(f->header->n_entries);
2060 return generic_array_get(f,
2061 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries that
 * are linked from the data object at data_offset.
 *
 * The data object is loaded first and its n_entries is used as the
 * list length. o may be NULL; the assertion below only allows p == 0 in
 * that case. The code picks index 0 or n - 1 depending on direction as
 * a starting point, and otherwise checks that o is an OBJECT_ENTRY and
 * locates it with generic_array_bisect_plus_one(). The result is
 * fetched with generic_array_get_plus_one(). */
2066 int journal_file_next_entry_for_data(
2068 Object *o, uint64_t p,
2069 uint64_t data_offset,
2070 direction_t direction,
2071 Object **ret, uint64_t *offset) {
2078 assert(p > 0 || !o);
2080 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2084 n = le64toh(d->data.n_entries);
2089 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2091 if (o->object.type != OBJECT_ENTRY)
2094 r = generic_array_bisect_plus_one(f,
2095 le64toh(d->data.entry_offset),
2096 le64toh(d->data.entry_array_offset),
2097 le64toh(d->data.n_entries),
2107 if (direction == DIRECTION_DOWN) {
2121 return generic_array_get_plus_one(f,
2122 le64toh(d->data.entry_offset),
2123 le64toh(d->data.entry_array_offset),
/* Seek by entry offset among the entries linked from the data object
 * at data_offset. Loads that data object and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(). */
2128 int journal_file_move_to_entry_by_offset_for_data(
2130 uint64_t data_offset,
2132 direction_t direction,
2133 Object **ret, uint64_t *offset) {
2140 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2144 return generic_array_bisect_plus_one(f,
2145 le64toh(d->data.entry_offset),
2146 le64toh(d->data.entry_array_offset),
2147 le64toh(d->data.n_entries),
/* Seek by monotonic timestamp among the entries linked from the data
 * object at data_offset.
 *
 * Two entry lists are involved: the list of the "_BOOT_ID=" data object
 * for boot_id (its offset is kept in b) and the list of the data object
 * at data_offset. The boot ID list is first bisected by monotonic time.
 * After that, both data objects are re-loaded with
 * journal_file_move_to_object() and bisected in turn, so that the
 * search settles on an entry present in both lists. */
2154 int journal_file_move_to_entry_by_monotonic_for_data(
2156 uint64_t data_offset,
2159 direction_t direction,
2160 Object **ret, uint64_t *offset) {
2168 /* First, seek by time */
2169 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2175 r = generic_array_bisect_plus_one(f,
2176 le64toh(o->data.entry_offset),
2177 le64toh(o->data.entry_array_offset),
2178 le64toh(o->data.n_entries),
2180 test_object_monotonic,
2186 /* And now, continue seeking until we find an entry that
2187 * exists in both bisection arrays */
2193 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2197 r = generic_array_bisect_plus_one(f,
2198 le64toh(d->data.entry_offset),
2199 le64toh(d->data.entry_array_offset),
2200 le64toh(d->data.n_entries),
2208 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2212 r = generic_array_bisect_plus_one(f,
2213 le64toh(o->data.entry_offset),
2214 le64toh(o->data.entry_array_offset),
2215 le64toh(o->data.n_entries),
/* Seek by sequence number among the entries linked from the data
 * object at data_offset. Loads that data object and bisects its entry
 * list (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(). */
2237 int journal_file_move_to_entry_by_seqnum_for_data(
2239 uint64_t data_offset,
2241 direction_t direction,
2242 Object **ret, uint64_t *offset) {
2249 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2253 return generic_array_bisect_plus_one(f,
2254 le64toh(d->data.entry_offset),
2255 le64toh(d->data.entry_array_offset),
2256 le64toh(d->data.n_entries),
/* Seek by realtime timestamp among the entries linked from the data
 * object at data_offset. Loads that data object and bisects its entry
 * list (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(), using test_object_realtime as
 * comparator. */
2263 int journal_file_move_to_entry_by_realtime_for_data(
2265 uint64_t data_offset,
2267 direction_t direction,
2268 Object **ret, uint64_t *offset) {
2275 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2279 return generic_array_bisect_plus_one(f,
2280 le64toh(d->data.entry_offset),
2281 le64toh(d->data.entry_array_offset),
2282 le64toh(d->data.n_entries),
2284 test_object_realtime,
/* Debugging aid: print the header followed by one line per object in
 * the file to stdout.
 *
 * The walk starts at the offset given by the header's header_size.
 * Each object is loaded without a type restriction (type argument -1)
 * and its type is printed; entry and tag objects also show their key
 * fields, and compression flags are printed when set. The loop compares
 * p against the header's tail_object_offset to detect the last object,
 * and steps forward by the object size rounded up with ALIGN64().
 * "File corrupt" is logged on the error path. */
2289 void journal_file_dump(JournalFile *f) {
2296 journal_file_print_header(f);
2298 p = le64toh(f->header->header_size);
2300 r = journal_file_move_to_object(f, -1, p, &o);
2304 switch (o->object.type) {
2307 printf("Type: OBJECT_UNUSED\n");
2311 printf("Type: OBJECT_DATA\n");
2315 printf("Type: OBJECT_FIELD\n");
2319 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2320 le64toh(o->entry.seqnum),
2321 le64toh(o->entry.monotonic),
2322 le64toh(o->entry.realtime));
2325 case OBJECT_FIELD_HASH_TABLE:
2326 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2329 case OBJECT_DATA_HASH_TABLE:
2330 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2333 case OBJECT_ENTRY_ARRAY:
2334 printf("Type: OBJECT_ENTRY_ARRAY\n");
2338 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2339 le64toh(o->tag.seqnum),
2340 le64toh(o->tag.epoch));
2344 printf("Type: unknown (%u)\n", o->object.type);
2348 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2349 printf("Flags: %s\n",
2350 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2352 if (p == le64toh(f->header->tail_object_offset))
2355 p = p + ALIGN64(le64toh(o->object.size));
2360 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (of size
 * l). NOTE(review): the name suggests a fallback string is returned
 * when format_timestamp() yields NULL -- confirm against the full
 * body. */
2363 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2366 x = format_timestamp(buf, l, t);
/* Print a human-readable summary of the journal file header to stdout.
 *
 * Header integers are converted from little-endian before printing.
 * The 128-bit IDs are formatted into the 33-byte buffers a..d, and the
 * timestamps into x, y and z. Hash table sizes are shown as item counts
 * (byte size divided by sizeof(HashItem)).
 *
 * The counters n_data, n_fields, n_tags and n_entry_arrays are printed
 * only when JOURNAL_HEADER_CONTAINS() reports them present. For n_data
 * and n_fields the fill level is shown as the count relative to the
 * number of hash table items. Finally, if fstat() succeeds, disk usage
 * is printed as st_blocks * 512 bytes. */
2372 void journal_file_print_header(JournalFile *f) {
2373 char a[33], b[33], c[33], d[33];
2374 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2376 char bytes[FORMAT_BYTES_MAX];
2380 printf("File Path: %s\n"
2384 "Sequential Number ID: %s\n"
2386 "Compatible Flags:%s%s\n"
2387 "Incompatible Flags:%s%s%s\n"
2388 "Header size: %"PRIu64"\n"
2389 "Arena size: %"PRIu64"\n"
2390 "Data Hash Table Size: %"PRIu64"\n"
2391 "Field Hash Table Size: %"PRIu64"\n"
2392 "Rotate Suggested: %s\n"
2393 "Head Sequential Number: %"PRIu64"\n"
2394 "Tail Sequential Number: %"PRIu64"\n"
2395 "Head Realtime Timestamp: %s\n"
2396 "Tail Realtime Timestamp: %s\n"
2397 "Tail Monotonic Timestamp: %s\n"
2398 "Objects: %"PRIu64"\n"
2399 "Entry Objects: %"PRIu64"\n",
2401 sd_id128_to_string(f->header->file_id, a),
2402 sd_id128_to_string(f->header->machine_id, b),
2403 sd_id128_to_string(f->header->boot_id, c),
2404 sd_id128_to_string(f->header->seqnum_id, d),
2405 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2406 f->header->state == STATE_ONLINE ? "ONLINE" :
2407 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2408 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2409 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2410 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2411 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2412 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2413 le64toh(f->header->header_size),
2414 le64toh(f->header->arena_size),
2415 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2416 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2417 yes_no(journal_file_rotate_suggested(f, 0)),
2418 le64toh(f->header->head_entry_seqnum),
2419 le64toh(f->header->tail_entry_seqnum),
2420 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2421 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2422 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2423 le64toh(f->header->n_objects),
2424 le64toh(f->header->n_entries));
2426 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2427 printf("Data Objects: %"PRIu64"\n"
2428 "Data Hash Table Fill: %.1f%%\n",
2429 le64toh(f->header->n_data),
2430 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2432 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2433 printf("Field Objects: %"PRIu64"\n"
2434 "Field Hash Table Fill: %.1f%%\n",
2435 le64toh(f->header->n_fields),
2436 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2438 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2439 printf("Tag Objects: %"PRIu64"\n",
2440 le64toh(f->header->n_tags));
2441 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2442 printf("Entry Array Objects: %"PRIu64"\n",
2443 le64toh(f->header->n_entry_arrays));
2445 if (fstat(f->fd, &st) >= 0)
2446 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Open a journal file and, if it is empty and writable, initialize it
 * as a new journal.
 *
 * Visible steps, in order:
 *  - the access mode in flags is checked against O_RDONLY and O_RDWR,
 *    and fname is checked for a ".journal" or ".journal~" suffix;
 *  - a zeroed JournalFile is allocated; its protection bits and
 *    writable flag are derived from flags, and compress selects LZ4 or
 *    XZ compression depending on which is compiled in;
 *  - the mmap cache is either a new reference to mmap_cache or a
 *    freshly created one;
 *  - the file is opened with O_CLOEXEC added and then fstat()ed;
 *  - a writable file of size 0 is treated as new: a creation timestamp
 *    xattr is attached, FSPRG state is loaded, the header is written
 *    (using template), and newly_created is set;
 *  - the file size is checked against HEADER_SIZE_MIN and the header
 *    is mapped MAP_SHARED at file offset 0;
 *  - existing files get their header verified and, when writable,
 *    their FSPRG state loaded;
 *  - metrics are taken from metrics (after journal_default_metrics())
 *    or else copied from template;
 *  - the header is refreshed and HMAC set up; new files get their
 *    field/data hash tables created and a first tag appended; finally
 *    both hash tables are mapped.
 *
 * journal_file_close(f) is called near the end of the function,
 * presumably on the failure path -- confirm against the full body. */
2449 int journal_file_open(
2455 JournalMetrics *metrics,
2456 MMapCache *mmap_cache,
2457 JournalFile *template,
2458 JournalFile **ret) {
2462 bool newly_created = false;
2467 if ((flags & O_ACCMODE) != O_RDONLY &&
2468 (flags & O_ACCMODE) != O_RDWR)
2471 if (!endswith(fname, ".journal") &&
2472 !endswith(fname, ".journal~"))
2475 f = new0(JournalFile, 1);
2483 f->prot = prot_from_flags(flags);
2484 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2485 #if defined(HAVE_LZ4)
2486 f->compress_lz4 = compress;
2487 #elif defined(HAVE_XZ)
2488 f->compress_xz = compress;
2495 f->mmap = mmap_cache_ref(mmap_cache);
2497 f->mmap = mmap_cache_new();
2504 f->path = strdup(fname);
2510 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2511 if (!f->chain_cache) {
2516 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2522 if (fstat(f->fd, &f->last_stat) < 0) {
2527 if (f->last_stat.st_size == 0 && f->writable) {
2530 /* Let's attach the creation time to the journal file,
2531 * so that the vacuuming code knows the age of this
2532 * file even if the file might end up corrupted one
2533 * day... Ideally we'd just use the creation time many
2534 * file systems maintain for each file, but there is
2535 * currently no usable API to query this, hence let's
2536 * emulate this via extended attributes. If extended
2537 * attributes are not supported we'll just skip this,
2538 * and rely solely on mtime/atime/ctime of the file.*/
2540 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2541 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2544 /* Try to load the FSPRG state, and if we can't, then
2545 * just don't do sealing */
2547 r = journal_file_fss_load(f);
2553 r = journal_file_init_header(f, template);
2557 if (fstat(f->fd, &f->last_stat) < 0) {
2562 newly_created = true;
2565 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2570 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2571 if (f->header == MAP_FAILED) {
2577 if (!newly_created) {
2578 r = journal_file_verify_header(f);
2584 if (!newly_created && f->writable) {
2585 r = journal_file_fss_load(f);
2593 journal_default_metrics(metrics, f->fd);
2594 f->metrics = *metrics;
2595 } else if (template)
2596 f->metrics = template->metrics;
2598 r = journal_file_refresh_header(f);
2604 r = journal_file_hmac_setup(f);
2609 if (newly_created) {
2610 r = journal_file_setup_field_hash_table(f);
2614 r = journal_file_setup_data_hash_table(f);
2619 r = journal_file_append_first_tag(f);
2625 r = journal_file_map_field_hash_table(f);
2629 r = journal_file_map_data_hash_table(f);
2637 journal_file_close(f);
/* Rotate a journal file: rename the current file to an archive name
 * and open a fresh file under the original path.
 *
 * The file must be writable and its path must end in ".journal". The
 * archive name is the old path with that 8-character suffix removed
 * (hence "l - 8"), followed by "@<seqnum_id>-<head seqnum>-<head
 * realtime>.journal", the two numbers in zero-padded hex. After the
 * rename the old header's state is set to STATE_ARCHIVED. The new file
 * is opened with the old file's flags and mode, shares its mmap cache,
 * and uses it as template. The old file is then closed. */
2642 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2643 _cleanup_free_ char *p = NULL;
2645 JournalFile *old_file, *new_file = NULL;
2653 if (!old_file->writable)
2656 if (!endswith(old_file->path, ".journal"))
2659 l = strlen(old_file->path);
2660 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2661 (int) l - 8, old_file->path,
2662 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2663 le64toh((*f)->header->head_entry_seqnum),
2664 le64toh((*f)->header->head_entry_realtime));
2668 r = rename(old_file->path, p);
2672 old_file->header->state = STATE_ARCHIVED;
2674 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2675 journal_file_close(old_file);
/* Open a journal file; if it turns out to be unusable, move it out of
 * the way and retry once.
 *
 * The first journal_file_open() result is checked against a fixed set
 * of error codes (corrupted, truncated, other machine, incompatible
 * feature, unclean shutdown, already archived). The recovery path is
 * further gated on the access mode not being O_RDONLY, on O_CREAT
 * being set, and on fname ending in ".journal". It renames the file to
 * a name ending in ".journal~" that embeds the current realtime clock
 * in hex, logs a warning, and calls journal_file_open() again with the
 * same arguments. */
2681 int journal_file_open_reliably(
2687 JournalMetrics *metrics,
2688 MMapCache *mmap_cache,
2689 JournalFile *template,
2690 JournalFile **ret) {
2694 _cleanup_free_ char *p = NULL;
2696 r = journal_file_open(fname, flags, mode, compress, seal,
2697 metrics, mmap_cache, template, ret);
2698 if (r != -EBADMSG && /* corrupted */
2699 r != -ENODATA && /* truncated */
2700 r != -EHOSTDOWN && /* other machine */
2701 r != -EPROTONOSUPPORT && /* incompatible feature */
2702 r != -EBUSY && /* unclean shutdown */
2703 r != -ESHUTDOWN /* already archived */)
2706 if ((flags & O_ACCMODE) == O_RDONLY)
2709 if (!(flags & O_CREAT))
2712 if (!endswith(fname, ".journal"))
2715 /* The file is corrupted. Rotate it away and try it again (but only once) */
2718 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2720 (unsigned long long) now(CLOCK_REALTIME),
2724 r = rename(fname, p);
2728 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2730 return journal_file_open(fname, flags, mode, compress, seal,
2731 metrics, mmap_cache, template, ret);
/* Copy the entry o (located at offset p in "from") into the journal
 * file "to".
 *
 * The entry's monotonic and realtime timestamps are carried over in
 * ts. For each of the entry's n items the referenced data object is
 * loaded from "from" and its hash is compared with the hash stored in
 * the item. The payload length is the object size minus the offset of
 * data.payload. Compressed payloads are decompressed into from's
 * compress_buffer when XZ or LZ4 support is compiled in; without it
 * -EPROTONOSUPPORT is returned. The payload is then appended to "to"
 * with journal_file_append_data(); the resulting data object's offset
 * and hash are recorded in items[] and its hash is XORed into
 * xor_hash.
 *
 * Loading a data object overwrites o, so the entry is re-loaded from
 * offset p afterwards. The new entry is finally written with
 * journal_file_append_entry_internal(). */
2734 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2736 uint64_t q, xor_hash = 0;
2749 ts.monotonic = le64toh(o->entry.monotonic);
2750 ts.realtime = le64toh(o->entry.realtime);
2752 n = journal_file_entry_n_items(o);
2753 /* alloca() can't take 0, hence let's allocate at least one */
2754 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2756 for (i = 0; i < n; i++) {
2763 q = le64toh(o->entry.items[i].object_offset);
2764 le_hash = o->entry.items[i].hash;
2766 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2770 if (le_hash != o->data.hash)
2773 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2776 /* We hit the limit on 32bit machines */
2777 if ((uint64_t) t != l)
2780 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2781 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2784 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2785 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2789 data = from->compress_buffer;
2792 return -EPROTONOSUPPORT;
2795 data = o->data.payload;
2797 r = journal_file_append_data(to, data, l, &u, &h);
2801 xor_hash ^= le64toh(u->data.hash);
2802 items[i].object_offset = htole64(h);
2803 items[i].hash = u->data.hash;
2805 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2810 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fill in the unset fields of m and bring all of them into a
 * consistent range. A field counts as unset when it is (uint64_t) -1.
 *
 * fs_size is the size of the file system behind fd (f_frsize *
 * f_blocks), or 0 if fstatvfs() fails.
 *
 *  max_use:   if unset, 10% of fs_size, page-aligned and clamped to
 *             [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER], with
 *             DEFAULT_MAX_USE_LOWER as the fallback value; if set,
 *             page-aligned and raised to at least twice
 *             JOURNAL_FILE_SIZE_MIN.
 *  max_size:  if unset, one eighth of max_use, capped at
 *             DEFAULT_MAX_SIZE_UPPER; if set, page-aligned. It is then
 *             raised to at least JOURNAL_FILE_SIZE_MIN, and max_use is
 *             raised to at least twice max_size.
 *  min_size:  if unset, JOURNAL_FILE_SIZE_MIN; if set, page-aligned and
 *             raised to at least JOURNAL_FILE_SIZE_MIN. max_size is
 *             raised to min_size if it is smaller.
 *  keep_free: if unset, 15% of fs_size capped at
 *             DEFAULT_KEEP_FREE_UPPER, with DEFAULT_KEEP_FREE as the
 *             fallback value.
 *
 * The resulting values are logged at debug level. */
2813 void journal_default_metrics(JournalMetrics *m, int fd) {
2814 uint64_t fs_size = 0;
2816 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2821 if (fstatvfs(fd, &ss) >= 0)
2822 fs_size = ss.f_frsize * ss.f_blocks;
2824 if (m->max_use == (uint64_t) -1) {
2827 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2829 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2830 m->max_use = DEFAULT_MAX_USE_UPPER;
2832 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2833 m->max_use = DEFAULT_MAX_USE_LOWER;
2835 m->max_use = DEFAULT_MAX_USE_LOWER;
2837 m->max_use = PAGE_ALIGN(m->max_use);
2839 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2840 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2843 if (m->max_size == (uint64_t) -1) {
2844 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2846 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2847 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2849 m->max_size = PAGE_ALIGN(m->max_size);
2851 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2852 m->max_size = JOURNAL_FILE_SIZE_MIN;
2854 if (m->max_size*2 > m->max_use)
2855 m->max_use = m->max_size*2;
2857 if (m->min_size == (uint64_t) -1)
2858 m->min_size = JOURNAL_FILE_SIZE_MIN;
2860 m->min_size = PAGE_ALIGN(m->min_size);
2862 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2863 m->min_size = JOURNAL_FILE_SIZE_MIN;
2865 if (m->min_size > m->max_size)
2866 m->max_size = m->min_size;
2869 if (m->keep_free == (uint64_t) -1) {
2872 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2874 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2875 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2878 m->keep_free = DEFAULT_KEEP_FREE;
2881 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2882 format_bytes(a, sizeof(a), m->max_use),
2883 format_bytes(b, sizeof(b), m->max_size),
2884 format_bytes(c, sizeof(c), m->min_size),
2885 format_bytes(d, sizeof(d), m->keep_free));
/* Report the realtime range covered by the file, taken from the header:
 * *from receives head_entry_realtime and *to receives
 * tail_entry_realtime, both converted from little-endian. Each header
 * field is tested for 0 before it is used. */
2888 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2893 if (f->header->head_entry_realtime == 0)
2896 *from = le64toh(f->header->head_entry_realtime);
2900 if (f->header->tail_entry_realtime == 0)
2903 *to = le64toh(f->header->tail_entry_realtime);
/* Report the monotonic time range that the file covers for one boot.
 *
 * Finds the "_BOOT_ID=" data object for boot_id (offset kept in p) and
 * checks its n_entries count. *from is the monotonic timestamp of the
 * entry at the data object's entry_offset. For *to the data object is
 * re-loaded from p, because o was reused for the entry, and the entry
 * at index n_entries - 1 of its entry list is fetched with
 * generic_array_get_plus_one(). */
2909 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2917 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2921 if (le64toh(o->data.n_entries) <= 0)
2925 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2929 *from = le64toh(o->entry.monotonic);
2933 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2937 r = generic_array_get_plus_one(f,
2938 le64toh(o->data.entry_offset),
2939 le64toh(o->data.entry_array_offset),
2940 le64toh(o->data.n_entries)-1,
2945 *to = le64toh(o->entry.monotonic);
2951 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2954 /* If we gained new header fields we gained new features,
2955 * hence suggest a rotation */
2956 if (le64toh(f->header->header_size) < sizeof(Header)) {
2957 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2961 /* Let's check if the hash tables grew over a certain fill
2962 * level (75%, borrowing this value from Java's hash table
2963 * implementation), and if so suggest a rotation. To calculate
2964 * the fill level we need the n_data field, which only exists
2965 * in newer versions. */
2967 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2968 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2969 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2971 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2972 le64toh(f->header->n_data),
2973 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2974 (unsigned long long) f->last_stat.st_size,
2975 f->last_stat.st_size / le64toh(f->header->n_data));
2979 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2980 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2981 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2983 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2984 le64toh(f->header->n_fields),
2985 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2989 /* Are the data objects properly indexed by field objects? */
2990 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2991 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2992 le64toh(f->header->n_data) > 0 &&
2993 le64toh(f->header->n_fields) == 0)
2996 if (max_file_usec > 0) {
2999 h = le64toh(f->header->head_entry_realtime);
3000 t = now(CLOCK_REALTIME);
3002 if (h > 0 && t > h + max_file_usec)