#include "disk-io.h"
#include "print-tree.h"
#include "utils.h"
-#include "ulist.h"
+#include "kernel-shared/ulist.h"
#include "rbtree-utils.h"
+#include "transaction.h"
+#include "repair.h"
#include "qgroup-verify.h"
/*#define QGROUP_VERIFY_DEBUG*/
static unsigned long tot_extents_scanned = 0;
-static void add_bytes(u64 root_objectid, u64 num_bytes, int exclusive);
+struct qgroup_count;
+static struct qgroup_count *find_count(u64 qgroupid);
struct qgroup_info {
u64 referenced;
struct qgroup_info info;
struct rb_node rb_node;
+
+ /* Parents when we are a child group */
+ struct list_head groups;
+
+ /*
+ * Children when we are a parent group (not currently used but
+ * maintained to mirror kernel handling of qgroups)
+ */
+ struct list_head members;
+
+ u64 cur_refcnt;
+
+ struct list_head bad_list;
};
static struct counts_tree {
unsigned int qgroup_inconsist:1;
} counts = { .root = RB_ROOT };
+static LIST_HEAD(bad_qgroups);
+
static struct rb_root by_bytenr = RB_ROOT;
/*
+ * Glue structure to represent the relations between qgroups. Mirrored
+ * from kernel.
+ */
+struct btrfs_qgroup_list {
+ struct list_head next_group;
+ struct list_head next_member;
+ struct qgroup_count *group; /* Parent group */
+ struct qgroup_count *member;
+};
+
+/* Allow us to reset ref counts during accounting without zeroing each group. */
+static u64 qgroup_seq = 1ULL;
+
+static inline void update_cur_refcnt(struct qgroup_count *c)
+{
+ if (c->cur_refcnt < qgroup_seq)
+ c->cur_refcnt = qgroup_seq;
+ c->cur_refcnt++;
+}
+
+static inline u64 group_get_cur_refcnt(struct qgroup_count *c)
+{
+ if (c->cur_refcnt < qgroup_seq)
+ return 0;
+ return c->cur_refcnt - qgroup_seq;
+}
+
+static void inc_qgroup_seq(int root_count)
+{
+ qgroup_seq += root_count + 1;
+}
+
+/*
* List of interior tree blocks. We walk this list after loading the
* extent tree to resolve implied refs. For each interior node we'll
* place a shared ref in the ref tree against each child object. This
/*
* Resolves all the possible roots for the ref at parent.
*/
-static void find_parent_roots(struct ulist *roots, u64 parent)
+static int find_parent_roots(struct ulist *roots, u64 parent)
{
struct ref *ref;
struct rb_node *node;
+ int ret;
/*
* Search the rbtree for the first ref with bytenr == parent.
* For each unresolved root, we recurse
*/
ref = find_ref_bytenr(parent);
+ if (!ref) {
+ error("bytenr ref not found for parent %llu",
+ (unsigned long long)parent);
+ return -EIO;
+ }
node = &ref->bytenr_node;
- BUG_ON(ref == NULL);
- BUG_ON(ref->bytenr != parent);
+ if (ref->bytenr != parent) {
+ error("found bytenr ref does not match parent: %llu != %llu",
+ (unsigned long long)ref->bytenr,
+ (unsigned long long)parent);
+ return -EIO;
+ }
{
/*
*/
struct rb_node *prev_node = rb_prev(&ref->bytenr_node);
struct ref *prev;
+
if (prev_node) {
prev = rb_entry(prev_node, struct ref, bytenr_node);
- BUG_ON(prev->bytenr == parent);
+ if (prev->bytenr == parent) {
+ error(
+ "unexpected: prev bytenr same as parent: %llu",
+ (unsigned long long)parent);
+ return -EIO;
+ }
}
}
do {
- if (ref->root)
- ulist_add(roots, ref->root, 0, 0);
- else
- find_parent_roots(roots, ref->parent);
+ if (ref->root) {
+ if (is_fstree(ref->root)) {
+ ret = ulist_add(roots, ref->root, 0, 0);
+ if (ret < 0)
+ goto out;
+ }
+ } else if (ref->parent == ref->bytenr) {
+ /*
+ * Special loop case for tree reloc tree
+ */
+ ref->root = BTRFS_TREE_RELOC_OBJECTID;
+ } else {
+ ret = find_parent_roots(roots, ref->parent);
+ if (ret < 0)
+ goto out;
+ }
node = rb_next(node);
if (node)
ref = rb_entry(node, struct ref, bytenr_node);
} while (node && ref->bytenr == parent);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int account_one_extent(struct ulist *roots, u64 bytenr, u64 num_bytes)
+{
+ int ret;
+ u64 id, nr_roots, nr_refs;
+ struct qgroup_count *count;
+ struct ulist *counts = ulist_alloc(0);
+ struct ulist *tmp = ulist_alloc(0);
+ struct ulist_iterator uiter;
+ struct ulist_iterator tmp_uiter;
+ struct ulist_node *unode;
+ struct ulist_node *tmp_unode;
+ struct btrfs_qgroup_list *glist;
+
+ if (!counts || !tmp) {
+ ulist_free(counts);
+ ulist_free(tmp);
+ return ENOMEM;
+ }
+
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ BUG_ON(unode->val == 0ULL);
+
+ /*
+ * For each root, find their corresponding tracking group and
+ * add it to our qgroups list.
+ */
+ count = find_count(unode->val);
+ if (!count)
+ continue;
+
+ BUG_ON(!is_fstree(unode->val));
+ ret = ulist_add(counts, count->qgroupid, ptr_to_u64(count), 0);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * Now we look for parents (and parents of those...). Use a tmp
+ * ulist here to avoid re-walking (and re-incrementing) our
+ * already added items on every loop iteration.
+ */
+ ulist_reinit(tmp);
+ ret = ulist_add(tmp, count->qgroupid, ptr_to_u64(count), 0);
+ if (ret < 0)
+ goto out;
+
+ ULIST_ITER_INIT(&tmp_uiter);
+ while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ /* Bump the refcount on a node every time we see it. */
+ count = u64_to_ptr(tmp_unode->aux);
+ update_cur_refcnt(count);
+
+ list_for_each_entry(glist, &count->groups, next_group) {
+ struct qgroup_count *parent;
+ parent = glist->group;
+ id = parent->qgroupid;
+
+ BUG_ON(!count);
+
+ ret = ulist_add(counts, id, ptr_to_u64(parent),
+ 0);
+ if (ret < 0)
+ goto out;
+ ret = ulist_add(tmp, id, ptr_to_u64(parent),
+ 0);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * Now that we have gathered up and counted all the groups, we
+ * can add bytes for this ref.
+ */
+ nr_roots = roots->nnodes;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(counts, &uiter))) {
+ count = u64_to_ptr(unode->aux);
+
+ nr_refs = group_get_cur_refcnt(count);
+ if (nr_refs) {
+ count->info.referenced += num_bytes;
+ count->info.referenced_compressed += num_bytes;
+
+ if (nr_refs == nr_roots) {
+ count->info.exclusive += num_bytes;
+ count->info.exclusive_compressed += num_bytes;
+ }
+ }
+#ifdef QGROUP_VERIFY_DEBUG
+ printf("account (%llu, %llu), qgroup %llu/%llu, rfer %llu,"
+ " excl %llu, refs %llu, roots %llu\n", bytenr, num_bytes,
+ btrfs_qgroup_level(count->qgroupid),
+ btrfs_qgroup_subvid(count->qgroupid),
+ count->info.referenced, count->info.exclusive, nr_refs,
+ nr_roots);
+#endif
+ }
+
+ inc_qgroup_seq(roots->nnodes);
+ ret = 0;
+out:
+ ulist_free(counts);
+ ulist_free(tmp);
+ return ret;
}
static void print_subvol_info(u64 subvolid, u64 bytenr, u64 num_bytes,
* - resolve all possible roots for shared refs, insert each
* of those into ref_roots ulist (this is a recursive process)
*
- * - Walk ref_roots ulist, adding extent bytes to each qgroup count that
- * cooresponds to a found root.
+ * - With all roots resolved we can account the ref - this is done in
+ * account_one_extent().
*/
-static void account_all_refs(int do_qgroups, u64 search_subvol)
+static int account_all_refs(int do_qgroups, u64 search_subvol)
{
- int exclusive;
struct ref *ref;
struct rb_node *node;
u64 bytenr, num_bytes;
struct ulist *roots = ulist_alloc(0);
- struct ulist_iterator uiter;
- struct ulist_node *unode;
+ int ret;
node = rb_first(&by_bytenr);
while (node) {
do {
BUG_ON(ref->bytenr != bytenr);
BUG_ON(ref->num_bytes != num_bytes);
- if (ref->root)
- ulist_add(roots, ref->root, 0, 0);
- else
- find_parent_roots(roots, ref->parent);
+ if (ref->root) {
+ if (is_fstree(ref->root)) {
+ if (ulist_add(roots, ref->root, 0, 0) < 0)
+ goto enomem;
+ }
+ } else {
+ ret = find_parent_roots(roots, ref->parent);
+ if (ret < 0)
+ goto enomem;
+ }
/*
* When we leave this inner loop, node is set
ref = rb_entry(node, struct ref, bytenr_node);
} while (node && ref->bytenr == bytenr);
- /*
- * Now that we have all roots, we can properly account
- * this extent against the corresponding qgroups.
- */
- if (roots->nnodes == 1)
- exclusive = 1;
- else
- exclusive = 0;
-
if (search_subvol)
print_subvol_info(search_subvol, bytenr, num_bytes,
roots);
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- BUG_ON(unode->val == 0ULL);
- /* We only want to account fs trees */
- if (is_fstree(unode->val) && do_qgroups)
- add_bytes(unode->val, num_bytes, exclusive);
- }
+ if (!do_qgroups)
+ continue;
+
+ if (account_one_extent(roots, bytenr, num_bytes))
+ goto enomem;
}
ulist_free(roots);
+ return 0;
+enomem:
+ error("Out of memory while accounting refs for qgroups");
+ return -ENOMEM;
}
static u64 resolve_one_root(u64 bytenr)
if (ref->root)
return ref->root;
+ if (ref->parent == bytenr)
+ return BTRFS_TREE_RELOC_OBJECTID;
return resolve_one_root(ref->parent);
}
// printf("travel_tree: bytenr: %llu\tnum_bytes: %llu\tref_parent: %llu\n",
// bytenr, num_bytes, ref_parent);
- eb = read_tree_block(root, bytenr, num_bytes, 0);
+ eb = read_tree_block(info, bytenr, 0);
if (!extent_buffer_uptodate(eb))
return -EIO;
nr = btrfs_header_nritems(eb);
for (i = 0; i < nr; i++) {
new_bytenr = btrfs_node_blockptr(eb, i);
- new_num_bytes = root->nodesize;
+ new_num_bytes = info->nodesize;
ret = travel_tree(info, root, new_bytenr, new_num_bytes,
ref_parent);
struct btrfs_root *root;
struct btrfs_key key;
+ /* Tree reloc tree doesn't contribute qgroup, skip it */
+ if (root_id == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
key.objectid = root_id;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
item->exclusive = btrfs_qgroup_info_exclusive(leaf, disk);
item->exclusive_compressed =
btrfs_qgroup_info_exclusive_compressed(leaf, disk);
+ INIT_LIST_HEAD(&c->groups);
+ INIT_LIST_HEAD(&c->members);
+ INIT_LIST_HEAD(&c->bad_list);
if (insert_count(c)) {
free(c);
return c;
}
-static void add_bytes(u64 root_objectid, u64 num_bytes, int exclusive)
+static int add_qgroup_relation(u64 memberid, u64 parentid)
{
- struct qgroup_count *count = find_count(root_objectid);
- struct qgroup_info *qg;
+ struct qgroup_count *member;
+ struct qgroup_count *parent;
+ struct btrfs_qgroup_list *list;
- BUG_ON(num_bytes < 4096); /* Random sanity check. */
+ if (memberid > parentid)
+ return 0;
- if (!count)
- return;
+ member = find_count(memberid);
+ parent = find_count(parentid);
+ if (!member || !parent)
+ return -ENOENT;
- qg = &count->info;
+ list = calloc(1, sizeof(*list));
+ if (!list)
+ return -ENOMEM;
- qg->referenced += num_bytes;
- /*
- * count of compressed bytes is unimplemented, so we do the
- * same as kernel.
- */
- qg->referenced_compressed += num_bytes;
+ list->group = parent;
+ list->member = member;
+ list_add_tail(&list->next_group, &member->groups);
+ list_add_tail(&list->next_member, &parent->members);
- if (exclusive) {
- qg->exclusive += num_bytes;
- qg->exclusive_compressed += num_bytes;
- }
+ return 0;
}
-static void read_qgroup_status(struct btrfs_path *path,
+static void read_qgroup_status(struct extent_buffer *eb, int slot,
struct counts_tree *counts)
{
struct btrfs_qgroup_status_item *status_item;
u64 flags;
- status_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_qgroup_status_item);
- flags = btrfs_qgroup_status_flags(path->nodes[0], status_item);
- counts->qgroup_inconsist = flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- counts->rescan_running = flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ status_item = btrfs_item_ptr(eb, slot, struct btrfs_qgroup_status_item);
+ flags = btrfs_qgroup_status_flags(eb, status_item);
+ /*
+ * Since qgroup_inconsist/rescan_running is just one bit,
+ * assign value directly won't work.
+ */
+ counts->qgroup_inconsist = !!(flags &
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT);
+ counts->rescan_running = !!(flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN);
}
static int load_quota_info(struct btrfs_fs_info *info)
struct btrfs_qgroup_info_item *item;
struct qgroup_count *count;
int i, nr;
+ int search_relations = 0;
+loop:
+ /*
+ * Do 2 passes, the first allocates group counts and reads status
+ * items. The 2nd pass picks up relation items and glues them to their
+ * respective count structures.
+ */
btrfs_init_path(&path);
key.offset = 0;
- key.objectid = 0;
+ key.objectid = search_relations ? 0 : BTRFS_QGROUP_RELATION_KEY;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
btrfs_item_key(leaf, &disk_key, i);
btrfs_disk_key_to_cpu(&key, &disk_key);
+ if (search_relations) {
+ if (key.type == BTRFS_QGROUP_RELATION_KEY) {
+ ret = add_qgroup_relation(key.objectid,
+ key.offset);
+ if (ret) {
+ error("out of memory");
+ goto out;
+ }
+ }
+ continue;
+ }
+
if (key.type == BTRFS_QGROUP_STATUS_KEY) {
- read_qgroup_status(&path, &counts);
+ read_qgroup_status(leaf, i, &counts);
continue;
}
- if (key.type == BTRFS_QGROUP_RELATION_KEY)
- printf("Ignoring qgroup relation key %llu\n",
- key.objectid);
/*
- * Ignore: BTRFS_QGROUP_LIMIT_KEY,
- * BTRFS_QGROUP_RELATION_KEY
+ * At this point, we can ignore anything that
+ * isn't a qgroup info.
*/
if (key.type != BTRFS_QGROUP_INFO_KEY)
continue;
ret = 0;
btrfs_release_path(&path);
+
+ if (!search_relations) {
+ search_relations = 1;
+ goto loop;
+ }
+
out:
return ret;
}
bytenr = key.objectid;
num_bytes = key.offset;
if (key.type == BTRFS_METADATA_ITEM_KEY) {
- num_bytes = info->extent_root->nodesize;
+ num_bytes = info->nodesize;
meta = 1;
}
prefix, type, bytes, type, bytes_compressed);
}
+static inline int qgroup_printable(struct qgroup_count *c)
+{
+ return !!(c->subvol_exists || btrfs_qgroup_level(c->qgroupid));
+}
+
static int report_qgroup_difference(struct qgroup_count *count, int verbose)
{
int is_different;
is_different = excl_diff || ref_diff;
- if (verbose || (is_different && count->subvol_exists)) {
- printf("Counts for qgroup id: %llu %s\n",
- (unsigned long long)count->qgroupid,
+ if (verbose || (is_different && qgroup_printable(count))) {
+ printf("Counts for qgroup id: %llu/%llu %s\n",
+ btrfs_qgroup_level(count->qgroupid),
+ btrfs_qgroup_subvid(count->qgroupid),
is_different ? "are different" : "");
print_fields(info->referenced, info->referenced_compressed,
print_fields_signed(excl_diff, excl_diff,
"diff:", "exclusive");
}
- return (is_different && count->subvol_exists);
+
+ return is_different;
}
-int report_qgroups(int all)
+void report_qgroups(int all)
{
struct rb_node *node;
struct qgroup_count *c;
- int ret = 0;
- if (counts.rescan_running) {
+ if (!repair && counts.rescan_running) {
if (all) {
printf(
- "Qgroup rescan is running, qgroup counts difference is expected\n");
+ "Qgroup rescan is running, a difference in qgroup counts is expected\n");
} else {
printf(
- "Qgroup rescan is running, ignore qgroup check\n");
- return ret;
+ "Qgroup rescan is running, qgroups will not be printed.\n");
+ return;
}
}
if (counts.qgroup_inconsist && !counts.rescan_running)
- fprintf(stderr, "Qgroup is already inconsistent before checking\n");
+ fprintf(stderr, "Qgroup are marked as inconsistent.\n");
node = rb_first(&counts.root);
while (node) {
c = rb_entry(node, struct qgroup_count, rb_node);
- ret |= report_qgroup_difference(c, all);
+
+ if (report_qgroup_difference(c, all))
+ list_add_tail(&c->bad_list, &bad_qgroups);
+
node = rb_next(node);
}
- return ret;
+}
+
+void free_qgroup_counts(void)
+{
+ struct rb_node *node;
+ struct qgroup_count *c;
+ struct btrfs_qgroup_list *glist, *tmpglist;
+
+ node = rb_first(&counts.root);
+ while (node) {
+ c = rb_entry(node, struct qgroup_count, rb_node);
+
+ list_del(&c->bad_list);
+
+ list_for_each_entry_safe(glist, tmpglist, &c->groups,
+ next_group) {
+ list_del(&glist->next_group);
+ list_del(&glist->next_member);
+ free(glist);
+ }
+ list_for_each_entry_safe(glist, tmpglist, &c->members,
+ next_group) {
+ list_del(&glist->next_group);
+ list_del(&glist->next_member);
+ free(glist);
+ }
+
+ node = rb_next(node);
+
+ rb_erase(&c->rb_node, &counts.root);
+ free(c);
+ }
}
int qgroup_verify_all(struct btrfs_fs_info *info)
goto out;
}
- account_all_refs(1, 0);
+ ret = account_all_refs(1, 0);
out:
/*
}
printf("Offset\t\tLen\tRoot Refs\tRoots\n");
- account_all_refs(0, subvol);
+ ret = account_all_refs(0, subvol);
out:
free_tree_blocks();
return ret;
}
+static int repair_qgroup_info(struct btrfs_fs_info *info,
+ struct qgroup_count *count)
+{
+ int ret;
+ struct btrfs_root *root = info->quota_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path path;
+ struct btrfs_qgroup_info_item *info_item;
+ struct btrfs_key key;
+
+ printf("Repair qgroup %llu/%llu\n", btrfs_qgroup_level(count->qgroupid),
+ btrfs_qgroup_subvid(count->qgroupid));
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ btrfs_init_path(&path);
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_INFO_KEY;
+ key.offset = count->qgroupid;
+ ret = btrfs_search_slot(trans, root, &key, &path, 0, 1);
+ if (ret) {
+ error("could not find disk item for qgroup %llu/%llu",
+ btrfs_qgroup_level(count->qgroupid),
+ btrfs_qgroup_subvid(count->qgroupid));
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+
+ info_item = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_qgroup_info_item);
+
+ btrfs_set_qgroup_info_generation(path.nodes[0], info_item,
+ trans->transid);
+
+ btrfs_set_qgroup_info_referenced(path.nodes[0], info_item,
+ count->info.referenced);
+ btrfs_set_qgroup_info_referenced_compressed(path.nodes[0], info_item,
+ count->info.referenced_compressed);
+
+ btrfs_set_qgroup_info_exclusive(path.nodes[0], info_item,
+ count->info.exclusive);
+ btrfs_set_qgroup_info_exclusive_compressed(path.nodes[0], info_item,
+ count->info.exclusive_compressed);
+
+ btrfs_mark_buffer_dirty(path.nodes[0]);
+
+out:
+ btrfs_commit_transaction(trans, root);
+ btrfs_release_path(&path);
+
+ return ret;
+}
+
+static int repair_qgroup_status(struct btrfs_fs_info *info)
+{
+ int ret;
+ struct btrfs_root *root = info->quota_root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path path;
+ struct btrfs_key key;
+ struct btrfs_qgroup_status_item *status_item;
+
+ printf("Repair qgroup status item\n");
+
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ btrfs_init_path(&path);
+ key.objectid = 0;
+ key.type = BTRFS_QGROUP_STATUS_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(trans, root, &key, &path, 0, 1);
+ if (ret) {
+ error("could not find qgroup status item");
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+
+ status_item = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_qgroup_status_item);
+ btrfs_set_qgroup_status_flags(path.nodes[0], status_item,
+ BTRFS_QGROUP_STATUS_FLAG_ON);
+ btrfs_set_qgroup_status_rescan(path.nodes[0], status_item, 0);
+ btrfs_set_qgroup_status_generation(path.nodes[0], status_item,
+ trans->transid);
+
+ btrfs_mark_buffer_dirty(path.nodes[0]);
+
+out:
+ btrfs_commit_transaction(trans, root);
+ btrfs_release_path(&path);
+
+ return ret;
+}
+
+int repair_qgroups(struct btrfs_fs_info *info, int *repaired)
+{
+ int ret = 0;
+ struct qgroup_count *count, *tmpcount;
+
+ *repaired = 0;
+
+ if (!repair)
+ return 0;
+
+ list_for_each_entry_safe(count, tmpcount, &bad_qgroups, bad_list) {
+ ret = repair_qgroup_info(info, count);
+ if (ret) {
+ goto out;
+ }
+
+ (*repaired)++;
+
+ list_del_init(&count->bad_list);
+ }
+
+ /*
+ * Do this step last as we want the latest transaction id on
+ * our qgroup status to avoid a (useless) warning after
+ * mount.
+ */
+ if (*repaired || counts.qgroup_inconsist || counts.rescan_running) {
+ ret = repair_qgroup_status(info);
+ if (ret)
+ goto out;
+
+ (*repaired)++;
+ }
+
+out:
+ return ret;
+}