2 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
3 * Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
5 * This file is part of LVM2.
7 * This copyrighted material is made available to anyone wishing to use,
8 * modify, copy, or redistribute it subject to the terms and conditions
9 * of the GNU Lesser General Public License v.2.1.
11 * You should have received a copy of the GNU Lesser General Public License
12 * along with this program; if not, write to the Free Software Foundation,
13 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 #include "toolcontext.h"
20 #include "lvm-string.h"
32 #include "filter-persistent.h"
35 #include <sys/param.h>
37 static struct physical_volume *_pv_read(struct cmd_context *cmd,
38 struct dm_pool *pvmem,
41 uint64_t *label_sector,
42 int warnings, int scan_label_only);
44 static struct physical_volume *_find_pv_by_name(struct cmd_context *cmd,
47 static struct pv_list *_find_pv_in_vg(const struct volume_group *vg,
50 static struct pv_list *_find_pv_in_vg_by_uuid(const struct volume_group *vg,
53 static uint32_t _vg_bad_status_bits(const struct volume_group *vg,
56 const char _really_init[] =
57 "Really INITIALIZE physical volume \"%s\" of volume group \"%s\" [y/n]? ";
/*
 * Decide whether a device-suggested alignment should override the
 * configured default PE alignment.
 *
 * Returns nonzero only when data_alignment is nonzero AND does not
 * evenly divide default_pe_align (i.e. the default would not already
 * satisfy the suggested alignment).
 */
static int _alignment_overrides_default(unsigned long data_alignment,
					unsigned long default_pe_align)
{
	if (!data_alignment)
		return 0;

	return (default_pe_align % data_alignment) != 0;
}
/*
 * set_pe_align - choose the PV's physical-extent alignment, in sectors.
 *
 * A nonzero data_alignment argument is used verbatim.  Otherwise the
 * default comes from config ("devices/default_data_alignment"), is kept
 * at least as large as the system page size, and may then be raised by
 * MD stripe width ("devices/md_chunk_alignment") or device-topology
 * io-size hints ("devices/data_alignment_detection") when
 * _alignment_overrides_default() approves the candidate value.
 *
 * NOTE(review): this listing is elided — braces/returns between the
 * visible lines are missing from this view.
 */
65 unsigned long set_pe_align(struct physical_volume *pv, unsigned long data_alignment)
67 unsigned long default_pe_align, temp_pe_align;
73 /* Always use specified data_alignment */
74 pv->pe_align = data_alignment;
78 default_pe_align = find_config_tree_int(pv->fmt->cmd,
79 "devices/default_data_alignment",
80 DEFAULT_DATA_ALIGNMENT);
83 /* align on 1 MiB multiple */
84 default_pe_align *= DEFAULT_PE_ALIGN;
86 /* align on 64 KiB multiple (old default) */
87 default_pe_align = DEFAULT_PE_ALIGN_OLD;
/* Never align below the page size; convert bytes <-> sectors via SECTOR_SHIFT. */
89 pv->pe_align = MAX((default_pe_align << SECTOR_SHIFT),
90 lvm_getpagesize()) >> SECTOR_SHIFT;
96 * Align to stripe-width of underlying md device if present
98 if (find_config_tree_bool(pv->fmt->cmd, "devices/md_chunk_alignment",
99 DEFAULT_MD_CHUNK_ALIGNMENT)) {
100 temp_pe_align = dev_md_stripe_width(pv->fmt->cmd->sysfs_dir, pv->dev);
101 if (_alignment_overrides_default(temp_pe_align, default_pe_align))
102 pv->pe_align = MAX(pv->pe_align, temp_pe_align);
106 * Align to topology's minimum_io_size or optimal_io_size if present
107 * - minimum_io_size - the smallest request the device can perform
108 * w/o incurring a read-modify-write penalty (e.g. MD's chunk size)
109 * - optimal_io_size - the device's preferred unit of receiving I/O
110 * (e.g. MD's stripe width)
112 if (find_config_tree_bool(pv->fmt->cmd,
113 "devices/data_alignment_detection",
114 DEFAULT_DATA_ALIGNMENT_DETECTION)) {
115 temp_pe_align = dev_minimum_io_size(pv->fmt->cmd->sysfs_dir, pv->dev);
116 if (_alignment_overrides_default(temp_pe_align, default_pe_align))
117 pv->pe_align = MAX(pv->pe_align, temp_pe_align);
119 temp_pe_align = dev_optimal_io_size(pv->fmt->cmd->sysfs_dir, pv->dev);
120 if (_alignment_overrides_default(temp_pe_align, default_pe_align))
121 pv->pe_align = MAX(pv->pe_align, temp_pe_align);
125 log_very_verbose("%s: Setting PE alignment to %lu sectors.",
126 dev_name(pv->dev), pv->pe_align);
/*
 * set_pe_align_offset - choose the PV's PE alignment offset, in sectors.
 *
 * An already-set pv->pe_align_offset is kept; an explicit
 * data_alignment_offset is used verbatim; otherwise, when
 * "devices/data_alignment_offset_detection" is enabled, the offset is
 * taken from the device's sysfs alignment_offset.  Returns the chosen
 * pv->pe_align_offset.
 */
131 unsigned long set_pe_align_offset(struct physical_volume *pv,
132 unsigned long data_alignment_offset)
134 if (pv->pe_align_offset)
137 if (data_alignment_offset) {
138 /* Always use specified data_alignment_offset */
139 pv->pe_align_offset = data_alignment_offset;
146 if (find_config_tree_bool(pv->fmt->cmd,
147 "devices/data_alignment_offset_detection",
148 DEFAULT_DATA_ALIGNMENT_OFFSET_DETECTION)) {
149 int align_offset = dev_alignment_offset(pv->fmt->cmd->sysfs_dir,
151 /* must handle a -1 alignment_offset; means dev is misaligned */
152 if (align_offset < 0)
154 pv->pe_align_offset = MAX(pv->pe_align_offset, align_offset);
158 log_very_verbose("%s: Setting PE alignment offset to %lu sectors.",
159 dev_name(pv->dev), pv->pe_align_offset);
161 return pv->pe_align_offset;
/*
 * add_pvl_to_vgs - link a pv_list entry onto the VG's PV list.
 * NOTE(review): elided listing — the missing lines presumably also set
 * the pv's back-pointer to the vg and bump vg->pv_count (cf.
 * del_pvl_from_vgs which clears pvl->pv->vg) — confirm in full source.
 */
164 void add_pvl_to_vgs(struct volume_group *vg, struct pv_list *pvl)
166 dm_list_add(&vg->pvs, &pvl->list);
/*
 * del_pvl_from_vgs - unlink a pv_list entry from its VG and orphan the PV
 * (the pv keeps no back-pointer to the vg afterwards).
 */
171 void del_pvl_from_vgs(struct volume_group *vg, struct pv_list *pvl)
174 dm_list_del(&pvl->list);
175 pvl->pv->vg = NULL; /* orphan */
180 * add_pv_to_vg - Add a physical volume to a volume group
181 * @vg - volume group to add to
182 * @pv_name - name of the pv (to be removed)
183 * @pv - physical volume to add to volume group
188 * FIXME: remove pv_name - obtain safely from pv
/*
 * add_pv_to_vg - validate a PV and attach it to a VG.
 *
 * Checks performed before attaching: the PV must be an orphan, match the
 * VG's metadata format, not be stacked on an LV of the same VG, not be a
 * duplicate (by name or uuid), fit under vg->max_pv, and not push the
 * VG's extent count past UINT32_MAX.  On success the VG's extent/free
 * counters are increased by the PV's extent count.
 *
 * NOTE(review): elided listing — error-return lines and closing braces
 * are missing from this view.
 */
190 int add_pv_to_vg(struct volume_group *vg, const char *pv_name,
191 struct physical_volume *pv)
194 struct format_instance *fid = vg->fid;
195 struct dm_pool *mem = vg->vgmem;
196 char uuid[64] __attribute__((aligned(8)));
197 struct dm_list *mdas;
199 log_verbose("Adding physical volume '%s' to volume group '%s'",
202 if (!(pvl = dm_pool_zalloc(mem, sizeof(*pvl)))) {
203 log_error("pv_list allocation for '%s' failed", pv_name);
207 if (!is_orphan_vg(pv->vg_name)) {
208 log_error("Physical volume '%s' is already in volume group "
209 "'%s'", pv_name, pv->vg_name);
213 if (pv->fmt != fid->fmt) {
214 log_error("Physical volume %s is of different format type (%s)",
215 pv_name, pv->fmt->name);
219 /* Ensure PV doesn't depend on another PV already in the VG */
220 if (pv_uses_vg(pv, vg)) {
221 log_error("Physical volume %s might be constructed from same "
222 "volume group %s", pv_name, vg->name);
226 if (!(pv->vg_name = dm_pool_strdup(mem, vg->name))) {
227 log_error("vg->name allocation failed for '%s'", pv_name);
231 memcpy(&pv->vgid, &vg->id, sizeof(vg->id));
233 /* Units of 512-byte sectors */
234 pv->pe_size = vg->extent_size;
237 * pe_count must always be calculated by pv_setup
239 pv->pe_alloc_count = 0;
242 * FIXME: this does not work entirely correctly in the case where a PV
243 * has 2 mdas and only one is ignored; ideally all non-ignored mdas
244 * should be placed on metadata_areas list and ignored on the
245 * metadata_areas_ignored list; however this requires another
246 * fairly complex refactoring to remove the 'mdas' parameter from both
247 * pv_setup and pv_write. For now, we only put ignored mdas on the
248 * metadata_areas_ignored list if all mdas in the PV are ignored;
249 * otherwise, we use the non-ignored list.
251 if (!pv_mda_used_count(pv))
252 mdas = &fid->metadata_areas_ignored;
254 mdas = &fid->metadata_areas_in_use;
256 if (!fid->fmt->ops->pv_setup(fid->fmt, UINT64_C(0), 0,
257 vg->extent_size, 0, 0, 0UL, UINT64_C(0),
259 log_error("Format-specific setup of physical volume '%s' "
/* Reject duplicates: same device name or same PV uuid already in the VG. */
264 if (_find_pv_in_vg(vg, pv_name) ||
265 _find_pv_in_vg_by_uuid(vg, &pv->id)) {
266 if (!id_write_format(&pv->id, uuid, sizeof(uuid))) {
270 log_error("Physical volume '%s (%s)' listed more than once.",
275 if (vg->pv_count && (vg->pv_count == vg->max_pv)) {
276 log_error("No space for '%s' - volume group '%s' "
277 "holds max %d physical volume(s).", pv_name,
278 vg->name, vg->max_pv);
282 if (!alloc_pv_segment_whole_pv(mem, pv))
/* VG extent_count is a uint32_t on disk; guard the addition in 64 bits. */
285 if ((uint64_t) vg->extent_count + pv->pe_count > UINT32_MAX) {
286 log_error("Unable to add %s to %s: new extent count (%"
287 PRIu64 ") exceeds limit (%" PRIu32 ").",
289 (uint64_t) vg->extent_count + pv->pe_count,
295 add_pvl_to_vgs(vg, pvl);
296 vg->extent_count += pv->pe_count;
297 vg->free_count += pv->pe_count;
/*
 * _copy_pv - duplicate a physical_volume into pv_to using pool pvmem.
 * Starts with a shallow memcpy, then deep-copies the pool-backed members
 * (vg_name string, tag list, segment list) so pv_to does not share them
 * with pv_from.
 */
302 static int _copy_pv(struct dm_pool *pvmem,
303 struct physical_volume *pv_to,
304 struct physical_volume *pv_from)
306 memcpy(pv_to, pv_from, sizeof(*pv_to));
308 if (!(pv_to->vg_name = dm_pool_strdup(pvmem, pv_from->vg_name)))
311 if (!str_list_dup(pvmem, &pv_to->tags, &pv_from->tags))
314 if (!peg_dup(pvmem, &pv_to->segments, &pv_from->segments))
/*
 * _copy_pvl - allocate a new pv_list plus embedded PV from pvmem and
 * deep-copy pvl_from into it via _copy_pv().  On copy failure the
 * partially-built allocation is released back to the pool.
 */
320 static struct pv_list *_copy_pvl(struct dm_pool *pvmem, struct pv_list *pvl_from)
322 struct pv_list *pvl_to = NULL;
324 if (!(pvl_to = dm_pool_zalloc(pvmem, sizeof(*pvl_to))))
327 if (!(pvl_to->pv = dm_pool_alloc(pvmem, sizeof(*pvl_to->pv))))
330 if(!_copy_pv(pvmem, pvl_to->pv, pvl_from->pv))
335 dm_pool_free(pvmem, pvl_to);
/*
 * get_pv_from_vg_by_id - read VG metadata and copy out the PV whose id
 * matches pvid into caller-supplied *pv (allocated from cmd->mem).
 * Warns (but proceeds) if the VG metadata is inconsistent.
 */
339 int get_pv_from_vg_by_id(const struct format_type *fmt, const char *vg_name,
340 const char *vgid, const char *pvid,
341 struct physical_volume *pv)
343 struct volume_group *vg;
345 int r = 0, consistent = 0;
347 if (!(vg = vg_read_internal(fmt->cmd, vg_name, vgid, 1, &consistent))) {
348 log_error("get_pv_from_vg_by_id: vg_read_internal failed to read VG %s",
354 log_warn("WARNING: Volume group %s is not consistent",
357 dm_list_iterate_items(pvl, &vg->pvs) {
358 if (id_equal(&pvl->pv->id, (const struct id *) pvid)) {
359 if (!_copy_pv(fmt->cmd->mem, pv, pvl->pv)) {
360 log_error("internal PV duplication failed");
/*
 * move_pv - move one PV (by name) from vg_from to vg_to.
 * Both VGs must be resizeable; the extent and free counters of both VGs
 * are adjusted by the moved PV's totals.
 */
373 int move_pv(struct volume_group *vg_from, struct volume_group *vg_to,
376 struct physical_volume *pv;
379 /* FIXME: handle tags */
380 if (!(pvl = find_pv_in_vg(vg_from, pv_name))) {
381 log_error("Physical volume %s not in volume group %s",
382 pv_name, vg_from->name);
386 if (_vg_bad_status_bits(vg_from, RESIZEABLE_VG) ||
387 _vg_bad_status_bits(vg_to, RESIZEABLE_VG))
390 del_pvl_from_vgs(vg_from, pvl);
391 add_pvl_to_vgs(vg_to, pvl);
395 vg_from->extent_count -= pv_pe_count(pv);
396 vg_to->extent_count += pv_pe_count(pv);
/* Only unallocated extents count towards free space. */
398 vg_from->free_count -= pv_pe_count(pv) - pv_pe_alloc_count(pv);
399 vg_to->free_count += pv_pe_count(pv) - pv_pe_alloc_count(pv);
/*
 * move_pvs_used_by_lv - move every PV backing the named LV from vg_from
 * to vg_to, recursing through log LVs and stacked (AREA_LV) segment
 * areas.  Both VGs must be resizeable.
 */
404 int move_pvs_used_by_lv(struct volume_group *vg_from,
405 struct volume_group *vg_to,
408 struct lv_segment *lvseg;
411 struct logical_volume *lv;
413 /* FIXME: handle tags */
414 if (!(lvl = find_lv_in_vg(vg_from, lv_name))) {
415 log_error("Logical volume %s not in volume group %s",
416 lv_name, vg_from->name);
420 if (_vg_bad_status_bits(vg_from, RESIZEABLE_VG) ||
421 _vg_bad_status_bits(vg_to, RESIZEABLE_VG))
424 dm_list_iterate_items(lvseg, &lvl->lv->segments) {
/* Mirror log LV (if any) is moved first, recursively. */
426 if (!move_pvs_used_by_lv(vg_from, vg_to,
427 lvseg->log_lv->name))
429 for (s = 0; s < lvseg->area_count; s++) {
430 if (seg_type(lvseg, s) == AREA_PV) {
431 if (!move_pv(vg_from, vg_to,
432 pv_dev_name(seg_pv(lvseg, s))))
434 } else if (seg_type(lvseg, s) == AREA_LV) {
435 lv = seg_lv(lvseg, s);
436 if (!move_pvs_used_by_lv(vg_from, vg_to,
/*
 * validate_new_vg_name - check that vg_name is a legal LVM name and that
 * no path <dev_dir><vg_name> already exists in the filesystem.
 */
445 static int validate_new_vg_name(struct cmd_context *cmd, const char *vg_name)
447 char vg_path[PATH_MAX];
449 if (!validate_name(vg_name))
452 snprintf(vg_path, PATH_MAX, "%s%s", cmd->dev_dir, vg_name);
453 if (path_exists(vg_path)) {
454 log_error("%s: already exists in filesystem", vg_path);
/*
 * validate_vg_rename_params - sanity-check a VG rename: the new name
 * must fit within NAME_LEN including the dev_dir prefix, be a valid new
 * VG name, and differ from the old name.
 */
461 int validate_vg_rename_params(struct cmd_context *cmd,
462 const char *vg_name_old,
463 const char *vg_name_new)
468 dev_dir = cmd->dev_dir;
469 length = strlen(dev_dir);
471 /* Check sanity of new name */
472 if (strlen(vg_name_new) > NAME_LEN - length - 2) {
473 log_error("New volume group path exceeds maximum length "
474 "of %d!", NAME_LEN - length - 2);
478 if (!validate_new_vg_name(cmd, vg_name_new)) {
479 log_error("New volume group name \"%s\" is invalid",
484 if (!strcmp(vg_name_old, vg_name_new)) {
485 log_error("Old and new volume group names must differ");
/*
 * vg_rename - rename an in-memory VG: remember the old name in
 * vg->old_name, duplicate new_name into vg->name, and update every
 * member PV's vg_name to match.  Allocations come from the VG pool.
 */
492 int vg_rename(struct cmd_context *cmd, struct volume_group *vg,
493 const char *new_name)
495 struct dm_pool *mem = vg->vgmem;
498 vg->old_name = vg->name;
500 if (!(vg->name = dm_pool_strdup(mem, new_name))) {
501 log_error("vg->name allocation failed for '%s'", new_name);
505 dm_list_iterate_items(pvl, &vg->pvs) {
506 if (!(pvl->pv->vg_name = dm_pool_strdup(mem, new_name))) {
507 log_error("pv->vg_name allocation failed for '%s'",
508 pv_dev_name(pvl->pv));
/*
 * remove_lvs_in_vg - repeatedly remove the first LV (with its
 * dependencies) until the VG's LV list is empty.  Iterating via
 * dm_list_first is safe because each removal unlinks entries.
 */
516 int remove_lvs_in_vg(struct cmd_context *cmd,
517 struct volume_group *vg,
523 while ((lst = dm_list_first(&vg->lvs))) {
524 lvl = dm_list_item(lst, struct lv_list);
525 if (!lv_remove_with_dependencies(cmd, lvl->lv, force, 0))
/*
 * vg_remove_check - verify a VG may be removed: it must have been read
 * without error, have no missing PVs, not be exported, and contain no
 * visible logical volumes.
 */
532 int vg_remove_check(struct volume_group *vg)
536 if (vg_read_error(vg) || vg_missing_pv_count(vg)) {
537 log_error("Volume group \"%s\" not found, is inconsistent "
538 "or has PVs missing.", vg ? vg->name : "");
539 log_error("Consider vgreduce --removemissing if metadata "
544 if (!vg_check_status(vg, EXPORTED_VG))
547 lv_count = vg_visible_lvs(vg);
550 log_error("Volume group \"%s\" still contains %u "
551 "logical volume(s)", vg->name, lv_count);
/*
 * vg_remove_pvs - detach every PV from the VG and park the pv_list
 * entries on vg->removed_pvs (processed later by vg_remove()).
 * Safe iteration is required because entries are unlinked in the loop.
 */
561 void vg_remove_pvs(struct volume_group *vg)
563 struct pv_list *pvl, *tpvl;
565 dm_list_iterate_items_safe(pvl, tpvl, &vg->pvs) {
566 del_pvl_from_vgs(vg, pvl);
567 dm_list_add(&vg->removed_pvs, &pvl->list);
/*
 * vg_remove - destroy a VG on disk: under the orphan-PV lock, remove the
 * VG's metadata areas, then re-initialize each PV on vg->removed_pvs as
 * an allocatable orphan (skipping missing PVs) and remove the backup.
 * The orphan lock is released on every path shown here.
 */
571 int vg_remove(struct volume_group *vg)
573 struct physical_volume *pv;
577 if (!lock_vol(vg->cmd, VG_ORPHANS, LCK_VG_WRITE)) {
578 log_error("Can't get lock for orphan PVs");
582 if (!vg_remove_mdas(vg)) {
583 log_error("vg_remove_mdas %s failed", vg->name);
584 unlock_vg(vg->cmd, VG_ORPHANS);
588 /* init physical volumes */
589 dm_list_iterate_items(pvl, &vg->removed_pvs) {
591 if (is_missing_pv(pv))
594 log_verbose("Removing physical volume \"%s\" from "
595 "volume group \"%s\"", pv_dev_name(pv), vg->name);
596 pv->vg_name = vg->fid->fmt->orphan_vg_name;
597 pv->status = ALLOCATABLE_PV;
599 if (!dev_get_size(pv_dev(pv), &pv->size)) {
600 log_error("%s: Couldn't get size.", pv_dev_name(pv));
605 /* FIXME Write to same sector label was read from */
606 if (!pv_write(vg->cmd, pv, NULL, INT64_C(-1))) {
607 log_error("Failed to remove physical volume \"%s\""
608 " from volume group \"%s\"",
609 pv_dev_name(pv), vg->name);
614 backup_remove(vg->cmd, vg->name);
617 log_print("Volume group \"%s\" successfully removed", vg->name);
619 log_error("Volume group \"%s\" not properly removed", vg->name);
621 unlock_vg(vg->cmd, VG_ORPHANS);
626 * Extend a VG by a single PV / device path
629 * - vg: handle of volume group to extend by 'pv_name'
630 * - pv_name: device path of PV to add to VG
631 * - pp: parameters to pass to implicit pvcreate; if NULL, do not pvcreate
/*
 * vg_extend_single_pv - add one device path to the VG.  An existing PV
 * is looked up by path; if absent and pp is non-NULL the device is
 * implicitly pvcreated first; then the PV is attached via add_pv_to_vg().
 */
634 static int vg_extend_single_pv(struct volume_group *vg, char *pv_name,
635 struct pvcreate_params *pp)
637 struct physical_volume *pv;
639 pv = pv_by_path(vg->fid->fmt->cmd, pv_name);
641 log_error("%s not identified as an existing "
642 "physical volume", pv_name);
644 } else if (!pv && pp) {
645 pv = pvcreate_single(vg->cmd, pv_name, pp);
649 if (!add_pv_to_vg(vg, pv_name, pv))
655 * Extend a VG by a single PV / device path
658 * - vg: handle of volume group to extend by 'pv_name'
659 * - pv_count: count of device paths of PVs
660 * - pv_names: device paths of PVs to add to VG
661 * - pp: parameters to pass to implicit pvcreate; if NULL, do not pvcreate
/*
 * vg_extend - add pv_count device paths to a resizeable VG, unescaping
 * ':' and '@' in each name before handing it to vg_extend_single_pv().
 */
664 int vg_extend(struct volume_group *vg, int pv_count, char **pv_names,
665 struct pvcreate_params *pp)
669 if (_vg_bad_status_bits(vg, RESIZEABLE_VG))
673 for (i = 0; i < pv_count; i++) {
674 unescape_colons_and_at_signs(pv_names[i], NULL, NULL);
675 if (!vg_extend_single_pv(vg, pv_names[i], pp))
679 /* FIXME Decide whether to initialise and add new mdahs to format instance */
684 log_error("Unable to add physical volume '%s' to "
685 "volume group '%s'.", pv_names[i], vg->name);
689 /* FIXME: use this inside vgreduce_single? */
/*
 * vg_reduce - detach a PV (by name) from a resizeable VG.  The PV must
 * have no allocated extents; its entry moves to vg->removed_pvs and the
 * VG's free/extent counters are decreased accordingly.
 */
690 int vg_reduce(struct volume_group *vg, char *pv_name)
692 struct physical_volume *pv;
695 if (_vg_bad_status_bits(vg, RESIZEABLE_VG))
702 if (!(pvl = find_pv_in_vg(vg, pv_name))) {
703 log_error("Physical volume %s not in volume group %s.",
710 if (pv_pe_alloc_count(pv)) {
711 log_error("Physical volume %s still in use.",
716 if (!dev_get_size(pv_dev(pv), &pv->size)) {
717 log_error("%s: Couldn't get size.", pv_name);
721 vg->free_count -= pv_pe_count(pv) - pv_pe_alloc_count(pv);
722 vg->extent_count -= pv_pe_count(pv);
723 del_pvl_from_vgs(vg, pvl);
725 /* add pv to the remove_pvs list */
726 dm_list_add(&vg->removed_pvs, &pvl->list);
731 log_error("Unable to remove physical volume '%s' from "
732 "volume group '%s'.", pv_name, vg->name);
/*
 * lv_change_tag - add (add_tag != 0) or remove a tag on an LV.  The
 * metadata format must support tags (FMT_TAGS); added tags are
 * duplicated into the VG pool before being linked onto lv->tags.
 */
736 int lv_change_tag(struct logical_volume *lv, const char *tag, int add_tag)
740 if (!(lv->vg->fid->fmt->features & FMT_TAGS)) {
741 log_error("Logical volume %s/%s does not support tags",
742 lv->vg->name, lv->name);
747 if (!(tag_new = dm_pool_strdup(lv->vg->vgmem, tag))) {
748 log_error("Failed to duplicate tag %s from %s/%s",
749 tag, lv->vg->name, lv->name);
752 if (!str_list_add(lv->vg->vgmem, &lv->tags, tag_new)) {
753 log_error("Failed to add tag %s to %s/%s",
754 tag, lv->vg->name, lv->name);
758 if (!str_list_del(&lv->tags, tag)) {
759 log_error("Failed to remove tag %s from %s/%s",
760 tag, lv->vg->name, lv->name);
/*
 * vg_change_tag - add (add_tag != 0) or remove a tag on a VG; mirrors
 * lv_change_tag() but operates on vg->tags.  Requires FMT_TAGS support.
 */
767 int vg_change_tag(struct volume_group *vg, const char *tag, int add_tag)
771 if (!(vg->fid->fmt->features & FMT_TAGS)) {
772 log_error("Volume group %s does not support tags", vg->name);
777 if (!(tag_new = dm_pool_strdup(vg->vgmem, tag))) {
778 log_error("Failed to duplicate tag %s from %s",
782 if (!str_list_add(vg->vgmem, &vg->tags, tag_new)) {
783 log_error("Failed to add tag %s to volume group %s",
788 if (!str_list_del(&vg->tags, tag)) {
789 log_error("Failed to remove tag %s from volume group "
790 "%s", tag, vg->name);
/*
 * strip_dir - return vg_name with a leading dev_dir prefix removed.
 * NOTE(review): elided listing — the line advancing vg_name past the
 * prefix and the return are missing from this view.
 */
797 const char *strip_dir(const char *vg_name, const char *dev_dir)
799 size_t len = strlen(dev_dir);
800 if (!strncmp(vg_name, dev_dir, len))
807 * Validate parameters to vg_create() before calling.
808 * FIXME: Move inside vg_create library function.
809 * FIXME: Change vgcreate_params struct to individual gets/sets
/*
 * vgcreate_params_validate - check vgcreate parameters before use: name
 * validity, a concrete allocation policy (inherit is meaningless for a
 * VG), a nonzero extent size, and max_lv/max_pv <= 255 for formats
 * without FMT_UNLIMITED_VOLS.
 */
811 int vgcreate_params_validate(struct cmd_context *cmd,
812 struct vgcreate_params *vp)
814 if (!validate_new_vg_name(cmd, vp->vg_name)) {
815 log_error("New volume group name \"%s\" is invalid",
820 if (vp->alloc == ALLOC_INHERIT) {
821 log_error("Volume Group allocation policy cannot inherit "
826 if (!vp->extent_size) {
827 log_error("Physical extent size may not be zero");
831 if (!(cmd->fmt->features & FMT_UNLIMITED_VOLS)) {
836 if (vp->max_lv > 255 || vp->max_pv > 255) {
837 log_error("Number of volumes may not exceed 255");
846 * Create a (struct volume_group) volume group handle from a struct volume_group pointer and a
847 * possible failure code or zero for success.
/*
 * _vg_make_handle - wrap a VG pointer (or allocate an empty one in a
 * fresh pool when vg is NULL) and record the failure code in
 * vg->read_status so callers can test it with vg_read_error().
 */
849 static struct volume_group *_vg_make_handle(struct cmd_context *cmd,
850 struct volume_group *vg,
853 struct dm_pool *vgmem;
856 if (!(vgmem = dm_pool_create("lvm2 vg_handle", VG_MEMPOOL_CHUNK)) ||
857 !(vg = dm_pool_zalloc(vgmem, sizeof(*vg)))) {
858 log_error("Error allocating vg handle.");
860 dm_pool_destroy(vgmem);
866 vg->read_status = failure;
868 return (struct volume_group *)vg;
/*
 * lv_has_unknown_segments - nonzero if any segment of the LV has an
 * unrecognized segment type (seg_unknown).
 */
871 int lv_has_unknown_segments(const struct logical_volume *lv)
873 struct lv_segment *seg;
874 /* foreach segment */
875 dm_list_iterate_items(seg, &lv->segments)
876 if (seg_unknown(seg))
/*
 * vg_has_unknown_segments - nonzero if any LV in the VG contains an
 * unknown segment type (delegates to lv_has_unknown_segments).
 */
881 int vg_has_unknown_segments(const struct volume_group *vg)
886 dm_list_iterate_items(lvl, &vg->lvs)
887 if (lv_has_unknown_segments(lvl->lv))
893 * Create a VG with default parameters.
895 * - struct volume_group* with SUCCESS code: VG structure created
896 * - NULL or struct volume_group* with FAILED_* code: error creating VG structure
897 * Use vg_read_error() to determine success or failure.
898 * FIXME: cleanup usage of _vg_make_handle()
/*
 * vg_create - create a new in-memory VG with default parameters: lock
 * the new name, refuse names already in use, allocate the VG in its own
 * pool, generate a uuid, apply defaults (extent size, max LV/PV, alloc
 * policy, mda copies), initialize its lists, and create the format
 * instance.  Result is returned through _vg_make_handle(); use
 * vg_read_error() to check it.
 */
900 struct volume_group *vg_create(struct cmd_context *cmd, const char *vg_name)
902 struct volume_group *vg;
907 if (!validate_name(vg_name)) {
908 log_error("Invalid vg name %s", vg_name);
909 /* FIXME: use _vg_make_handle() w/proper error code */
913 rc = vg_lock_newname(cmd, vg_name);
915 /* NOTE: let caller decide - this may be check for existence */
916 return _vg_make_handle(cmd, NULL, rc);
918 /* FIXME: Is this vg_read_internal necessary? Move it inside
920 /* is this vg name already in use ? */
921 if ((vg = vg_read_internal(cmd, vg_name, NULL, 1, &consistent))) {
922 log_error("A volume group called '%s' already exists.", vg_name);
923 unlock_and_free_vg(cmd, vg, vg_name);
924 return _vg_make_handle(cmd, NULL, FAILED_EXIST);
927 if (!(mem = dm_pool_create("lvm2 vg_create", VG_MEMPOOL_CHUNK)))
930 if (!(vg = dm_pool_zalloc(mem, sizeof(*vg))))
933 if (!id_create(&vg->id)) {
934 log_error("Couldn't create uuid for volume group '%s'.",
939 /* Strip dev_dir if present */
940 vg_name = strip_dir(vg_name, cmd->dev_dir);
945 if (!(vg->name = dm_pool_strdup(mem, vg_name)))
950 vg->status = (RESIZEABLE_VG | LVM_READ | LVM_WRITE);
951 if (!(vg->system_id = dm_pool_alloc(mem, NAME_LEN)))
954 *vg->system_id = '\0';
/* extent_size is in 512-byte sectors; DEFAULT_EXTENT_SIZE is in KiB. */
956 vg->extent_size = DEFAULT_EXTENT_SIZE * 2;
957 vg->extent_count = 0;
960 vg->max_lv = DEFAULT_MAX_LV;
961 vg->max_pv = DEFAULT_MAX_PV;
963 vg->alloc = DEFAULT_ALLOC_POLICY;
964 vg->mda_copies = DEFAULT_VGMETADATACOPIES;
967 dm_list_init(&vg->pvs);
969 dm_list_init(&vg->lvs);
971 dm_list_init(&vg->tags);
973 /* initialize removed_pvs list */
974 dm_list_init(&vg->removed_pvs);
976 if (!(vg->fid = cmd->fmt->ops->create_instance(cmd->fmt, vg_name,
978 log_error("Failed to create format instance");
982 if (vg->fid->fmt->ops->vg_setup &&
983 !vg->fid->fmt->ops->vg_setup(vg->fid, vg)) {
984 log_error("Format specific setup of volume group '%s' failed.",
988 return _vg_make_handle(cmd, vg, SUCCESS);
991 unlock_and_free_vg(cmd, vg, vg_name);
992 /* FIXME: use _vg_make_handle() w/proper error code */
/*
 * extents_from_size - convert a size (sectors) into a count of extents
 * of extent_size, rounding the size UP to a whole extent (with a
 * notice).  Fails if the result would exceed UINT32_MAX extents.
 */
996 uint64_t extents_from_size(struct cmd_context *cmd, uint64_t size,
997 uint32_t extent_size)
999 if (size % extent_size) {
1000 size += extent_size - size % extent_size;
1001 log_print("Rounding up size to full physical extent %s",
1002 display_size(cmd, size));
/* 64-bit product avoids overflow in the limit comparison. */
1005 if (size > (uint64_t) UINT32_MAX * extent_size) {
1006 log_error("Volume too large (%s) for extent size %s. "
1007 "Upper limit is %s.",
1008 display_size(cmd, size),
1009 display_size(cmd, (uint64_t) extent_size),
1010 display_size(cmd, (uint64_t) UINT32_MAX *
1015 return (uint64_t) size / extent_size;
1019 * Return random integer in [0,max) interval
1021 * The loop rejects numbers that come from an "incomplete" slice of the
1022 * RAND_MAX space (considering the number space [0, RAND_MAX] is divided
1023 * into some "max"-sized slices and at most a single smaller slice,
1024 * between [n*max, RAND_MAX] for suitable n -- numbers from this last slice
1025 * are discarded because they could distort the distribution in favour of
/*
 * _even_rand - uniform random integer in [0, max) using rand_r(seed),
 * with rejection sampling (the while condition discards values from the
 * final partial slice of [0, RAND_MAX]) to avoid modulo bias.
 */
1028 static unsigned _even_rand( unsigned *seed, unsigned max )
1032 /* make sure distribution is even */
1034 r = (unsigned) rand_r( seed );
1036 } while ( r - ret > RAND_MAX - max );
/*
 * _bitset_with_random_bits - return a bitset of num_bits bits with
 * exactly num_set_bits set uniformly at random (reservoir-style: each
 * pass selects one bit among the first i, falling back to bit i-1 when
 * the chosen bit is already set).  Also accumulates a debug string of
 * the selected bit indices in a pool object.
 */
1041 static dm_bitset_t _bitset_with_random_bits(struct dm_pool *mem, uint32_t num_bits,
1042 uint32_t num_set_bits, unsigned *seed)
1045 unsigned bit_selected;
1047 uint32_t i = num_bits - num_set_bits;
1049 if (!(bs = dm_bitset_create(mem, (unsigned) num_bits))) {
1050 log_error("Failed to allocate bitset for setting random bits.");
1054 if (!dm_pool_begin_object(mem, 512)) {
1055 log_error("dm_pool_begin_object failed for random list of bits.");
1056 dm_pool_free(mem, bs);
1060 /* Perform loop num_set_bits times, selecting one bit each time */
1061 while (i++ < num_bits) {
1062 /* Select a random bit between 0 and (i-1) inclusive. */
1063 bit_selected = _even_rand(seed, i);
1066 * If the bit was already set, set the new bit that became
1067 * choosable for the first time during this pass.
1068 * This maintains a uniform probability distribution by compensating
1069 * for being unable to select it until this pass.
1071 if (dm_bit(bs, bit_selected))
1072 bit_selected = i - 1;
1074 dm_bit_set(bs, bit_selected);
1076 if (dm_snprintf(buf, sizeof(buf), "%u ", bit_selected) < 0) {
1077 log_error("snprintf random bit failed.");
1078 dm_pool_free(mem, bs);
1081 if (!dm_pool_grow_object(mem, buf, strlen(buf))) {
1082 log_error("Failed to generate list of random bits.");
1083 dm_pool_free(mem, bs);
1088 log_debug("Selected %" PRIu32 " random bits from %" PRIu32 ": %s", num_set_bits, num_bits, (char *) dm_pool_end_object(mem));
/*
 * _vg_ignore_mdas - mark num_to_ignore randomly chosen in-use metadata
 * areas as ignored, using a random bitset over the currently used mdas.
 * Failing to find enough candidates is an internal error.
 */
1093 static int _vg_ignore_mdas(struct volume_group *vg, uint32_t num_to_ignore)
1095 struct metadata_area *mda;
1096 uint32_t mda_used_count = vg_mda_used_count(vg);
1097 dm_bitset_t mda_to_ignore_bs;
1100 log_debug("Adjusting ignored mdas for %s: %" PRIu32 " of %" PRIu32 " mdas in use "
1101 "but %" PRIu32 " required. Changing %" PRIu32 " mda.",
1102 vg->name, mda_used_count, vg_mda_count(vg), vg_mda_copies(vg), num_to_ignore);
1107 if (!(mda_to_ignore_bs = _bitset_with_random_bits(vg->vgmem, mda_used_count,
1108 num_to_ignore, &vg->cmd->rand_seed)))
/* Walk in-use mdas; comma operator decrements the index before testing the bit. */
1111 dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use)
1112 if (!mda_is_ignored(mda) && (--mda_used_count,
1113 dm_bit(mda_to_ignore_bs, mda_used_count))) {
1114 mda_set_ignored(mda, 1);
1115 if (!--num_to_ignore)
1119 log_error(INTERNAL_ERROR "Unable to find %"PRIu32" metadata areas to ignore "
1120 "on volume group %s", num_to_ignore, vg->name);
1125 dm_pool_free(vg->vgmem, mda_to_ignore_bs);
/*
 * _vg_unignore_mdas - clear the ignored flag on num_to_unignore randomly
 * chosen free (ignored) metadata areas.  Candidates on the ignored list
 * are also moved onto metadata_areas_in_use; a second pass covers
 * ignored mdas that already sit on the in-use list.
 */
1129 static int _vg_unignore_mdas(struct volume_group *vg, uint32_t num_to_unignore)
1131 struct metadata_area *mda, *tmda;
1132 uint32_t mda_used_count = vg_mda_used_count(vg);
1133 uint32_t mda_count = vg_mda_count(vg);
1134 uint32_t mda_free_count = mda_count - mda_used_count;
1135 dm_bitset_t mda_to_unignore_bs;
1138 if (!num_to_unignore)
1141 log_debug("Adjusting ignored mdas for %s: %" PRIu32 " of %" PRIu32 " mdas in use "
1142 "but %" PRIu32 " required. Changing %" PRIu32 " mda.",
1143 vg->name, mda_used_count, mda_count, vg_mda_copies(vg), num_to_unignore);
1145 if (!(mda_to_unignore_bs = _bitset_with_random_bits(vg->vgmem, mda_free_count,
1146 num_to_unignore, &vg->cmd->rand_seed)))
/* Safe iteration: selected entries are moved between lists. */
1149 dm_list_iterate_items_safe(mda, tmda, &vg->fid->metadata_areas_ignored)
1150 if (mda_is_ignored(mda) && (--mda_free_count,
1151 dm_bit(mda_to_unignore_bs, mda_free_count))) {
1152 mda_set_ignored(mda, 0);
1153 dm_list_move(&vg->fid->metadata_areas_in_use,
1155 if (!--num_to_unignore)
1159 dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use)
1160 if (mda_is_ignored(mda) && (--mda_free_count,
1161 dm_bit(mda_to_unignore_bs, mda_free_count))) {
1162 mda_set_ignored(mda, 0);
1163 if (!--num_to_unignore)
1167 log_error(INTERNAL_ERROR "Unable to find %"PRIu32" metadata areas to unignore "
1168 "on volume group %s", num_to_unignore, vg->name);
1173 dm_pool_free(vg->vgmem, mda_to_unignore_bs);
/*
 * _vg_adjust_ignored_mdas - balance ignored/in-use metadata areas to
 * match the vg->mda_copies policy: unmanaged keeps at least one in use;
 * ALL (or a target >= total) unignores everything; otherwise mdas are
 * ignored or unignored to hit the requested copy count.
 */
1177 static int _vg_adjust_ignored_mdas(struct volume_group *vg)
1179 uint32_t mda_copies_used = vg_mda_used_count(vg);
1181 if (vg->mda_copies == VGMETADATACOPIES_UNMANAGED) {
1182 /* Ensure at least one mda is in use. */
1183 if (!mda_copies_used && vg_mda_count(vg) && !_vg_unignore_mdas(vg, 1))
1190 /* Not an error to have vg_mda_count larger than total mdas. */
1191 if (vg->mda_copies == VGMETADATACOPIES_ALL ||
1192 vg->mda_copies >= vg_mda_count(vg)) {
1194 if (!_vg_unignore_mdas(vg, vg_mda_count(vg) - mda_copies_used))
1196 } else if (mda_copies_used < vg->mda_copies) {
1197 if (!_vg_unignore_mdas(vg, vg->mda_copies - mda_copies_used))
1199 } else if (mda_copies_used > vg->mda_copies)
1200 if (!_vg_ignore_mdas(vg, mda_copies_used - vg->mda_copies))
1204 * The VGMETADATACOPIES_ALL value will never be written to disk.
1205 * It is a special cmdline value that means 2 things:
1206 * 1. clear all ignore bits in all mdas in this vg
1207 * 2. set the "unmanaged" policy going forward for metadata balancing
1209 if (vg->mda_copies == VGMETADATACOPIES_ALL)
1210 vg->mda_copies = VGMETADATACOPIES_UNMANAGED;
/*
 * find_min_mda_size - smallest mda size (in sectors) over the list, as
 * reported by each mda's mda_total_sectors op.  An empty result
 * (nothing measurable) collapses to 0.
 */
1215 uint64_t find_min_mda_size(struct dm_list *mdas)
1217 uint64_t min_mda_size = UINT64_MAX, mda_size;
1218 struct metadata_area *mda;
1220 dm_list_iterate_items(mda, mdas) {
1221 if (!mda->ops->mda_total_sectors)
1223 mda_size = mda->ops->mda_total_sectors(mda);
1224 if (mda_size < min_mda_size)
1225 min_mda_size = mda_size;
1228 if (min_mda_size == UINT64_MAX)
1229 min_mda_size = UINT64_C(0);
1231 return min_mda_size;
/*
 * _move_mdas - after a VG split, move each mda in mdas_from that no
 * longer belongs to vg_from over to mdas_to; when the destination is the
 * orphan VG the mda is simply dropped from the list instead.
 */
1234 static int _move_mdas(struct volume_group *vg_from, struct volume_group *vg_to,
1235 struct dm_list *mdas_from, struct dm_list *mdas_to)
1237 struct metadata_area *mda, *mda2;
1240 dm_list_iterate_items_safe(mda, mda2, mdas_from) {
1241 if (!mda->ops->mda_in_vg) {
1246 if (!mda->ops->mda_in_vg(vg_from->fid, vg_from, mda)) {
1247 if (is_orphan_vg(vg_to->name))
1248 dm_list_del(&mda->list);
1250 dm_list_move(mdas_to, &mda->list);
1257 * Separate metadata areas after splitting a VG.
1258 * Also accepts orphan VG as destination (for vgreduce).
/*
 * vg_split_mdas - separate in-use and ignored metadata areas between the
 * two VGs after a split (orphan VG accepted as destination, for
 * vgreduce).  Fails if either side would end up with no mdas at all
 * (destination exempt when it is the orphan VG).
 */
1260 int vg_split_mdas(struct cmd_context *cmd __attribute__((unused)),
1261 struct volume_group *vg_from, struct volume_group *vg_to)
1263 struct dm_list *mdas_from_in_use, *mdas_to_in_use;
1264 struct dm_list *mdas_from_ignored, *mdas_to_ignored;
1267 mdas_from_in_use = &vg_from->fid->metadata_areas_in_use;
1268 mdas_from_ignored = &vg_from->fid->metadata_areas_ignored;
1269 mdas_to_in_use = &vg_to->fid->metadata_areas_in_use;
1270 mdas_to_ignored = &vg_to->fid->metadata_areas_ignored;
/* NOTE(review): the first common_mda result is overwritten by the second
 * call — looks intentional here but confirm against the full source. */
1272 common_mda = _move_mdas(vg_from, vg_to,
1273 mdas_from_in_use, mdas_to_in_use);
1274 common_mda = _move_mdas(vg_from, vg_to,
1275 mdas_from_ignored, mdas_to_ignored);
1277 if ((dm_list_empty(mdas_from_in_use) &&
1278 dm_list_empty(mdas_from_ignored)) ||
1279 ((!is_orphan_vg(vg_to->name) &&
1280 dm_list_empty(mdas_to_in_use) &&
1281 dm_list_empty(mdas_to_ignored))))
/*
 * _wipe_sb - detect a foreign signature (md superblock, swap, LUKS) on
 * dev via the supplied detector callback and, after prompting unless
 * --yes/--force, zero wipe_len bytes at the signature offset.
 */
1287 static int _wipe_sb(struct device *dev, const char *type, const char *name,
1288 int wipe_len, struct pvcreate_params *pp,
1289 int (*func)(struct device *dev, uint64_t *signature))
1292 uint64_t superblock;
1294 wipe = func(dev, &superblock);
1296 log_error("Fatal error while trying to detect %s on %s.",
1304 /* Specifying --yes => do not ask. */
1305 if (!pp->yes && (pp->force == PROMPT) &&
1306 yes_no_prompt("WARNING: %s detected on %s. Wipe it? [y/n] ",
1307 type, name) != 'y') {
1308 log_error("Aborting pvcreate on %s.", name);
1312 log_print("Wiping %s on %s.", type, name);
1313 if (!dev_set(dev, superblock, wipe_len, 0)) {
1314 log_error("Failed to wipe %s on %s.", type, name);
1322 * See if we may pvcreate on this device.
1323 * 0 indicates we may not.
/*
 * pvcreate_check - decide whether pvcreate may proceed on this device.
 * Re-reads any existing PV label (scanning all VGs for mda-less PVs),
 * refuses non-orphan PVs without -ff, prompts before reinitializing,
 * requires exclusive open, and wipes md/swap/LUKS signatures.
 * Returns 0 when pvcreate must not proceed.
 */
1325 static int pvcreate_check(struct cmd_context *cmd, const char *name,
1326 struct pvcreate_params *pp)
1328 struct physical_volume *pv;
1330 struct dm_list mdas;
1332 dm_list_init(&mdas);
1334 /* FIXME Check partition type is LVM unless --force is given */
1336 /* Is there a pv here already? */
1337 pv = pv_read(cmd, name, &mdas, NULL, 0, 0);
1340 * If a PV has no MDAs it may appear to be an orphan until the
1341 * metadata is read off another PV in the same VG. Detecting
1342 * this means checking every VG by scanning every PV on the
1345 if (pv && is_orphan(pv) && mdas_empty_or_ignored(&mdas)) {
1346 if (!scan_vgs_for_pvs(cmd, 0))
1348 pv = pv_read(cmd, name, NULL, NULL, 0, 0);
1351 /* Allow partial & exported VGs to be destroyed. */
1352 /* We must have -ff to overwrite a non orphan */
1353 if (pv && !is_orphan(pv) && pp->force != DONT_PROMPT_OVERRIDE) {
1354 log_error("Can't initialize physical volume \"%s\" of "
1355 "volume group \"%s\" without -ff", name, pv_vg_name(pv));
1360 if (pv && !is_orphan(pv) && !pp->yes &&
1361 yes_no_prompt(_really_init, name, pv_vg_name(pv)) == 'n') {
1362 log_error("%s: physical volume not initialized", name);
1366 if (sigint_caught())
1369 dev = dev_cache_get(name, cmd->filter);
1371 /* Is there an md superblock here? */
1372 /* FIXME: still possible issues here - rescan cache? */
/* Temporarily disable md filtering so an md component device can be found. */
1373 if (!dev && md_filtering()) {
1374 refresh_filters(cmd);
1375 init_md_filtering(0);
1376 dev = dev_cache_get(name, cmd->filter);
1377 init_md_filtering(1);
1381 log_error("Device %s not found (or ignored by filtering).", name);
1386 * This test will fail if the device belongs to an MD array.
1388 if (!dev_test_excl(dev)) {
1389 /* FIXME Detect whether device-mapper itself is still using it */
1390 log_error("Can't open %s exclusively. Mounted filesystem?",
1395 if (!_wipe_sb(dev, "software RAID md superblock", name, 4, pp, dev_is_md))
1398 if (!_wipe_sb(dev, "swap signature", name, 10, pp, dev_is_swap))
1401 if (!_wipe_sb(dev, "LUKS signature", name, 8, pp, dev_is_luks))
1404 if (sigint_caught())
1407 if (pv && !is_orphan(pv) && pp->force) {
1408 log_warn("WARNING: Forcing physical volume creation on "
1410 !is_orphan(pv) ? " of volume group \"" : "",
1411 !is_orphan(pv) ? pv_vg_name(pv) : "",
1412 !is_orphan(pv) ? "\"" : "");
1418 void pvcreate_params_set_defaults(struct pvcreate_params *pp)
1420 memset(pp, 0, sizeof(*pp));
1423 pp->data_alignment = UINT64_C(0);
1424 pp->data_alignment_offset = UINT64_C(0);
1425 pp->pvmetadatacopies = DEFAULT_PVMETADATACOPIES;
1426 pp->pvmetadatasize = DEFAULT_PVMETADATASIZE;
1427 pp->labelsector = DEFAULT_LABELSECTOR;
1430 pp->extent_count = 0;
1431 pp->extent_size = 0;
1432 pp->restorefile = 0;
1435 pp->metadataignore = DEFAULT_PVMETADATAIGNORE;
1439 * pvcreate_single() - initialize a device with PV label and metadata area
1442 * - pv_name: device path to initialize
1443 * - pp: parameters to pass to pv_create; if NULL, use default values
1447 * struct physical_volume * (non-NULL): handle to physical volume created
1449 struct physical_volume * pvcreate_single(struct cmd_context *cmd,
1450 const char *pv_name,
1451 struct pvcreate_params *pp)
1453 struct physical_volume *pv;
1455 struct dm_list mdas;
1456 struct pvcreate_params default_pp;
1457 char buffer[64] __attribute__((aligned(8)));
1459 pvcreate_params_set_defaults(&default_pp);
1464 if ((dev = device_from_pvid(cmd, pp->idp, NULL)) &&
1465 (dev != dev_cache_get(pv_name, cmd->filter))) {
1466 if (!id_write_format((const struct id*)&pp->idp->uuid,
1467 buffer, sizeof(buffer)))
1469 log_error("uuid %s already in use on \"%s\"", buffer,
1475 if (!pvcreate_check(cmd, pv_name, pp))
1478 if (sigint_caught())
1481 if (!(dev = dev_cache_get(pv_name, cmd->filter))) {
1482 log_error("%s: Couldn't find device. Check your filters?",
1487 dm_list_init(&mdas);
1488 if (!(pv = pv_create(cmd, dev, pp->idp, pp->size,
1489 pp->data_alignment, pp->data_alignment_offset,
1490 pp->pe_start, pp->extent_count, pp->extent_size,
1491 pp->pvmetadatacopies, pp->pvmetadatasize,
1492 pp->metadataignore, &mdas))) {
1493 log_error("Failed to setup physical volume \"%s\"", pv_name);
1497 log_verbose("Set up physical volume for \"%s\" with %" PRIu64
1498 " available sectors", pv_name, pv_size(pv));
1500 /* Wipe existing label first */
1501 if (!label_remove(pv_dev(pv))) {
1502 log_error("Failed to wipe existing label on %s", pv_name);
1507 log_verbose("Zeroing start of device %s", pv_name);
1508 if (!dev_open_quiet(dev)) {
1509 log_error("%s not opened: device not zeroed", pv_name);
1513 if (!dev_set(dev, UINT64_C(0), (size_t) 2048, 0)) {
1514 log_error("%s not wiped: aborting", pv_name);
1521 log_very_verbose("Writing physical volume data to disk \"%s\"",
1524 if (!(pv_write(cmd, pv, &mdas, pp->labelsector))) {
1525 log_error("Failed to write physical volume \"%s\"", pv_name);
1529 log_print("Physical volume \"%s\" successfully created", pv_name);
1537 static void _free_pv(struct dm_pool *mem, struct physical_volume *pv)
1539 dm_pool_free(mem, pv);
1542 static struct physical_volume *_alloc_pv(struct dm_pool *mem, struct device *dev)
1544 struct physical_volume *pv = dm_pool_zalloc(mem, sizeof(*pv));
1552 pv->pe_alloc_count = 0;
1554 pv->pe_align_offset = 0;
1558 pv->status = ALLOCATABLE_PV;
1560 dm_list_init(&pv->tags);
1561 dm_list_init(&pv->segments);
1567 * pv_create - initialize a physical volume for use with a volume group
1570 * @dev: PV device to initialize
1571 * @size: size of the PV in sectors
1572 * @data_alignment: requested alignment of data
1573 * @data_alignment_offset: requested offset to aligned data
1574 * @pe_start: physical extent start
1575 * @existing_extent_count
1576 * @existing_extent_size
1582 * PV handle - physical volume initialized successfully
1583 * NULL - invalid parameter or problem initializing the physical volume
1586 * FIXME: shorten argument list and replace with explict 'set' functions
1588 struct physical_volume *pv_create(const struct cmd_context *cmd,
1590 struct id *id, uint64_t size,
1591 unsigned long data_alignment,
1592 unsigned long data_alignment_offset,
1594 uint32_t existing_extent_count,
1595 uint32_t existing_extent_size,
1596 int pvmetadatacopies, uint64_t pvmetadatasize,
1597 unsigned metadataignore, struct dm_list *mdas)
1599 const struct format_type *fmt = cmd->fmt;
1600 struct dm_pool *mem = fmt->cmd->mem;
1601 struct physical_volume *pv = _alloc_pv(mem, dev);
1607 memcpy(&pv->id, id, sizeof(*id));
1608 else if (!id_create(&pv->id)) {
1609 log_error("Failed to create random uuid for %s.",
1614 if (!dev_get_size(pv->dev, &pv->size)) {
1615 log_error("%s: Couldn't get size.", pv_dev_name(pv));
1620 if (size > pv->size)
1621 log_warn("WARNING: %s: Overriding real size. "
1622 "You could lose data.", pv_dev_name(pv));
1623 log_verbose("%s: Pretending size is %" PRIu64 " sectors.",
1624 pv_dev_name(pv), size);
1628 if (pv->size < PV_MIN_SIZE) {
1629 log_error("%s: Size must exceed minimum of %ld sectors.",
1630 pv_dev_name(pv), PV_MIN_SIZE);
1634 if (pv->size < data_alignment) {
1635 log_error("%s: Data alignment must not exceed device size.",
1641 pv->vg_name = fmt->orphan_vg_name;
1643 if (!fmt->ops->pv_setup(fmt, pe_start, existing_extent_count,
1644 existing_extent_size, data_alignment,
1645 data_alignment_offset,
1646 pvmetadatacopies, pvmetadatasize,
1647 metadataignore, mdas, pv, NULL)) {
1648 log_error("%s: Format-specific setup of physical volume "
1649 "failed.", pv_dev_name(pv));
1660 /* FIXME: liblvm todo - make into function that returns handle */
1661 struct pv_list *find_pv_in_vg(const struct volume_group *vg,
1662 const char *pv_name)
1664 return _find_pv_in_vg(vg, pv_name);
1667 static struct pv_list *_find_pv_in_vg(const struct volume_group *vg,
1668 const char *pv_name)
1670 struct pv_list *pvl;
1672 dm_list_iterate_items(pvl, &vg->pvs)
1673 if (pvl->pv->dev == dev_cache_get(pv_name, vg->cmd->filter))
1679 struct pv_list *find_pv_in_pv_list(const struct dm_list *pl,
1680 const struct physical_volume *pv)
1682 struct pv_list *pvl;
1684 dm_list_iterate_items(pvl, pl)
1691 int pv_is_in_vg(struct volume_group *vg, struct physical_volume *pv)
1693 struct pv_list *pvl;
1695 dm_list_iterate_items(pvl, &vg->pvs)
1702 static struct pv_list *_find_pv_in_vg_by_uuid(const struct volume_group *vg,
1703 const struct id *id)
1705 struct pv_list *pvl;
1707 dm_list_iterate_items(pvl, &vg->pvs)
1708 if (id_equal(&pvl->pv->id, id))
1715 * find_pv_in_vg_by_uuid - Find PV in VG by PV UUID
1716 * @vg: volume group to search
1717 * @id: UUID of the PV to match
1720 * struct pv_list within owning struct volume_group - if UUID of PV found in VG
1721 * NULL - invalid parameter or UUID of PV not found in VG
1724 * FIXME - liblvm todo - make into function that takes VG handle
1726 struct pv_list *find_pv_in_vg_by_uuid(const struct volume_group *vg,
1727 const struct id *id)
1729 return _find_pv_in_vg_by_uuid(vg, id);
1732 struct lv_list *find_lv_in_vg(const struct volume_group *vg,
1733 const char *lv_name)
1735 struct lv_list *lvl;
1738 /* Use last component */
1739 if ((ptr = strrchr(lv_name, '/')))
1744 dm_list_iterate_items(lvl, &vg->lvs)
1745 if (!strcmp(lvl->lv->name, ptr))
1751 struct lv_list *find_lv_in_lv_list(const struct dm_list *ll,
1752 const struct logical_volume *lv)
1754 struct lv_list *lvl;
1756 dm_list_iterate_items(lvl, ll)
1763 struct lv_list *find_lv_in_vg_by_lvid(struct volume_group *vg,
1764 const union lvid *lvid)
1766 struct lv_list *lvl;
1768 dm_list_iterate_items(lvl, &vg->lvs)
1769 if (!strncmp(lvl->lv->lvid.s, lvid->s, sizeof(*lvid)))
1775 struct logical_volume *find_lv(const struct volume_group *vg,
1776 const char *lv_name)
1778 struct lv_list *lvl = find_lv_in_vg(vg, lv_name);
1779 return lvl ? lvl->lv : NULL;
1782 struct physical_volume *find_pv(struct volume_group *vg, struct device *dev)
1784 struct pv_list *pvl;
1786 dm_list_iterate_items(pvl, &vg->pvs)
1787 if (dev == pvl->pv->dev)
1793 /* FIXME: liblvm todo - make into function that returns handle */
1794 struct physical_volume *find_pv_by_name(struct cmd_context *cmd,
1795 const char *pv_name)
1797 return _find_pv_by_name(cmd, pv_name);
1801 static struct physical_volume *_find_pv_by_name(struct cmd_context *cmd,
1802 const char *pv_name)
1804 struct dm_list mdas;
1805 struct physical_volume *pv;
1807 dm_list_init(&mdas);
1808 if (!(pv = _pv_read(cmd, cmd->mem, pv_name, &mdas, NULL, 1, 0))) {
1809 log_error("Physical volume %s not found", pv_name);
1813 if (is_orphan_vg(pv->vg_name) && mdas_empty_or_ignored(&mdas)) {
1814 /* If a PV has no MDAs - need to search all VGs for it */
1815 if (!scan_vgs_for_pvs(cmd, 1))
1817 if (!(pv = _pv_read(cmd, cmd->mem, pv_name, NULL, NULL, 1, 0))) {
1818 log_error("Physical volume %s not found", pv_name);
1823 if (is_orphan_vg(pv->vg_name)) {
1824 log_error("Physical volume %s not in a volume group", pv_name);
1831 /* Find segment at a given logical extent in an LV */
1832 struct lv_segment *find_seg_by_le(const struct logical_volume *lv, uint32_t le)
1834 struct lv_segment *seg;
1836 dm_list_iterate_items(seg, &lv->segments)
1837 if (le >= seg->le && le < seg->le + seg->len)
1843 struct lv_segment *first_seg(const struct logical_volume *lv)
1845 struct lv_segment *seg;
1847 dm_list_iterate_items(seg, &lv->segments)
1853 int vg_remove_mdas(struct volume_group *vg)
1855 struct metadata_area *mda;
1857 /* FIXME Improve recovery situation? */
1858 /* Remove each copy of the metadata */
1859 dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
1860 if (mda->ops->vg_remove &&
1861 !mda->ops->vg_remove(vg->fid, vg, mda))
1869 * Determine whether two vgs are compatible for merging.
1871 int vgs_are_compatible(struct cmd_context *cmd __attribute__((unused)),
1872 struct volume_group *vg_from,
1873 struct volume_group *vg_to)
1875 struct lv_list *lvl1, *lvl2;
1876 struct pv_list *pvl;
1877 char *name1, *name2;
1879 if (lvs_in_vg_activated(vg_from)) {
1880 log_error("Logical volumes in \"%s\" must be inactive",
1885 /* Check compatibility */
1886 if (vg_to->extent_size != vg_from->extent_size) {
1887 log_error("Extent sizes differ: %d (%s) and %d (%s)",
1888 vg_to->extent_size, vg_to->name,
1889 vg_from->extent_size, vg_from->name);
1893 if (vg_to->max_pv &&
1894 (vg_to->max_pv < vg_to->pv_count + vg_from->pv_count)) {
1895 log_error("Maximum number of physical volumes (%d) exceeded "
1896 " for \"%s\" and \"%s\"", vg_to->max_pv, vg_to->name,
1901 if (vg_to->max_lv &&
1902 (vg_to->max_lv < vg_visible_lvs(vg_to) + vg_visible_lvs(vg_from))) {
1903 log_error("Maximum number of logical volumes (%d) exceeded "
1904 " for \"%s\" and \"%s\"", vg_to->max_lv, vg_to->name,
1909 /* Metadata types must be the same */
1910 if (vg_to->fid->fmt != vg_from->fid->fmt) {
1911 log_error("Metadata types differ for \"%s\" and \"%s\"",
1912 vg_to->name, vg_from->name);
1916 /* Clustering attribute must be the same */
1917 if (vg_is_clustered(vg_to) != vg_is_clustered(vg_from)) {
1918 log_error("Clustered attribute differs for \"%s\" and \"%s\"",
1919 vg_to->name, vg_from->name);
1923 /* Check no conflicts with LV names */
1924 dm_list_iterate_items(lvl1, &vg_to->lvs) {
1925 name1 = lvl1->lv->name;
1927 dm_list_iterate_items(lvl2, &vg_from->lvs) {
1928 name2 = lvl2->lv->name;
1930 if (!strcmp(name1, name2)) {
1931 log_error("Duplicate logical volume "
1933 "in \"%s\" and \"%s\"",
1934 name1, vg_to->name, vg_from->name);
1940 /* Check no PVs are constructed from either VG */
1941 dm_list_iterate_items(pvl, &vg_to->pvs) {
1942 if (pv_uses_vg(pvl->pv, vg_from)) {
1943 log_error("Physical volume %s might be constructed "
1944 "from same volume group %s.",
1945 pv_dev_name(pvl->pv), vg_from->name);
1950 dm_list_iterate_items(pvl, &vg_from->pvs) {
1951 if (pv_uses_vg(pvl->pv, vg_to)) {
1952 log_error("Physical volume %s might be constructed "
1953 "from same volume group %s.",
1954 pv_dev_name(pvl->pv), vg_to->name);
1962 struct _lv_postorder_baton {
1963 int (*fn)(struct logical_volume *lv, void *data);
1967 static int _lv_postorder_visit(struct logical_volume *,
1968 int (*fn)(struct logical_volume *lv, void *data),
1971 static int _lv_postorder_level(struct logical_volume *lv, void *data)
1973 struct _lv_postorder_baton *baton = data;
1974 if (lv->status & POSTORDER_OPEN_FLAG)
1975 return 1; // a data structure loop has closed...
1976 lv->status |= POSTORDER_OPEN_FLAG;
1977 int r =_lv_postorder_visit(lv, baton->fn, baton->data);
1978 lv->status &= ~POSTORDER_OPEN_FLAG;
1979 lv->status |= POSTORDER_FLAG;
1983 static int _lv_each_dependency(struct logical_volume *lv,
1984 int (*fn)(struct logical_volume *lv, void *data),
1988 struct lv_segment *lvseg;
1990 struct logical_volume *deps[] = {
1991 (lv->rdevice && lv != lv->rdevice->lv) ? lv->rdevice->lv : 0,
1992 (lv->rdevice && lv != lv->rdevice->slog) ? lv->rdevice->slog : 0,
1993 lv->snapshot ? lv->snapshot->origin : 0,
1994 lv->snapshot ? lv->snapshot->cow : 0 };
1995 for (i = 0; i < sizeof(deps) / sizeof(*deps); ++i) {
1996 if (deps[i] && !fn(deps[i], data))
2000 dm_list_iterate_items(lvseg, &lv->segments) {
2001 if (lvseg->log_lv && !fn(lvseg->log_lv, data))
2003 if (lvseg->rlog_lv && !fn(lvseg->rlog_lv, data))
2005 for (s = 0; s < lvseg->area_count; ++s) {
2006 if (seg_type(lvseg, s) == AREA_LV && !fn(seg_lv(lvseg,s), data))
2013 static int _lv_postorder_cleanup(struct logical_volume *lv, void *data)
2015 if (!(lv->status & POSTORDER_FLAG))
2017 lv->status &= ~POSTORDER_FLAG;
2019 if (!_lv_each_dependency(lv, _lv_postorder_cleanup, data))
2024 static int _lv_postorder_visit(struct logical_volume *lv,
2025 int (*fn)(struct logical_volume *lv, void *data),
2028 struct _lv_postorder_baton baton;
2031 if (lv->status & POSTORDER_FLAG)
2036 r = _lv_each_dependency(lv, _lv_postorder_level, &baton);
2044 * This will walk the LV dependency graph in depth-first order and in the
2045 * postorder, call a callback function "fn". The void *data is passed along all
2046 * the calls. The callback may return zero to indicate an error and terminate
2047 * the depth-first walk. The error is propagated to return value of
2050 static int _lv_postorder(struct logical_volume *lv,
2051 int (*fn)(struct logical_volume *lv, void *data),
2055 r = _lv_postorder_visit(lv, fn, data);
2056 _lv_postorder_cleanup(lv, 0);
2060 struct _lv_mark_if_partial_baton {
2064 static int _lv_mark_if_partial_collect(struct logical_volume *lv, void *data)
2066 struct _lv_mark_if_partial_baton *baton = data;
2067 if (lv->status & PARTIAL_LV)
2073 static int _lv_mark_if_partial_single(struct logical_volume *lv, void *data)
2076 struct _lv_mark_if_partial_baton baton;
2077 struct lv_segment *lvseg;
2079 dm_list_iterate_items(lvseg, &lv->segments) {
2080 for (s = 0; s < lvseg->area_count; ++s) {
2081 if (seg_type(lvseg, s) == AREA_PV) {
2082 if (is_missing_pv(seg_pv(lvseg, s)))
2083 lv->status |= PARTIAL_LV;
2089 _lv_each_dependency(lv, _lv_mark_if_partial_collect, &baton);
2092 lv->status |= PARTIAL_LV;
2097 static int _lv_mark_if_partial(struct logical_volume *lv)
2099 return _lv_postorder(lv, _lv_mark_if_partial_single, NULL);
2103 * Mark LVs with missing PVs using PARTIAL_LV status flag. The flag is
2104 * propagated transitively, so LVs referencing other LVs are marked
2105 * partial as well, if any of their referenced LVs are marked partial.
2107 int vg_mark_partial_lvs(struct volume_group *vg)
2109 struct logical_volume *lv;
2110 struct lv_list *lvl;
2112 dm_list_iterate_items(lvl, &vg->lvs) {
2114 if (!_lv_mark_if_partial(lv))
2121 * Be sure that all PV devices have cached read ahead in dev-cache
2122 * Currently it takes read_ahead from first PV segment only
2124 static int _lv_read_ahead_single(struct logical_volume *lv, void *data)
2126 struct lv_segment *seg = first_seg(lv);
2127 uint32_t seg_read_ahead = 0, *read_ahead = data;
2129 if (seg && seg->area_count && seg_type(seg, 0) == AREA_PV)
2130 dev_get_read_ahead(seg_pv(seg, 0)->dev, &seg_read_ahead);
2132 if (seg_read_ahead > *read_ahead)
2133 *read_ahead = seg_read_ahead;
2139 * Calculate readahead for logical volume from underlying PV devices.
2140 * If read_ahead is NULL, only ensure that readahead of PVs are preloaded
2141 * into PV struct device in dev cache.
2143 void lv_calculate_readahead(const struct logical_volume *lv, uint32_t *read_ahead)
2145 uint32_t _read_ahead = 0;
2147 if (lv->read_ahead == DM_READ_AHEAD_AUTO)
2148 _lv_postorder((struct logical_volume *)lv, _lv_read_ahead_single, &_read_ahead);
2151 log_debug("Calculated readahead of LV %s is %u", lv->name, _read_ahead);
2152 *read_ahead = _read_ahead;
2157 * Check that an LV and all its PV references are correctly listed in vg->lvs
2158 * and vg->pvs, respectively. This only looks at a single LV, but *not* at the
2159 * LVs it is using. To do the latter, you should use _lv_postorder with this
2160 * function. C.f. vg_validate.
2162 static int _lv_validate_references_single(struct logical_volume *lv, void *data)
2164 struct volume_group *vg = lv->vg;
2165 struct lv_segment *lvseg;
2166 struct pv_list *pvl;
2167 struct lv_list *lvl;
2172 dm_list_iterate_items(lvl, &vg->lvs) {
2173 if (lvl->lv == lv) {
2180 log_error(INTERNAL_ERROR
2181 "Referenced LV %s not listed in VG %s.",
2182 lv->name, vg->name);
2186 dm_list_iterate_items(lvseg, &lv->segments) {
2187 for (s = 0; s < lvseg->area_count; ++s) {
2188 if (seg_type(lvseg, s) == AREA_PV) {
2190 /* look up the reference in vg->pvs */
2191 dm_list_iterate_items(pvl, &vg->pvs) {
2192 if (pvl->pv == seg_pv(lvseg, s)) {
2199 log_error(INTERNAL_ERROR
2200 "Referenced PV %s not listed in VG %s.",
2201 pv_dev_name(seg_pv(lvseg, s)), vg->name);
2211 int vg_validate(struct volume_group *vg)
2213 struct pv_list *pvl, *pvl2;
2214 struct lv_list *lvl, *lvl2;
2215 struct lv_segment *seg;
2216 char uuid[64] __attribute__((aligned(8)));
2218 uint32_t hidden_lv_count = 0, lv_count = 0, lv_visible_count = 0;
2219 uint32_t pv_count = 0;
2220 uint32_t num_snapshots = 0;
2221 uint32_t loop_counter1, loop_counter2;
2223 if (vg->alloc == ALLOC_CLING_BY_TAGS) {
2224 log_error(INTERNAL_ERROR "VG %s allocation policy set to invalid cling_by_tags.",
2229 /* FIXME Also check there's no data/metadata overlap */
2230 dm_list_iterate_items(pvl, &vg->pvs) {
2231 if (++pv_count > vg->pv_count) {
2232 log_error(INTERNAL_ERROR "PV list corruption detected in VG %s.", vg->name);
2233 /* FIXME Dump list structure? */
2236 if (pvl->pv->vg != vg) {
2237 log_error(INTERNAL_ERROR "VG %s PV list entry points "
2238 "to different VG %s", vg->name,
2239 pvl->pv->vg ? pvl->pv->vg->name : "NULL");
2244 loop_counter1 = loop_counter2 = 0;
2245 /* FIXME Use temp hash table instead? */
2246 dm_list_iterate_items(pvl, &vg->pvs) {
2247 if (++loop_counter1 > pv_count)
2249 dm_list_iterate_items(pvl2, &vg->pvs) {
2250 if (++loop_counter2 > pv_count)
2254 if (id_equal(&pvl->pv->id,
2256 if (!id_write_format(&pvl->pv->id, uuid,
2259 log_error(INTERNAL_ERROR "Duplicate PV id "
2260 "%s detected for %s in %s.",
2261 uuid, pv_dev_name(pvl->pv),
2267 if (strcmp(pvl->pv->vg_name, vg->name)) {
2268 log_error(INTERNAL_ERROR "VG name for PV %s is corrupted.",
2269 pv_dev_name(pvl->pv));
2274 if (!check_pv_segments(vg)) {
2275 log_error(INTERNAL_ERROR "PV segments corrupted in %s.",
2281 * Count all non-snapshot invisible LVs
2283 dm_list_iterate_items(lvl, &vg->lvs) {
2286 if (lv_is_cow(lvl->lv))
2289 if (lv_is_visible(lvl->lv))
2292 if (!check_lv_segments(lvl->lv, 0)) {
2293 log_error(INTERNAL_ERROR "LV segments corrupted in %s.",
2298 if (lvl->lv->alloc == ALLOC_CLING_BY_TAGS) {
2299 log_error(INTERNAL_ERROR "LV %s allocation policy set to invalid cling_by_tags.",
2304 if (lvl->lv->status & VISIBLE_LV)
2308 if (lv_is_cow(lvl->lv))
2311 /* virtual origins are always hidden */
2312 if (lv_is_origin(lvl->lv) && !lv_is_virtual_origin(lvl->lv))
2315 /* count other non-snapshot invisible volumes */
2319 * FIXME: add check for unreferenced invisible LVs
2320 * - snapshot cow & origin
2321 * - mirror log & images
2322 * - mirror conversion volumes (_mimagetmp*)
2327 * all volumes = visible LVs + snapshot_cows + invisible LVs
2329 if (lv_count != lv_visible_count + num_snapshots + hidden_lv_count) {
2330 log_error(INTERNAL_ERROR "#internal LVs (%u) != #LVs (%"
2331 PRIu32 ") + #snapshots (%" PRIu32 ") + #internal LVs (%u) in VG %s",
2332 lv_count, lv_visible_count,
2333 num_snapshots, hidden_lv_count, vg->name);
2337 /* Avoid endless loop if lv->segments list is corrupt */
2341 loop_counter1 = loop_counter2 = 0;
2342 /* FIXME Use temp hash table instead? */
2343 dm_list_iterate_items(lvl, &vg->lvs) {
2344 if (++loop_counter1 > lv_count)
2346 dm_list_iterate_items(lvl2, &vg->lvs) {
2347 if (++loop_counter2 > lv_count)
2351 if (!strcmp(lvl->lv->name, lvl2->lv->name)) {
2352 log_error(INTERNAL_ERROR "Duplicate LV name "
2353 "%s detected in %s.", lvl->lv->name,
2357 if (id_equal(&lvl->lv->lvid.id[1],
2358 &lvl2->lv->lvid.id[1])) {
2359 if (!id_write_format(&lvl->lv->lvid.id[1], uuid,
2362 log_error(INTERNAL_ERROR "Duplicate LV id "
2363 "%s detected for %s and %s in %s.",
2364 uuid, lvl->lv->name, lvl2->lv->name,
2370 if (!check_lv_segments(lvl->lv, 1)) {
2371 log_error(INTERNAL_ERROR "LV segments corrupted in %s.",
2377 dm_list_iterate_items(lvl, &vg->lvs) {
2378 if (!_lv_postorder(lvl->lv, _lv_validate_references_single, NULL))
2382 dm_list_iterate_items(lvl, &vg->lvs) {
2383 if (!(lvl->lv->status & PVMOVE))
2385 dm_list_iterate_items(seg, &lvl->lv->segments) {
2386 if (seg_is_mirrored(seg)) {
2387 if (seg->area_count != 2) {
2388 log_error(INTERNAL_ERROR
2389 "Segment %d in %s is not 2-way.",
2390 loop_counter1, lvl->lv->name);
2393 } else if (seg->area_count != 1) {
2394 log_error(INTERNAL_ERROR
2395 "Segment %d in %s has wrong number of areas: %d.",
2396 loop_counter1, lvl->lv->name, seg->area_count);
2402 if (!(vg->fid->fmt->features & FMT_UNLIMITED_VOLS) &&
2403 (!vg->max_lv || !vg->max_pv)) {
2404 log_error(INTERNAL_ERROR "Volume group %s has limited PV/LV count"
2405 " but limit is not set.", vg->name);
2409 if (vg_max_lv_reached(vg))
2416 * After vg_write() returns success,
2417 * caller MUST call either vg_commit() or vg_revert()
2419 int vg_write(struct volume_group *vg)
2421 struct dm_list *mdah;
2422 struct metadata_area *mda;
2424 if (!vg_validate(vg))
2427 if (vg->status & PARTIAL_VG) {
2428 log_error("Cannot update partial volume group %s.", vg->name);
2432 if (vg_missing_pv_count(vg) && !vg->cmd->handles_missing_pvs) {
2433 log_error("Cannot update volume group %s while physical "
2434 "volumes are missing.", vg->name);
2438 if (vg_has_unknown_segments(vg) && !vg->cmd->handles_unknown_segments) {
2439 log_error("Cannot update volume group %s with unknown segments in it!",
2444 if ((vg->fid->fmt->features & FMT_MDAS) && !_vg_adjust_ignored_mdas(vg))
2447 if (!vg_mda_used_count(vg)) {
2448 log_error("Aborting vg_write: No metadata areas to write to!");
2452 if (!drop_cached_metadata(vg)) {
2453 log_error("Unable to drop cached metadata for VG %s.", vg->name);
2459 /* Write to each copy of the metadata area */
2460 dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
2461 if (!mda->ops->vg_write) {
2462 log_error("Format does not support writing volume"
2463 "group metadata areas");
2465 dm_list_uniterate(mdah, &vg->fid->metadata_areas_in_use, &mda->list) {
2466 mda = dm_list_item(mdah, struct metadata_area);
2468 if (mda->ops->vg_revert &&
2469 !mda->ops->vg_revert(vg->fid, vg, mda)) {
2475 if (!mda->ops->vg_write(vg->fid, vg, mda)) {
2478 dm_list_uniterate(mdah, &vg->fid->metadata_areas_in_use, &mda->list) {
2479 mda = dm_list_item(mdah, struct metadata_area);
2481 if (mda->ops->vg_revert &&
2482 !mda->ops->vg_revert(vg->fid, vg, mda)) {
2490 /* Now pre-commit each copy of the new metadata */
2491 dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
2492 if (mda->ops->vg_precommit &&
2493 !mda->ops->vg_precommit(vg->fid, vg, mda)) {
2496 dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
2497 if (mda->ops->vg_revert &&
2498 !mda->ops->vg_revert(vg->fid, vg, mda)) {
2509 static int _vg_commit_mdas(struct volume_group *vg)
2511 struct metadata_area *mda, *tmda;
2512 struct dm_list ignored;
2514 int cache_updated = 0;
2516 /* Rearrange the metadata_areas_in_use so ignored mdas come first. */
2517 dm_list_init(&ignored);
2518 dm_list_iterate_items_safe(mda, tmda, &vg->fid->metadata_areas_in_use)
2519 if (mda_is_ignored(mda))
2520 dm_list_move(&ignored, &mda->list);
2522 dm_list_iterate_items_safe(mda, tmda, &ignored)
2523 dm_list_move(&vg->fid->metadata_areas_in_use, &mda->list);
2525 /* Commit to each copy of the metadata area */
2526 dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
2528 if (mda->ops->vg_commit &&
2529 !mda->ops->vg_commit(vg->fid, vg, mda)) {
2533 /* Update cache first time we succeed */
2534 if (!failed && !cache_updated) {
2535 lvmcache_update_vg(vg, 0);
2539 return cache_updated;
2542 /* Commit pending changes */
2543 int vg_commit(struct volume_group *vg)
2545 int cache_updated = 0;
2547 if (!vgname_is_locked(vg->name)) {
2548 log_error(INTERNAL_ERROR "Attempt to write new VG metadata "
2549 "without locking %s", vg->name);
2550 return cache_updated;
2553 cache_updated = _vg_commit_mdas(vg);
2555 if (cache_updated) {
2556 /* Instruct remote nodes to upgrade cached metadata. */
2557 remote_commit_cached_metadata(vg);
2559 * We need to clear old_name after a successful commit.
2560 * The volume_group structure could be reused later.
2562 vg->old_name = NULL;
2565 /* If update failed, remove any cached precommitted metadata. */
2566 if (!cache_updated && !drop_cached_metadata(vg))
2567 log_error("Attempt to drop cached metadata failed "
2568 "after commit for VG %s.", vg->name);
2570 /* If at least one mda commit succeeded, it was committed */
2571 return cache_updated;
2574 /* Don't commit any pending changes */
2575 int vg_revert(struct volume_group *vg)
2577 struct metadata_area *mda;
2579 dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) {
2580 if (mda->ops->vg_revert &&
2581 !mda->ops->vg_revert(vg->fid, vg, mda)) {
2586 if (!drop_cached_metadata(vg))
2587 log_error("Attempt to drop cached metadata failed "
2588 "after reverted update for VG %s.", vg->name);
2590 remote_revert_cached_metadata(vg);
2595 /* Make orphan PVs look like a VG */
2596 static struct volume_group *_vg_read_orphans(struct cmd_context *cmd,
2598 const char *orphan_vgname)
2600 struct lvmcache_vginfo *vginfo;
2601 struct lvmcache_info *info;
2602 struct pv_list *pvl;
2603 struct volume_group *vg;
2604 struct physical_volume *pv;
2605 struct dm_pool *mem;
2607 lvmcache_label_scan(cmd, 0);
2609 if (!(vginfo = vginfo_from_vgname(orphan_vgname, NULL)))
2612 if (!(mem = dm_pool_create("vg_read orphan", VG_MEMPOOL_CHUNK)))
2615 if (!(vg = dm_pool_zalloc(mem, sizeof(*vg)))) {
2616 log_error("vg allocation failed");
2619 dm_list_init(&vg->pvs);
2620 dm_list_init(&vg->lvs);
2621 dm_list_init(&vg->tags);
2622 dm_list_init(&vg->removed_pvs);
2625 if (!(vg->name = dm_pool_strdup(mem, orphan_vgname))) {
2626 log_error("vg name allocation failed");
2630 /* create format instance with appropriate metadata area */
2631 if (!(vg->fid = vginfo->fmt->ops->create_instance(vginfo->fmt,
2632 orphan_vgname, NULL,
2634 log_error("Failed to create format instance");
2638 dm_list_iterate_items(info, &vginfo->infos) {
2639 if (!(pv = _pv_read(cmd, mem, dev_name(info->dev), NULL, NULL, warnings, 0))) {
2642 if (!(pvl = dm_pool_zalloc(mem, sizeof(*pvl)))) {
2643 log_error("pv_list allocation failed");
2647 add_pvl_to_vgs(vg, pvl);
2652 dm_pool_destroy(mem);
2656 static int _update_pv_list(struct dm_pool *pvmem, struct dm_list *all_pvs, struct volume_group *vg)
2658 struct pv_list *pvl, *pvl2;
2660 dm_list_iterate_items(pvl, &vg->pvs) {
2661 dm_list_iterate_items(pvl2, all_pvs) {
2662 if (pvl->pv->dev == pvl2->pv->dev)
2667 * PV is not on list so add it.
2669 if (!(pvl2 = _copy_pvl(pvmem, pvl))) {
2670 log_error("pv_list allocation for '%s' failed",
2671 pv_dev_name(pvl->pv));
2674 dm_list_add(all_pvs, &pvl2->list);
2682 int vg_missing_pv_count(const struct volume_group *vg)
2685 struct pv_list *pvl;
2686 dm_list_iterate_items(pvl, &vg->pvs) {
2687 if (is_missing_pv(pvl->pv))
2693 static void check_reappeared_pv(struct volume_group *correct_vg,
2694 struct physical_volume *pv)
2696 struct pv_list *pvl;
2699 * Skip these checks in case the tool is going to deal with missing
2700 * PVs, especially since the resulting messages can be pretty
2703 if (correct_vg->cmd->handles_missing_pvs)
2706 dm_list_iterate_items(pvl, &correct_vg->pvs)
2707 if (pv->dev == pvl->pv->dev && is_missing_pv(pvl->pv)) {
2708 log_warn("Missing device %s reappeared, updating "
2709 "metadata for VG %s to version %u.",
2710 pv_dev_name(pvl->pv), pv_vg_name(pvl->pv),
2712 if (pvl->pv->pe_alloc_count == 0) {
2713 pv->status &= ~MISSING_PV;
2714 pvl->pv->status &= ~MISSING_PV;
2716 log_warn("Device still marked missing because of allocated data "
2717 "on it, remove volumes and consider vgreduce --removemissing.");
2720 /* Caller sets consistent to 1 if it's safe for vg_read_internal to correct
2721 * inconsistent metadata on disk (i.e. the VG write lock is held).
2722 * This guarantees only consistent metadata is returned.
2723 * If consistent is 0, caller must check whether consistent == 1 on return
2724 * and take appropriate action if it isn't (e.g. abort; get write lock
2725 * and call vg_read_internal again).
2727 * If precommitted is set, use precommitted metadata if present.
2729 * Either of vgname or vgid may be NULL.
2731 static struct volume_group *_vg_read(struct cmd_context *cmd,
2735 int *consistent, unsigned precommitted)
2737 struct format_instance *fid;
2738 const struct format_type *fmt;
2739 struct volume_group *vg, *correct_vg = NULL;
2740 struct metadata_area *mda;
2741 struct lvmcache_info *info;
2742 int inconsistent = 0;
2743 int inconsistent_vgid = 0;
2744 int inconsistent_pvs = 0;
2745 int inconsistent_seqno = 0;
2746 int inconsistent_mdas = 0;
2747 unsigned use_precommitted = precommitted;
2748 unsigned saved_handles_missing_pvs = cmd->handles_missing_pvs;
2749 struct dm_list *pvids;
2750 struct pv_list *pvl, *pvl2;
2751 struct dm_list all_pvs;
2752 char uuid[64] __attribute__((aligned(8)));
2754 if (is_orphan_vg(vgname)) {
2755 if (use_precommitted) {
2756 log_error(INTERNAL_ERROR "vg_read_internal requires vgname "
2757 "with pre-commit.");
2761 return _vg_read_orphans(cmd, warnings, vgname);
2765 * If cached metadata was inconsistent and *consistent is set
2766 * then repair it now. Otherwise just return it.
2767 * Also return if use_precommitted is set due to the FIXME in
2768 * the missing PV logic below.
2770 if ((correct_vg = lvmcache_get_vg(vgid, precommitted)) &&
2771 (use_precommitted || !*consistent || !(correct_vg->status & INCONSISTENT_VG))) {
2772 if (!(correct_vg->status & INCONSISTENT_VG))
2774 else /* Inconsistent but we can't repair it */
2775 correct_vg->status &= ~INCONSISTENT_VG;
2777 if (vg_missing_pv_count(correct_vg)) {
2778 log_verbose("There are %d physical volumes missing.",
2779 vg_missing_pv_count(correct_vg));
2780 vg_mark_partial_lvs(correct_vg);
2784 free_vg(correct_vg);
2788 /* Find the vgname in the cache */
2789 /* If it's not there we must do full scan to be completely sure */
2790 if (!(fmt = fmt_from_vgname(vgname, vgid, 1))) {
2791 lvmcache_label_scan(cmd, 0);
2792 if (!(fmt = fmt_from_vgname(vgname, vgid, 1))) {
2793 /* Independent MDAs aren't supported under low memory */
2794 if (!cmd->independent_metadata_areas && memlock())
2796 lvmcache_label_scan(cmd, 2);
2797 if (!(fmt = fmt_from_vgname(vgname, vgid, 0)))
2802 /* Now determine the correct vgname if none was supplied */
2803 if (!vgname && !(vgname = vgname_from_vgid(cmd->mem, vgid)))
2806 if (use_precommitted && !(fmt->features & FMT_PRECOMMIT))
2807 use_precommitted = 0;
2809 /* create format instance with appropriate metadata area */
2810 if (!(fid = fmt->ops->create_instance(fmt, vgname, vgid, NULL))) {
2811 log_error("Failed to create format instance");
2815 /* Store pvids for later so we can check if any are missing */
2816 if (!(pvids = lvmcache_get_pvids(cmd, vgname, vgid)))
2819 /* Ensure contents of all metadata areas match - else do recovery */
2820 dm_list_iterate_items(mda, &fid->metadata_areas_in_use) {
2821 if ((use_precommitted &&
2822 !(vg = mda->ops->vg_read_precommit(fid, vgname, mda))) ||
2823 (!use_precommitted &&
2824 !(vg = mda->ops->vg_read(fid, vgname, mda)))) {
2834 /* FIXME Also ensure contents same - checksum compare? */
2835 if (correct_vg->seqno != vg->seqno) {
2836 if (cmd->metadata_read_only)
2837 log_very_verbose("Not repairing VG %s metadata seqno (%d != %d) "
2838 "as global/metadata_read_only is set.",
2839 vgname, vg->seqno, correct_vg->seqno);
2842 inconsistent_seqno = 1;
2844 if (vg->seqno > correct_vg->seqno) {
2845 free_vg(correct_vg);
2850 if (vg != correct_vg)
2854 /* Ensure every PV in the VG was in the cache */
2857 * If the VG has PVs without mdas, or ignored mdas, they may
2858 * still be orphans in the cache: update the cache state here,
2859 * and update the metadata lists in the vg.
2861 if (!inconsistent &&
2862 dm_list_size(&correct_vg->pvs) > dm_list_size(pvids)) {
2863 dm_list_iterate_items(pvl, &correct_vg->pvs) {
2864 if (!pvl->pv->dev) {
2865 inconsistent_pvs = 1;
2869 if (str_list_match_item(pvids, pvl->pv->dev->pvid))
2873 * PV not marked as belonging to this VG in cache.
2874 * Check it's an orphan without metadata area
2877 if (!(info = info_from_pvid(pvl->pv->dev->pvid, 1)) ||
2878 !info->vginfo || !is_orphan_vg(info->vginfo->vgname)) {
2879 inconsistent_pvs = 1;
2882 if (dm_list_size(&info->mdas)) {
2883 if (!fid_add_mdas(fid, &info->mdas))
2886 log_debug("Empty mda found for VG %s.", vgname);
2888 if (inconsistent_mdas)
2892 * If any newly-added mdas are in-use then their
2893 * metadata needs updating.
2895 dm_list_iterate_items(mda, &info->mdas)
2896 if (!mda_is_ignored(mda)) {
2897 inconsistent_mdas = 1;
2903 /* If the check passed, let's update VG and recalculate pvids */
2904 if (!inconsistent_pvs) {
2905 log_debug("Updating cache for PVs without mdas "
2906 "in VG %s.", vgname);
2908 * If there is no precommitted metadata, committed metadata
2909 * is read and stored in the cache even if use_precommitted is set
2911 lvmcache_update_vg(correct_vg, correct_vg->status & PRECOMMITTED);
2913 if (!(pvids = lvmcache_get_pvids(cmd, vgname, vgid)))
2918 if (dm_list_size(&correct_vg->pvs) !=
2919 dm_list_size(pvids) + vg_missing_pv_count(correct_vg)) {
2920 log_debug("Cached VG %s had incorrect PV list",
2926 free_vg(correct_vg);
2929 } else dm_list_iterate_items(pvl, &correct_vg->pvs) {
2930 if (is_missing_pv(pvl->pv))
2932 if (!str_list_match_item(pvids, pvl->pv->dev->pvid)) {
2933 log_debug("Cached VG %s had incorrect PV list",
2935 free_vg(correct_vg);
2941 if (correct_vg && inconsistent_mdas) {
2942 free_vg(correct_vg);
2947 dm_list_init(&all_pvs);
2949 /* Failed to find VG where we expected it - full scan and retry */
2953 /* Independent MDAs aren't supported under low memory */
2954 if (!cmd->independent_metadata_areas && memlock())
2956 lvmcache_label_scan(cmd, 2);
2957 if (!(fmt = fmt_from_vgname(vgname, vgid, 0)))
2960 if (precommitted && !(fmt->features & FMT_PRECOMMIT))
2961 use_precommitted = 0;
2963 /* create format instance with appropriate metadata area */
2964 if (!(fid = fmt->ops->create_instance(fmt, vgname, vgid, NULL))) {
2965 log_error("Failed to create format instance");
2969 /* Ensure contents of all metadata areas match - else recover */
2970 dm_list_iterate_items(mda, &fid->metadata_areas_in_use) {
2971 if ((use_precommitted &&
2972 !(vg = mda->ops->vg_read_precommit(fid, vgname,
2974 (!use_precommitted &&
2975 !(vg = mda->ops->vg_read(fid, vgname, mda)))) {
2981 if (!_update_pv_list(cmd->mem, &all_pvs, correct_vg)) {
2988 if (strncmp((char *)vg->id.uuid,
2989 (char *)correct_vg->id.uuid, ID_LEN)) {
2991 inconsistent_vgid = 1;
2994 /* FIXME Also ensure contents same - checksums same? */
2995 if (correct_vg->seqno != vg->seqno) {
2996 /* Ignore inconsistent seqno if told to skip repair logic */
2997 if (cmd->metadata_read_only)
2998 log_very_verbose("Not repairing VG %s metadata seqno (%d != %d) "
2999 "as global/metadata_read_only is set.",
3000 vgname, vg->seqno, correct_vg->seqno);
3003 inconsistent_seqno = 1;
3005 if (!_update_pv_list(cmd->mem, &all_pvs, vg)) {
3007 free_vg(correct_vg);
3010 if (vg->seqno > correct_vg->seqno) {
3011 free_vg(correct_vg);
3016 if (vg != correct_vg)
3020 /* Give up looking */
3026 * If there is no precommitted metadata, committed metadata
3027 * is read and stored in the cache even if use_precommitted is set
3029 lvmcache_update_vg(correct_vg, correct_vg->status & PRECOMMITTED &
3030 (inconsistent ? INCONSISTENT_VG : 0));
3033 /* FIXME Test should be if we're *using* precommitted metadata not if we were searching for it */
3034 if (use_precommitted) {
3035 log_error("Inconsistent pre-commit metadata copies "
3036 "for volume group %s", vgname);
3037 /* FIXME: during repair, the inconsistent flag is set because some metadata areas
3038 * are missing (on missing PVs). Code should create a list of missing PVs, compare it
3039 * with the PVs marked missing in the metadata and, if they match, use it as a consistent vg.
3040 * For now, return precommitted metadata if the remaining seqnos match here to allow
3041 * preloading the table in the suspend call.
3043 if (!inconsistent_seqno) {
3047 free_vg(correct_vg);
3054 /* Don't touch if vgids didn't match */
3055 if (inconsistent_vgid) {
3056 log_error("Inconsistent metadata UUIDs found for "
3057 "volume group %s", vgname);
3062 log_warn("WARNING: Inconsistent metadata found for VG %s - updating "
3063 "to use version %u", vgname, correct_vg->seqno);
3066 * If PV is marked missing but we found it,
3067 * update metadata and remove MISSING flag
3069 dm_list_iterate_items(pvl, &all_pvs)
3070 check_reappeared_pv(correct_vg, pvl->pv);
3072 cmd->handles_missing_pvs = 1;
3073 if (!vg_write(correct_vg)) {
3074 log_error("Automatic metadata correction failed");
3075 free_vg(correct_vg);
3076 cmd->handles_missing_pvs = saved_handles_missing_pvs;
3079 cmd->handles_missing_pvs = saved_handles_missing_pvs;
3081 if (!vg_commit(correct_vg)) {
3082 log_error("Automatic metadata correction commit "
3084 free_vg(correct_vg);
3088 dm_list_iterate_items(pvl, &all_pvs) {
3089 dm_list_iterate_items(pvl2, &correct_vg->pvs) {
3090 if (pvl->pv->dev == pvl2->pv->dev)
3093 if (!id_write_format(&pvl->pv->id, uuid, sizeof(uuid))) {
3094 free_vg(correct_vg);
3097 log_error("Removing PV %s (%s) that no longer belongs to VG %s",
3098 pv_dev_name(pvl->pv), uuid, correct_vg->name);
3099 if (!pv_write_orphan(cmd, pvl->pv)) {
3100 free_vg(correct_vg);
3104 /* Refresh metadata after orphan write */
3105 drop_cached_metadata(correct_vg);
3111 if (vg_missing_pv_count(correct_vg)) {
3112 log_verbose("There are %d physical volumes missing.",
3113 vg_missing_pv_count(correct_vg));
3114 vg_mark_partial_lvs(correct_vg);
3117 if ((correct_vg->status & PVMOVE) && !pvmove_mode()) {
3118 log_error("WARNING: Interrupted pvmove detected in "
3119 "volume group %s", correct_vg->name);
3120 log_error("Please restore the metadata by running "
3122 free_vg(correct_vg);
3130 struct volume_group *vg_read_internal(struct cmd_context *cmd, const char *vgname,
3131 const char *vgid, int warnings, int *consistent)
3133 struct volume_group *vg;
3134 struct lv_list *lvl;
3136 if (!(vg = _vg_read(cmd, vgname, vgid, warnings, consistent, 0)))
3139 if (!check_pv_segments(vg)) {
3140 log_error(INTERNAL_ERROR "PV segments corrupted in %s.",
3146 dm_list_iterate_items(lvl, &vg->lvs) {
3147 if (!check_lv_segments(lvl->lv, 0)) {
3148 log_error(INTERNAL_ERROR "LV segments corrupted in %s.",
3155 dm_list_iterate_items(lvl, &vg->lvs) {
3157 * Checks that cross-reference other LVs.
3159 if (!check_lv_segments(lvl->lv, 1)) {
3160 log_error(INTERNAL_ERROR "LV segments corrupted in %s.",
3170 void free_vg(struct volume_group *vg)
3175 if (vg->cmd && vg->vgmem == vg->cmd->mem) {
3176 log_error(INTERNAL_ERROR "global memory pool used for VG %s",
3181 dm_pool_destroy(vg->vgmem);
3184 /* This is only called by lv_from_lvid, which is only called from
3185 * activate.c so we know the appropriate VG lock is already held and
3186 * the vg_read_internal is therefore safe.
3188 static struct volume_group *_vg_read_by_vgid(struct cmd_context *cmd,
3190 unsigned precommitted)
3193 struct dm_list *vgnames;
3194 struct volume_group *vg;
3195 struct lvmcache_vginfo *vginfo;
3196 struct str_list *strl;
3199 /* Is corresponding vgname already cached? */
3200 if ((vginfo = vginfo_from_vgid(vgid)) &&
3201 vginfo->vgname && !is_orphan_vg(vginfo->vgname)) {
3202 if ((vg = _vg_read(cmd, NULL, vgid, 1,
3203 &consistent, precommitted)) &&
3204 !strncmp((char *)vg->id.uuid, vgid, ID_LEN)) {
3206 log_error("Volume group %s metadata is "
3207 "inconsistent", vg->name);
3213 /* Mustn't scan if memory locked: ensure cache gets pre-populated! */
3217 /* FIXME Need a genuine read by ID here - don't vg_read_internal by name! */
3218 /* FIXME Disabled vgrenames while active for now because we aren't
3219 * allowed to do a full scan here any more. */
3221 // The slow way - full scan required to cope with vgrename
3222 lvmcache_label_scan(cmd, 2);
3223 if (!(vgnames = get_vgnames(cmd, 0))) {
3224 log_error("vg_read_by_vgid: get_vgnames failed");
3228 dm_list_iterate_items(strl, vgnames) {
3231 continue; // FIXME Unnecessary?
3233 if ((vg = _vg_read(cmd, vgname, vgid, 1, &consistent,
3235 !strncmp((char *)vg->id.uuid, vgid, ID_LEN)) {
3237 log_error("Volume group %s metadata is "
3238 "inconsistent", vgname);
3250 /* Only called by activate.c */
3251 struct logical_volume *lv_from_lvid(struct cmd_context *cmd, const char *lvid_s,
3252 unsigned precommitted)
3254 struct lv_list *lvl;
3255 struct volume_group *vg;
3256 const union lvid *lvid;
3258 lvid = (const union lvid *) lvid_s;
3260 log_very_verbose("Finding volume group for uuid %s", lvid_s);
3261 if (!(vg = _vg_read_by_vgid(cmd, (const char *)lvid->id[0].uuid, precommitted))) {
3262 log_error("Volume group for uuid not found: %s", lvid_s);
3266 log_verbose("Found volume group \"%s\"", vg->name);
3267 if (vg->status & EXPORTED_VG) {
3268 log_error("Volume group \"%s\" is exported", vg->name);
3271 if (!(lvl = find_lv_in_vg_by_lvid(vg, lvid))) {
3272 log_very_verbose("Can't find logical volume id %s", lvid_s);
3283 const char *find_vgname_from_pvid(struct cmd_context *cmd,
3287 struct lvmcache_info *info;
3289 vgname = lvmcache_vgname_from_pvid(cmd, pvid);
3291 if (is_orphan_vg(vgname)) {
3292 if (!(info = info_from_pvid(pvid, 0))) {
3296 * If an orphan PV has no MDAs, or it has MDAs but the
3297 * MDA is ignored, it may appear to be an orphan until
3298 * the metadata is read off another PV in the same VG.
3299 * Detecting this means checking every VG by scanning
3300 * every PV on the system.
3302 if (mdas_empty_or_ignored(&info->mdas)) {
3303 if (!scan_vgs_for_pvs(cmd, 1)) {
3304 log_error("Rescan for PVs without "
3305 "metadata areas failed.");
3309 * Ask lvmcache again - we may have a non-orphan
3312 vgname = lvmcache_vgname_from_pvid(cmd, pvid);
3319 const char *find_vgname_from_pvname(struct cmd_context *cmd,
3324 pvid = pvid_from_devname(cmd, pvname);
3329 return find_vgname_from_pvid(cmd, pvid);
3333 * pv_read - read and return a handle to a physical volume
3334 * @cmd: LVM command initiating the pv_read
3335 * @pv_name: full device name of the PV, including the path
3336 * @mdas: list of metadata areas of the PV
3337 * @label_sector: sector number where the PV label is stored on @pv_name
3341 * PV handle - valid pv_name and successful read of the PV, or
3342 * NULL - invalid parameter or error in reading the PV
3345 * FIXME - liblvm todo - make into function that returns handle
3347 struct physical_volume *pv_read(struct cmd_context *cmd, const char *pv_name,
3348 struct dm_list *mdas, uint64_t *label_sector,
3349 int warnings, int scan_label_only)
3351 return _pv_read(cmd, cmd->mem, pv_name, mdas, label_sector, warnings, scan_label_only);
3354 /* FIXME Use label functions instead of PV functions */
3355 static struct physical_volume *_pv_read(struct cmd_context *cmd,
3356 struct dm_pool *pvmem,
3357 const char *pv_name,
3358 struct dm_list *mdas,
3359 uint64_t *label_sector,
3360 int warnings, int scan_label_only)
3362 struct physical_volume *pv;
3363 struct label *label;
3364 struct lvmcache_info *info;
3367 if (!(dev = dev_cache_get(pv_name, cmd->filter)))
3370 if (!(label_read(dev, &label, UINT64_C(0)))) {
3372 log_error("No physical volume label read from %s",
3377 info = (struct lvmcache_info *) label->info;
3378 if (label_sector && *label_sector)
3379 *label_sector = label->sector;
3381 pv = _alloc_pv(pvmem, dev);
3383 log_error("pv allocation for '%s' failed", pv_name);
3387 /* FIXME Move more common code up here */
3388 if (!(info->fmt->ops->pv_read(info->fmt, pv_name, pv, mdas,
3389 scan_label_only))) {
3390 log_error("Failed to read existing physical volume '%s'",
3398 if (!alloc_pv_segment_whole_pv(pvmem, pv))
3403 _free_pv(pvmem, pv);
3407 /* May return empty list */
3408 struct dm_list *get_vgnames(struct cmd_context *cmd, int include_internal)
3410 return lvmcache_get_vgnames(cmd, include_internal);
3413 struct dm_list *get_vgids(struct cmd_context *cmd, int include_internal)
3415 return lvmcache_get_vgids(cmd, include_internal);
3418 static int _get_pvs(struct cmd_context *cmd, int warnings, struct dm_list **pvslist)
3420 struct str_list *strl;
3421 struct dm_list * uninitialized_var(results);
3422 const char *vgname, *vgid;
3423 struct pv_list *pvl, *pvl_copy;
3424 struct dm_list *vgids;
3425 struct volume_group *vg;
3429 lvmcache_label_scan(cmd, 0);
3432 if (!(results = dm_pool_alloc(cmd->mem, sizeof(*results)))) {
3433 log_error("PV list allocation failed");
3437 dm_list_init(results);
3440 /* Get list of VGs */
3441 if (!(vgids = get_vgids(cmd, 1))) {
3442 log_error("get_pvs: get_vgids failed");
3446 /* Read every VG to ensure cache consistency */
3447 /* Orphan VG is last on list */
3448 old_pvmove = pvmove_mode();
3450 dm_list_iterate_items(strl, vgids) {
3453 continue; /* FIXME Unnecessary? */
3455 if (!(vgname = vgname_from_vgid(NULL, vgid))) {
3459 if (!(vg = vg_read_internal(cmd, vgname, vgid, warnings, &consistent))) {
3464 log_warn("WARNING: Volume Group %s is not consistent",
3467 /* Move PVs onto results list */
3469 dm_list_iterate_items(pvl, &vg->pvs) {
3470 if (!(pvl_copy = _copy_pvl(cmd->mem, pvl))) {
3471 log_error("PV list allocation failed");
3475 dm_list_add(results, &pvl_copy->list);
3479 init_pvmove(old_pvmove);
3484 dm_pool_free(cmd->mem, vgids);
3489 struct dm_list *get_pvs(struct cmd_context *cmd)
3491 struct dm_list *results;
3493 if (!_get_pvs(cmd, 1, &results))
3499 int scan_vgs_for_pvs(struct cmd_context *cmd, int warnings)
3501 return _get_pvs(cmd, warnings, NULL);
3504 int pv_write(struct cmd_context *cmd __attribute__((unused)),
3505 struct physical_volume *pv,
3506 struct dm_list *mdas, int64_t label_sector)
3508 if (!pv->fmt->ops->pv_write) {
3509 log_error("Format does not support writing physical volumes");
3513 if (!is_orphan_vg(pv->vg_name) || pv->pe_alloc_count) {
3514 log_error("Assertion failed: can't _pv_write non-orphan PV "
3515 "(in VG %s)", pv->vg_name);
3519 if (!pv->fmt->ops->pv_write(pv->fmt, pv, mdas, label_sector))
3525 int pv_write_orphan(struct cmd_context *cmd, struct physical_volume *pv)
3527 const char *old_vg_name = pv->vg_name;
3529 pv->vg_name = cmd->fmt->orphan_vg_name;
3530 pv->status = ALLOCATABLE_PV;
3531 pv->pe_alloc_count = 0;
3533 if (!dev_get_size(pv->dev, &pv->size)) {
3534 log_error("%s: Couldn't get size.", pv_dev_name(pv));
3538 if (!pv_write(cmd, pv, NULL, INT64_C(-1))) {
3539 log_error("Failed to clear metadata from physical "
3540 "volume \"%s\" after removal from \"%s\"",
3541 pv_dev_name(pv), old_vg_name);
3548 int is_global_vg(const char *vg_name)
3550 return (vg_name && !strcmp(vg_name, VG_GLOBAL)) ? 1 : 0;
3554 * is_orphan_vg - Determine whether a vg_name is an orphan
3555 * @vg_name: pointer to the vg_name
3557 int is_orphan_vg(const char *vg_name)
3559 return (vg_name && !strncmp(vg_name, ORPHAN_PREFIX, sizeof(ORPHAN_PREFIX) - 1)) ? 1 : 0;
3567 int pv_analyze(struct cmd_context *cmd, const char *pv_name,
3568 uint64_t label_sector)
3570 struct label *label;
3572 struct metadata_area *mda;
3573 struct lvmcache_info *info;
3575 dev = dev_cache_get(pv_name, cmd->filter);
3577 log_error("Device %s not found (or ignored by filtering).",
3583 * First, scan for LVM labels.
3585 if (!label_read(dev, &label, label_sector)) {
3586 log_error("Could not find LVM label on %s",
3591 log_print("Found label on %s, sector %"PRIu64", type=%s",
3592 pv_name, label->sector, label->type);
3595 * Next, loop through metadata areas
3598 dm_list_iterate_items(mda, &info->mdas)
3599 mda->ops->pv_analyze_mda(info->fmt, mda);
3604 /* FIXME: remove / combine this with locking? */
3605 int vg_check_write_mode(struct volume_group *vg)
3607 if (vg->open_mode != 'w') {
3608 log_errno(EPERM, "Attempt to modify a read-only VG");
3615 * Performs a set of checks against a VG according to bits set in status
3616 * and returns FAILED_* bits for those that aren't acceptable.
3618 * FIXME Remove the unnecessary duplicate definitions and return bits directly.
3620 static uint32_t _vg_bad_status_bits(const struct volume_group *vg,
3623 uint32_t failure = 0;
3625 if ((status & CLUSTERED) &&
3626 (vg_is_clustered(vg)) && !locking_is_clustered()) {
3627 log_error("Skipping clustered volume group %s", vg->name);
3628 /* Return because other flags are considered undefined. */
3629 return FAILED_CLUSTERED;
3632 if ((status & EXPORTED_VG) &&
3633 vg_is_exported(vg)) {
3634 log_error("Volume group %s is exported", vg->name);
3635 failure |= FAILED_EXPORTED;
3638 if ((status & LVM_WRITE) &&
3639 !(vg->status & LVM_WRITE)) {
3640 log_error("Volume group %s is read-only", vg->name);
3641 failure |= FAILED_READ_ONLY;
3644 if ((status & RESIZEABLE_VG) &&
3645 !vg_is_resizeable(vg)) {
3646 log_error("Volume group %s is not resizeable.", vg->name);
3647 failure |= FAILED_RESIZEABLE;
3654 * vg_check_status - check volume group status flags and log error
3655 * @vg - volume group to check status flags
3656 * @status - specific status flags to check (e.g. EXPORTED_VG)
3658 int vg_check_status(const struct volume_group *vg, uint64_t status)
3660 return !_vg_bad_status_bits(vg, status);
3663 static struct volume_group *_recover_vg(struct cmd_context *cmd,
3664 const char *vg_name, const char *vgid)
3667 struct volume_group *vg;
3669 unlock_vg(cmd, vg_name);
3673 if (!lock_vol(cmd, vg_name, LCK_VG_WRITE))
3676 if (!(vg = vg_read_internal(cmd, vg_name, vgid, 1, &consistent)))
3684 return (struct volume_group *)vg;
3688 * Consolidated locking, reading, and status flag checking.
3690 * If the metadata is inconsistent, setting READ_ALLOW_INCONSISTENT in
3691 * misc_flags will return it with FAILED_INCONSISTENT set instead of
3692 * giving you nothing.
3694 * Use vg_read_error(vg) to determine the result. Nonzero means there were
3695 * problems reading the volume group.
3696 * Zero value means that the VG is open and appropriate locks are held.
3698 static struct volume_group *_vg_lock_and_read(struct cmd_context *cmd, const char *vg_name,
3699 const char *vgid, uint32_t lock_flags,
3700 uint64_t status_flags, uint32_t misc_flags)
3702 struct volume_group *vg = NULL;
3705 uint32_t failure = 0;
3708 if (misc_flags & READ_ALLOW_INCONSISTENT || lock_flags != LCK_VG_WRITE)
3711 if (!validate_name(vg_name) && !is_orphan_vg(vg_name)) {
3712 log_error("Volume group name %s has invalid characters",
3717 already_locked = vgname_is_locked(vg_name);
3719 if (!already_locked && !(misc_flags & READ_WITHOUT_LOCK) &&
3720 !lock_vol(cmd, vg_name, lock_flags)) {
3721 log_error("Can't get lock for %s", vg_name);
3722 return _vg_make_handle(cmd, vg, FAILED_LOCKING);
3725 if (is_orphan_vg(vg_name))
3726 status_flags &= ~LVM_WRITE;
3728 consistent_in = consistent;
3730 /* If consistent == 1, we get NULL here if correction fails. */
3731 if (!(vg = vg_read_internal(cmd, vg_name, vgid, 1, &consistent))) {
3732 if (consistent_in && !consistent) {
3733 log_error("Volume group \"%s\" inconsistent.", vg_name);
3734 failure |= FAILED_INCONSISTENT;
3738 log_error("Volume group \"%s\" not found", vg_name);
3740 failure |= FAILED_NOTFOUND;
3744 if (vg_is_clustered(vg) && !locking_is_clustered()) {
3745 log_error("Skipping clustered volume group %s", vg->name);
3746 failure |= FAILED_CLUSTERED;
3750 /* consistent == 0 when VG is not found, but failed == FAILED_NOTFOUND */
3751 if (!consistent && !failure) {
3753 if (!(vg = _recover_vg(cmd, vg_name, vgid))) {
3754 log_error("Recovery of volume group \"%s\" failed.",
3756 failure |= FAILED_INCONSISTENT;
3762 * Check that the tool can handle tricky cases -- missing PVs and
3763 * unknown segment types.
3766 if (!cmd->handles_missing_pvs && vg_missing_pv_count(vg) &&
3767 lock_flags == LCK_VG_WRITE) {
3768 log_error("Cannot change VG %s while PVs are missing.", vg->name);
3769 log_error("Consider vgreduce --removemissing.");
3770 failure |= FAILED_INCONSISTENT; /* FIXME new failure code here? */
3774 if (!cmd->handles_unknown_segments && vg_has_unknown_segments(vg) &&
3775 lock_flags == LCK_VG_WRITE) {
3776 log_error("Cannot change VG %s with unknown segments in it!",
3778 failure |= FAILED_INCONSISTENT; /* FIXME new failure code here? */
3782 failure |= _vg_bad_status_bits(vg, status_flags);
3786 return _vg_make_handle(cmd, vg, failure);
3789 if (!already_locked && !(misc_flags & READ_WITHOUT_LOCK))
3790 unlock_vg(cmd, vg_name);
3792 return _vg_make_handle(cmd, vg, failure);
3796 * vg_read: High-level volume group metadata read function.
3798 * vg_read_error() must be used on any handle returned to check for errors.
3800 * - metadata inconsistent and automatic correction failed: FAILED_INCONSISTENT
3801 * - VG is read-only: FAILED_READ_ONLY
3802 * - VG is EXPORTED, unless flags has READ_ALLOW_EXPORTED: FAILED_EXPORTED
3803 * - VG is not RESIZEABLE: FAILED_RESIZEABLE
3804 * - locking failed: FAILED_LOCKING
3806 * On failures, all locks are released, unless one of the following applies:
3807 * - vgname_is_locked(lock_name) is true
3808 * FIXME: remove the above 2 conditions if possible and make an error always
3811 * Volume groups are opened read-only unless flags contains READ_FOR_UPDATE.
3813 * Checking for VG existence:
3815 * FIXME: We want vg_read to attempt automatic recovery after acquiring a
3816 * temporary write lock: if that fails, we bail out as usual, with failed &
3817 * FAILED_INCONSISTENT. If it works, we are good to go. Code that's been in
3818 * toollib just set lock_flags to LCK_VG_WRITE and called vg_read_internal with
3821 struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name,
3822 const char *vgid, uint32_t flags)
3824 uint64_t status = UINT64_C(0);
3825 uint32_t lock_flags = LCK_VG_READ;
3827 if (flags & READ_FOR_UPDATE) {
3828 status |= EXPORTED_VG | LVM_WRITE;
3829 lock_flags = LCK_VG_WRITE;
3832 if (flags & READ_ALLOW_EXPORTED)
3833 status &= ~EXPORTED_VG;
3835 return _vg_lock_and_read(cmd, vg_name, vgid, lock_flags, status, flags);
3839 * A high-level volume group metadata reading function. Open a volume group for
3840 * later update (this means the user code can change the metadata and later
3841 * request the new metadata to be written and committed).
3843 struct volume_group *vg_read_for_update(struct cmd_context *cmd, const char *vg_name,
3844 const char *vgid, uint32_t flags)
3846 return vg_read(cmd, vg_name, vgid, flags | READ_FOR_UPDATE);
3850 * Test the validity of a VG handle returned by vg_read() or vg_read_for_update().
3852 uint32_t vg_read_error(struct volume_group *vg_handle)
3855 return FAILED_ALLOCATION;
3857 return vg_handle->read_status;
3861 * Lock a vgname and/or check for existence.
3862 * Takes a WRITE lock on the vgname before scanning.
3863 * If scanning fails or vgname found, release the lock.
3864 * NOTE: If you find the return codes confusing, you might think of this
3865 * function as similar to an open() call with O_CREAT and O_EXCL flags
3866 * (open returns fail with -EEXIST if file already exists).
3869 * FAILED_LOCKING - Cannot lock name
3870 * FAILED_EXIST - VG name already exists - cannot reserve
3871 * SUCCESS - VG name does not exist in system and WRITE lock held
3873 uint32_t vg_lock_newname(struct cmd_context *cmd, const char *vgname)
3875 if (!lock_vol(cmd, vgname, LCK_VG_WRITE)) {
3876 return FAILED_LOCKING;
3879 /* Find the vgname in the cache */
3880 /* If it's not there we must do full scan to be completely sure */
3881 if (!fmt_from_vgname(vgname, NULL, 1)) {
3882 lvmcache_label_scan(cmd, 0);
3883 if (!fmt_from_vgname(vgname, NULL, 1)) {
3884 /* Independent MDAs aren't supported under low memory */
3885 if (!cmd->independent_metadata_areas && memlock()) {
3887 * FIXME: Disallow calling this function if
3888 * memlock() is true.
3890 unlock_vg(cmd, vgname);
3891 return FAILED_LOCKING;
3893 lvmcache_label_scan(cmd, 2);
3894 if (!fmt_from_vgname(vgname, NULL, 0)) {
3895 /* vgname not found after scanning */
3901 /* Found vgname so cannot reserve. */
3902 unlock_vg(cmd, vgname);
3903 return FAILED_EXIST;
3906 void fid_add_mda(struct format_instance *fid, struct metadata_area *mda)
3908 dm_list_add(mda_is_ignored(mda) ? &fid->metadata_areas_ignored :
3909 &fid->metadata_areas_in_use, &mda->list);
3912 int fid_add_mdas(struct format_instance *fid, struct dm_list *mdas)
3914 struct metadata_area *mda, *mda_new;
3916 dm_list_iterate_items(mda, mdas) {
3917 mda_new = mda_copy(fid->fmt->cmd->mem, mda);
3920 fid_add_mda(fid, mda_new);
3926 * Copy constructor for a metadata_area.
3928 struct metadata_area *mda_copy(struct dm_pool *mem,
3929 struct metadata_area *mda)
3931 struct metadata_area *mda_new;
3933 if (!(mda_new = dm_pool_alloc(mem, sizeof(*mda_new)))) {
3934 log_error("metadata_area allocation failed");
3937 memcpy(mda_new, mda, sizeof(*mda));
3938 if (mda->ops->mda_metadata_locn_copy && mda->metadata_locn) {
3939 mda_new->metadata_locn =
3940 mda->ops->mda_metadata_locn_copy(mem, mda->metadata_locn);
3941 if (!mda_new->metadata_locn) {
3942 dm_pool_free(mem, mda_new);
3947 dm_list_init(&mda_new->list);
3952 * This function provides a way to answer the question on a format specific
3953 * basis - does the format specfic context of these two metadata areas
3956 * A metatdata_area is defined to be independent of the underlying context.
3957 * This has the benefit that we can use the same abstraction to read disks
3958 * (see _metadata_text_raw_ops) or files (see _metadata_text_file_ops).
3959 * However, one downside is there is no format-independent way to determine
3960 * whether a given metadata_area is attached to a specific device - in fact,
3961 * it may not be attached to a device at all.
3963 * Thus, LVM is structured such that an mda is not a member of struct
3964 * physical_volume. The location of the mda depends on whether
3965 * the PV is in a volume group. A PV not in a VG has an mda on the
3966 * 'info->mda' list in lvmcache, while a PV in a VG has an mda on
3967 * the vg->fid->metadata_areas_in_use list. For further details, see _vg_read(),
3968 * and the sequence of creating the format_instance with fid->metadata_areas_in_use
3969 * list, as well as the construction of the VG, with list of PVs (comes
3970 * after the construction of the fid and list of mdas).
3972 unsigned mda_locns_match(struct metadata_area *mda1, struct metadata_area *mda2)
3974 if (!mda1->ops->mda_locns_match || !mda2->ops->mda_locns_match ||
3975 mda1->ops->mda_locns_match != mda2->ops->mda_locns_match)
3978 return mda1->ops->mda_locns_match(mda1, mda2);
3981 unsigned mda_is_ignored(struct metadata_area *mda)
3983 return (mda->status & MDA_IGNORED);
3986 void mda_set_ignored(struct metadata_area *mda, unsigned mda_ignored)
3988 void *locn = mda->metadata_locn;
3989 unsigned old_mda_ignored = mda_is_ignored(mda);
3991 if (mda_ignored && !old_mda_ignored)
3992 mda->status |= MDA_IGNORED;
3993 else if (!mda_ignored && old_mda_ignored)
3994 mda->status &= ~MDA_IGNORED;
3996 return; /* No change */
3998 log_debug("%s ignored flag for mda %s at offset %" PRIu64 ".",
3999 mda_ignored ? "Setting" : "Clearing",
4000 mda->ops->mda_metadata_locn_name ? mda->ops->mda_metadata_locn_name(locn) : "",
4001 mda->ops->mda_metadata_locn_offset ? mda->ops->mda_metadata_locn_offset(locn) : UINT64_C(0));
4004 int mdas_empty_or_ignored(struct dm_list *mdas)
4006 struct metadata_area *mda;
4008 if (!dm_list_size(mdas))
4010 dm_list_iterate_items(mda, mdas) {
4011 if (mda_is_ignored(mda))
4017 int pv_change_metadataignore(struct physical_volume *pv, uint32_t mda_ignored)
4019 const char *pv_name = pv_dev_name(pv);
4021 if (mda_ignored && !pv_mda_used_count(pv)) {
4022 log_error("Metadata areas on physical volume \"%s\" already "
4023 "ignored.", pv_name);
4027 if (!mda_ignored && (pv_mda_used_count(pv) == pv_mda_count(pv))) {
4028 log_error("Metadata areas on physical volume \"%s\" already "
4029 "marked as in-use.", pv_name);
4033 if (!pv_mda_count(pv)) {
4034 log_error("Physical volume \"%s\" has no metadata "
4039 log_verbose("Marking metadata areas on physical volume \"%s\" "
4040 "as %s.", pv_name, mda_ignored ? "ignored" : "in-use");
4042 if (!pv_mda_set_ignored(pv, mda_ignored))
4046 * Update vg_mda_copies based on the mdas in this PV.
4047 * This is most likely what the user would expect - if they
4048 * specify a specific PV to be ignored/un-ignored, they will
4049 * most likely not want LVM to turn around and change the
4050 * ignore / un-ignore value when it writes the VG to disk.
4051 * This does not guarantee this PV's ignore bits will be
4052 * preserved in future operations.
4054 if (!is_orphan(pv) &&
4055 vg_mda_copies(pv->vg) != VGMETADATACOPIES_UNMANAGED) {
4056 log_warn("WARNING: Changing preferred number of copies of VG %s "
4057 "metadata from %"PRIu32" to %"PRIu32, pv_vg_name(pv),
4058 vg_mda_copies(pv->vg), vg_mda_used_count(pv->vg));
4059 vg_set_mda_copies(pv->vg, vg_mda_used_count(pv->vg));
4065 char *tags_format_and_copy(struct dm_pool *mem, const struct dm_list *tags)
4067 struct str_list *sl;
4069 if (!dm_pool_begin_object(mem, 256)) {
4070 log_error("dm_pool_begin_object failed");
4074 dm_list_iterate_items(sl, tags) {
4075 if (!dm_pool_grow_object(mem, sl->str, strlen(sl->str)) ||
4076 (sl->list.n != tags && !dm_pool_grow_object(mem, ",", 1))) {
4077 log_error("dm_pool_grow_object failed");
4082 if (!dm_pool_grow_object(mem, "\0", 1)) {
4083 log_error("dm_pool_grow_object failed");
4086 return dm_pool_end_object(mem);
4090 * pv_by_path - Given a device path return a PV handle if it is a PV
4091 * @cmd - handle to the LVM command instance
4092 * @pv_name - device path to read for the PV
4095 * NULL - device path does not contain a valid PV
4096 * non-NULL - PV handle corresponding to device path
4098 * FIXME: merge with find_pv_by_name ?
4100 struct physical_volume *pv_by_path(struct cmd_context *cmd, const char *pv_name)
4102 struct dm_list mdas;
4104 dm_list_init(&mdas);
4105 return _pv_read(cmd, cmd->mem, pv_name, &mdas, NULL, 1, 0);