md: When assembling the array, consult the superblock of the freshest device

author Alex Lyakas <alex.lyakas@zadara.com>

Wed, 13 Dec 2023 12:24:31 +0000 (14:24 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 5 Feb 2024 20:14:24 +0000 (20:14 +0000)
author Alex Lyakas <alex.lyakas@zadara.com>
Wed, 13 Dec 2023 12:24:31 +0000 (14:24 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 5 Feb 2024 20:14:24 +0000 (20:14 +0000)
diff --git a/drivers/md/md.c b/drivers/md/md.c

index 8c40c1c..dccf270 100644 (file)
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1122,6 +1122,7 @@ struct super_type  {
                                           struct md_rdev *refdev,
                                           int minor_version);
         int                 (*validate_super)(struct mddev *mddev,
+                                             struct md_rdev *freshest,
                                               struct md_rdev *rdev);
         void                (*sync_super)(struct mddev *mddev,
                                           struct md_rdev *rdev);
@@ -1259,8 +1260,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
  
  /*
   * validate_super for 0.90.0
+ * note: we are not using "freshest" for 0.9 superblock
   */
-static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
  {
         mdp_disk_t *desc;
         mdp_super_t *sb = page_address(rdev->sb_page);
@@ -1772,7 +1774,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
         return ret;
  }
  
-static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
  {
         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
         __u64 ev1 = le64_to_cpu(sb->events);
@@ -1868,13 +1870,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
                 }
         } else if (mddev->pers == NULL) {
                 /* Insist of good event counter while assembling, except for
-                * spares (which don't need an event count) */
-               ++ev1;
+                * spares (which don't need an event count).
+                * Similar to mdadm, we allow event counter difference of 1
+                * from the freshest device.
+                */
                 if (rdev->desc_nr >= 0 &&
                     rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
                     (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
                      le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
-                       if (ev1 < mddev->events)
+                       if (ev1 + 1 < mddev->events)
                                 return -EINVAL;
         } else if (mddev->bitmap) {
                 /* If adding to array with a bitmap, then we can accept an
@@ -1895,8 +1899,38 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
                     rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
                         role = MD_DISK_ROLE_SPARE;
                         rdev->desc_nr = -1;
-               } else
+               } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
+                       /*
+                        * If we are assembling, and our event counter is smaller than the
+                        * highest event counter, we cannot trust our superblock about the role.
+                        * It could happen that our rdev was marked as Faulty, and all other
+                        * superblocks were updated with +1 event counter.
+                        * Then, before the next superblock update, which typically happens when
+                        * remove_and_add_spares() removes the device from the array, there was
+                        * a crash or reboot.
+                        * If we allow current rdev without consulting the freshest superblock,
+                        * we could cause data corruption.
+                        * Note that in this case our event counter is smaller by 1 than the
+                        * highest, otherwise, this rdev would not be allowed into array;
+                        * both kernel and mdadm allow event counter difference of 1.
+                        */
+                       struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
+                       u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
+
+                       if (rdev->desc_nr >= freshest_max_dev) {
+                               /* this is unexpected, better not proceed */
+                               pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
+                                               mdname(mddev), rdev->bdev, rdev->desc_nr,
+                                               freshest->bdev, freshest_max_dev);
+                               return -EUCLEAN;
+                       }
+
+                       role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
+                       pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
+                                    mdname(mddev), rdev->bdev, role, role, freshest->bdev);
+               } else {
                         role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+               }
                 switch(role) {
                 case MD_DISK_ROLE_SPARE: /* spare */
                         break;
@@ -2804,7 +2838,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
                  * and should be added immediately.
                  */
                 super_types[mddev->major_version].
-                       validate_super(mddev, rdev);
+                       validate_super(mddev, NULL/*freshest*/, rdev);
                 if (add_journal)
                         mddev_suspend(mddev);
                 err = mddev->pers->hot_add_disk(mddev, rdev);
@@ -3742,7 +3776,7 @@ static int analyze_sbs(struct mddev *mddev)
         }
  
         super_types[mddev->major_version].
-               validate_super(mddev, freshest);
+               validate_super(mddev, NULL/*freshest*/, freshest);
  
         i = 0;
         rdev_for_each_safe(rdev, tmp, mddev) {
@@ -3757,7 +3791,7 @@ static int analyze_sbs(struct mddev *mddev)
                 }
                 if (rdev != freshest) {
                         if (super_types[mddev->major_version].
-                           validate_super(mddev, rdev)) {
+                           validate_super(mddev, freshest, rdev)) {
                                 pr_warn("md: kicking non-fresh %pg from array!\n",
                                         rdev->bdev);
                                 md_kick_rdev_from_array(rdev);
@@ -6809,7 +6843,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
                         rdev->saved_raid_disk = rdev->raid_disk;
                 } else
                         super_types[mddev->major_version].
-                               validate_super(mddev, rdev);
+                               validate_super(mddev, NULL/*freshest*/, rdev);
                 if ((info->state & (1<<MD_DISK_SYNC)) &&
                      rdev->raid_disk != info->raid_disk) {
                         /* This was a hot-add request, but events doesn't
author	Alex Lyakas <alex.lyakas@zadara.com>
	Wed, 13 Dec 2023 12:24:31 +0000 (14:24 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 5 Feb 2024 20:14:24 +0000 (20:14 +0000)