f2fs: Check write pointer consistency of open zones
author Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Mon, 9 Dec 2019 10:44:44 +0000 (19:44 +0900)
committer Jaegeuk Kim <jaegeuk@kernel.org>
Wed, 15 Jan 2020 21:42:14 +0000 (13:42 -0800)
On sudden f2fs shutdown, the write pointers of zoned block devices can
advance further than f2fs metadata records: the metadata keeps the current
segments at their positions before the last write operations. After
remounting f2fs, this inconsistency causes writes to be issued at positions
other than the write pointers, and an "Unaligned write command" error is
reported.

To avoid the error, during the mount operation compare each current segment
with the write pointer of the open zone that the current segment points to.
If the write pointer position is not aligned with the current segment
position, assign a new zone to the current segment. Also check that the
newly assigned zone has its write pointer at the zone start. If not, reset
the write pointer of the zone.

Perform the consistency check during fsync recovery. To avoid losing the
fsync data, do the check after the fsync data is restored and before the
checkpoint commit, which flushes data at the current segment positions. To
avoid conflicts with the kworker's dirty data/node flush, do the fix under
SBI_POR_DOING protection.

Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
fs/f2fs/f2fs.h
fs/f2fs/recovery.c
fs/f2fs/segment.c

index 5a888a0..002c417 100644 (file)
@@ -3155,6 +3155,7 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
 int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
                        unsigned int val, int alloc);
 void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi);
 int f2fs_build_segment_manager(struct f2fs_sb_info *sbi);
 void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
 int __init f2fs_create_segment_manager_caches(void);
index 76477f7..763d5c0 100644 (file)
@@ -723,6 +723,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
        int ret = 0;
        unsigned long s_flags = sbi->sb->s_flags;
        bool need_writecp = false;
+       bool fix_curseg_write_pointer = false;
 #ifdef CONFIG_QUOTA
        int quota_enabled;
 #endif
@@ -774,6 +775,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
                sbi->sb->s_flags = s_flags;
        }
 skip:
+       fix_curseg_write_pointer = !check_only || list_empty(&inode_list);
+
        destroy_fsync_dnodes(&inode_list, err);
        destroy_fsync_dnodes(&tmp_inode_list, err);
 
@@ -784,9 +787,22 @@ skip:
        if (err) {
                truncate_inode_pages_final(NODE_MAPPING(sbi));
                truncate_inode_pages_final(META_MAPPING(sbi));
-       } else {
-               clear_sbi_flag(sbi, SBI_POR_DOING);
        }
+
+       /*
+        * If fsync data succeeds or there is no fsync data to recover,
+        * and the f2fs is not read only, check and fix zoned block devices'
+        * write pointer consistency.
+        */
+       if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) &&
+                       f2fs_sb_has_blkzoned(sbi)) {
+               err = f2fs_fix_curseg_write_pointer(sbi);
+               ret = err;
+       }
+
+       if (!err)
+               clear_sbi_flag(sbi, SBI_POR_DOING);
+
        mutex_unlock(&sbi->cp_mutex);
 
        /* let's drop all the directory inodes for clean checkpoint */
index 56e8144..5c30187 100644 (file)
@@ -4368,6 +4368,137 @@ out:
        return 0;
 }
 
+#ifdef CONFIG_BLK_DEV_ZONED
+
+static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi,
+                                                 block_t zone_blkaddr)
+{
+       int i;
+
+       for (i = 0; i < sbi->s_ndevs; i++) {
+               if (!bdev_is_zoned(FDEV(i).bdev))
+                       continue;
+               if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr &&
+                               zone_blkaddr <= FDEV(i).end_blk))
+                       return &FDEV(i);
+       }
+
+       return NULL;
+}
+
+static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx,
+                             void *data) {
+       memcpy(data, zone, sizeof(struct blk_zone));
+       return 0;
+}
+
+static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
+{
+       struct curseg_info *cs = CURSEG_I(sbi, type);
+       struct f2fs_dev_info *zbd;
+       struct blk_zone zone;
+       unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off;
+       block_t cs_zone_block, wp_block;
+       unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
+       sector_t zone_sector;
+       int err;
+
+       cs_section = GET_SEC_FROM_SEG(sbi, cs->segno);
+       cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section));
+
+       zbd = get_target_zoned_dev(sbi, cs_zone_block);
+       if (!zbd)
+               return 0;
+
+       /* report zone for the sector the curseg points to */
+       zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
+               << log_sectors_per_block;
+       err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
+                                 report_one_zone_cb, &zone);
+       if (err != 1) {
+               f2fs_err(sbi, "Report zone failed: %s errno=(%d)",
+                        zbd->path, err);
+               return err;
+       }
+
+       if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+               return 0;
+
+       wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
+       wp_segno = GET_SEGNO(sbi, wp_block);
+       wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
+       wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
+
+       if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
+               wp_sector_off == 0)
+               return 0;
+
+       f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
+                   "curseg[0x%x,0x%x] wp[0x%x,0x%x]",
+                   type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff);
+
+       f2fs_notice(sbi, "Assign new section to curseg[%d]: "
+                   "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
+       allocate_segment_by_default(sbi, type, true);
+
+       /* check newly assigned zone */
+       cs_section = GET_SEC_FROM_SEG(sbi, cs->segno);
+       cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section));
+
+       zbd = get_target_zoned_dev(sbi, cs_zone_block);
+       if (!zbd)
+               return 0;
+
+       zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
+               << log_sectors_per_block;
+       err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
+                                 report_one_zone_cb, &zone);
+       if (err != 1) {
+               f2fs_err(sbi, "Report zone failed: %s errno=(%d)",
+                        zbd->path, err);
+               return err;
+       }
+
+       if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+               return 0;
+
+       if (zone.wp != zone.start) {
+               f2fs_notice(sbi,
+                           "New zone for curseg[%d] is not yet discarded. "
+                           "Reset the zone: curseg[0x%x,0x%x]",
+                           type, cs->segno, cs->next_blkoff);
+               err = __f2fs_issue_discard_zone(sbi, zbd->bdev,
+                               zone_sector >> log_sectors_per_block,
+                               zone.len >> log_sectors_per_block);
+               if (err) {
+                       f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
+                                zbd->path, err);
+                       return err;
+               }
+       }
+
+       return 0;
+}
+
+int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
+{
+       int i, ret;
+
+       for (i = 0; i < NO_CHECK_TYPE; i++) {
+               ret = fix_curseg_write_pointer(sbi, i);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+#else
+int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
+{
+       return 0;
+}
+#endif
+
 /*
  * Update min, max modified time for cost-benefit GC algorithm
  */