btrfs: zoned: finish fully written block group
author Naohiro Aota <naohiro.aota@wdc.com>
Thu, 19 Aug 2021 12:19:23 +0000 (21:19 +0900)
committer David Sterba <dsterba@suse.com>
Tue, 26 Oct 2021 17:08:00 +0000 (19:08 +0200)
Once we have written up to the zone capacity, the device automatically
deactivates the zone. Sync the block group side of that state (the active
BG list and the zone_is_active flag) up with the device.

We need to do this for both data and metadata block groups. On the data
side, we add a hook to btrfs_finish_ordered_io(). On the metadata side, we
use end_extent_buffer_writeback().

To avoid excess block group lookups, we mark the last extent buffer in a
block group with the EXTENT_BUFFER_ZONE_FINISH flag. This cannot be done
for data (ordered_extent), because the logical address may change due to
REQ_OP_ZONE_APPEND.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/inode.c
fs/btrfs/zoned.c
fs/btrfs/zoned.h

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fdc066c..5ad749e 100644
@@ -4159,6 +4159,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
 
 static void end_extent_buffer_writeback(struct extent_buffer *eb)
 {
+       if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
+               btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
+
        clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
        smp_mb__after_atomic();
        wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
@@ -4760,8 +4763,13 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
                free_extent_buffer(eb);
                return ret;
        }
-       if (cache)
+       if (cache) {
+               /* A valid cache implies a write in zoned mode */
+               /* Mark the last eb in the block group */
+               if (cache->seq_zone && eb->start + eb->len == cache->start + cache->zone_capacity)
+                       set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
                btrfs_put_block_group(cache);
+       }
        ret = write_one_eb(eb, wbc, epd);
        free_extent_buffer(eb);
        if (ret < 0)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 53abdc2..9f3e0a4 100644
@@ -32,6 +32,7 @@ enum {
        /* write IO error */
        EXTENT_BUFFER_WRITE_ERR,
        EXTENT_BUFFER_NO_CHECK,
+       EXTENT_BUFFER_ZONE_FINISH,
 };
 
 /* these are flags for __process_pages_contig */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 487533c..10efab2 100644
@@ -3010,8 +3010,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                goto out;
        }
 
-       if (ordered_extent->bdev)
+       /* A valid bdev implies a write on a sequential zone */
+       if (ordered_extent->bdev) {
                btrfs_rewrite_logical_zoned(ordered_extent);
+               btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+                                       ordered_extent->disk_num_bytes);
+       }
 
        btrfs_free_io_failure_record(inode, start, end);
 
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 7980694..28a06c2 100644
@@ -1904,3 +1904,53 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index
 
        return ret;
 }
+
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+{
+       struct btrfs_block_group *block_group;
+       struct map_lookup *map;
+       struct btrfs_device *device;
+       u64 physical;
+
+       if (!btrfs_is_zoned(fs_info))
+               return;
+
+       block_group = btrfs_lookup_block_group(fs_info, logical);
+       ASSERT(block_group);
+
+       if (logical + length < block_group->start + block_group->zone_capacity)
+               goto out;
+
+       spin_lock(&block_group->lock);
+
+       if (!block_group->zone_is_active) {
+               spin_unlock(&block_group->lock);
+               goto out;
+       }
+
+       block_group->zone_is_active = 0;
+       /* We should have consumed all the free space */
+       ASSERT(block_group->alloc_offset == block_group->zone_capacity);
+       ASSERT(block_group->free_space_ctl->free_space == 0);
+       btrfs_clear_treelog_bg(block_group);
+       spin_unlock(&block_group->lock);
+
+       map = block_group->physical_map;
+       device = map->stripes[0].dev;
+       physical = map->stripes[0].physical;
+
+       if (!device->zone_info->max_active_zones)
+               goto out;
+
+       btrfs_dev_clear_active_zone(device, physical);
+
+       spin_lock(&fs_info->zone_active_bgs_lock);
+       ASSERT(!list_empty(&block_group->active_bg_list));
+       list_del_init(&block_group->active_bg_list);
+       spin_unlock(&fs_info->zone_active_bgs_lock);
+
+       btrfs_put_block_group(block_group);
+
+out:
+       btrfs_put_block_group(block_group);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index ade6588..9c51240 100644
@@ -73,6 +73,8 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group);
 int btrfs_zone_finish(struct btrfs_block_group *block_group);
 bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
                             int raid_index);
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+                            u64 length);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
                                     struct blk_zone *zone)
@@ -224,6 +226,9 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
        return true;
 }
 
+static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+                                          u64 logical, u64 length) { }
+
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)