// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/vmalloc.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES   4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV ((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
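
/*
 * For example, with 1GiB zones (zone_size_shift == 30) these evaluate to
 * const_ilog2(512G) == 39 and const_ilog2(4T) == 42, so the mirror log
 * pairs begin at zones 1 << (39 - 30) == 512 and 1 << (42 - 30) == 4096
 * (see sb_zone_number() below).
 */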

/*
 * Maximum supported zone size. Currently, SMR disks have a zone size of
 * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do
 * not expect the zone size to become larger than 8GiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G

static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}

static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);

	empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
	empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
	full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
	full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          0        1
	 * In use[1]        x          x        1
	 * Full[1]          0          0        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_inode->i_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
		int i;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 bytenr;

			bytenr = ((zones[i].start + zones[i].len)
				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		if (btrfs_super_generation(super[0]) >
		    btrfs_super_generation(super[1]))
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}

	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}
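
/*
 * Reading the table above: e.g. zones[0] in use and zones[1] empty is the
 * "0" cell, so the next superblock goes at zones[0].wp; once zones[0] is
 * full, writes continue in zones[1]; only when both are full must the two
 * newest superblocks be compared by generation.
 */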

/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}

static inline sector_t zone_start_sector(u32 zone_number,
					 struct block_device *bdev)
{
	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
}

static inline u64 zone_start_physical(u32 zone_number,
				      struct btrfs_zoned_device_info *zone_info)
{
	return (u64)zone_number << zone_info->zone_size_shift;
}
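
/*
 * These two helpers express the same zone boundary in 512B sectors and in
 * bytes respectively: e.g. zone 4 with 1GiB zones starts at sector 8388608
 * and at byte offset 4GiB.
 */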

/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into fixed-size chunks and fakes a conventional zone on each of them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}
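
/*
 * For instance, with an emulated zone_size of 256MiB a 1GiB regular device
 * reports four conventional zones of 524288 sectors each, with the write
 * pointer parked at the zone end since conventional zones have no write
 * pointer semantics (BLK_ZONE_COND_NOT_WP).
 */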

static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zno;
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	/* Check cache */
	if (zinfo->zone_cache) {
		unsigned int i;

		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
		zno = pos >> zinfo->zone_size_shift;
		/*
		 * We cannot report zones beyond the zone end. So, it is OK to
		 * cap *nr_zones at the zone end.
		 */
		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);

		for (i = 0; i < *nr_zones; i++) {
			struct blk_zone *zone_info;

			zone_info = &zinfo->zone_cache[zno + i];
			if (!zone_info->len)
				break;
		}

		if (i == *nr_zones) {
			/* Cache hit on all the zones */
			memcpy(zones, zinfo->zone_cache + zno,
			       sizeof(*zinfo->zone_cache) * *nr_zones);
			return 0;
		}
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err_in_rcu(device->fs_info,
				 "zoned: failed to read zone %llu on %s (devid %llu)",
				 pos, rcu_str_deref(device->name),
				 device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (!ret)
		return -EIO;

	/* Populate cache */
	if (zinfo->zone_cache)
		memcpy(zinfo->zone_cache + zno, zones,
		       sizeof(*zinfo->zone_cache) * *nr_zones);

	return 0;
}
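
/*
 * Note: the zone cache is only consulted for real zoned devices; an
 * all-zero struct blk_zone (len == 0) marks a slot that has never been
 * reported, which is why the cache scan above stops at the first
 * zero-length entry and falls back to blkdev_report_zones().
 */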

/* The emulated zone size is determined from the size of a device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret = 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	ret = 0;

out:
	btrfs_free_path(path);

	return ret;
}
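
/*
 * A device extent is one stripe of a chunk, so the emulated zone size ends
 * up equal to the stripe length of the first device extent found (commonly
 * up to 1GiB); all later zone bookkeeping is done in these fixed-size
 * units.
 */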

int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device, true);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * be set yet.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	device->zone_info = zone_info;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	/* Check if it's power of 2 (see is_power_of_2) */
	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu larger than supported maximum %llu",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	/*
	 * We limit max_zone_append_size also by max_segments * PAGE_SIZE.
	 * Technically, we can have multiple pages per segment. But, since
	 * btrfs adds the pages one by one to a bio, and btrfs cannot increase
	 * the metadata reservation even if it increases the number of extents,
	 * it is safe to stick with the limit.
	 *
	 * With zoned emulation, we can have a non-zoned device in zoned mode.
	 * In this case, we don't have a valid max zone append size, so use
	 * max_segments * PAGE_SIZE as the pseudo max_zone_append_size.
	 */
	if (bdev_is_zoned(bdev)) {
		zone_info->max_zone_append_size = min_t(u64,
			(u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
			(u64)bdev_max_segments(bdev) << PAGE_SHIFT);
	} else {
		zone_info->max_zone_append_size =
			(u64)bdev_max_segments(bdev) << PAGE_SHIFT;
	}
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Enable zone cache only for a zoned device. On a non-zoned device, we
	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
	 * use the cache.
	 */
	if (populate_cache && bdev_is_zoned(device->bdev)) {
		zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
						zone_info->nr_zones);
		if (!zone_info->zone_cache) {
			btrfs_err_in_rcu(device->fs_info,
				"zoned: failed to allocate zone cache for %s",
				rcu_str_deref(device->name));
			ret = -ENOMEM;
			goto out;
		}
	}

	/* Get zones type */
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			if (zones[i].cond == BLK_ZONE_COND_EMPTY)
				__set_bit(nreported, zone_info->empty_zones);
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		ret = btrfs_get_dev_zones(device,
					  zone_start_physical(sb_zone, zone_info),
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EIO;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kvfree(zones);

	switch (bdev_zoned_model(bdev)) {
	case BLK_ZONED_HM:
		model = "host-managed zoned";
		emulated = "";
		break;
	case BLK_ZONED_HA:
		model = "host-aware zoned";
		emulated = "";
		break;
	case BLK_ZONED_NONE:
		model = "regular";
		emulated = "emulated ";
		break;
	default:
		/* Just in case */
		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
				 bdev_zoned_model(bdev),
				 rcu_str_deref(device->name));
		ret = -EOPNOTSUPP;
		goto out_free_zone_info;
	}

	btrfs_info_in_rcu(fs_info,
		"%s block device %s, %u %szones of %llu bytes",
		model, rcu_str_deref(device->name), zone_info->nr_zones,
		emulated, zone_info->zone_size);

	return 0;

out:
	kvfree(zones);
out_free_zone_info:
	btrfs_destroy_dev_zone_info(device);

	return ret;
}
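
/*
 * Sizing example (assumed figures): a 16TiB host-managed drive with 1GiB
 * zones gives nr_zones == 16384; the two bitmaps then take 2KiB each, and
 * the optional zone cache (one 64-byte struct blk_zone per zone) about
 * 1MiB.
 */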

void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	vfree(zone_info->zone_cache);
	kfree(zone_info);
	device->zone_info = NULL;
}

int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
		       struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 zone_size = 0;
	u64 max_zone_append_size = 0;
	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
	int ret = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		enum blk_zoned_model model;

		if (!device->bdev)
			continue;

		model = bdev_zoned_model(device->bdev);
		/*
		 * A host-managed zoned device must be used as a zoned device.
		 * A host-aware zoned device and a non-zoned device can be
		 * treated as a zoned device, if the ZONED flag is enabled in
		 * the superblock.
		 */
		if (model == BLK_ZONED_HM ||
		    (model == BLK_ZONED_HA && incompat_zoned) ||
		    (model == BLK_ZONED_NONE && incompat_zoned)) {
			struct btrfs_zoned_device_info *zone_info =
				device->zone_info;

			zoned_devices++;
			if (!zone_size) {
				zone_size = zone_info->zone_size;
			} else if (zone_info->zone_size != zone_size) {
				btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
					  device->zone_info->zone_size,
					  zone_size);
				ret = -EINVAL;
				goto out;
			}
			if (!max_zone_append_size ||
			    (zone_info->max_zone_append_size &&
			     zone_info->max_zone_append_size < max_zone_append_size))
				max_zone_append_size =
					zone_info->max_zone_append_size;
		}
		nr_devices++;
	}

	if (!zoned_devices && !incompat_zoned)
		goto out;

	if (!zoned_devices && incompat_zoned) {
		/* No zoned block device found on ZONED filesystem */
		btrfs_err(fs_info,
			  "zoned: no zoned devices found on a zoned filesystem");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices && !incompat_zoned) {
		btrfs_err(fs_info,
			  "zoned: mode not enabled but zoned device found");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices != nr_devices) {
		btrfs_err(fs_info,
			  "zoned: cannot mix zoned and regular devices");
		ret = -EINVAL;
		goto out;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		ret = -EINVAL;
		goto out;
	}

	fs_info->zone_size = zone_size;
	fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size,
						   fs_info->sectorsize);
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
	if (fs_info->max_zone_append_size < fs_info->max_extent_size)
		fs_info->max_extent_size = fs_info->max_zone_append_size;

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size just above.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info);
	if (ret)
		goto out;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
	return ret;
}
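
/*
 * Example with assumed queue limits: a device advertising 127 max-append
 * sectors and 127 max segments gets a per-device limit of
 * min(127 * 512, 127 * PAGE_SIZE) == 65024 bytes; the filesystem-wide
 * value is then rounded down to the sector size (61440 for 4KiB).
 */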

int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
	 */
	if (btrfs_test_opt(info, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_test_opt(info, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	return 0;
}

static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			ASSERT(reset->cond == BLK_ZONE_COND_FULL);

			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len,
					       GFP_NOFS);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/* For READ, we want the previous one */
		if (wp == zones[0].start << SECTOR_SHIFT)
			wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;
	return 0;
}
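
/*
 * Note on the READ math above: if wp has wrapped around to the start of
 * zones[0], the most recent superblock is the last BTRFS_SUPER_INFO_SIZE
 * (4KiB) chunk of zones[1], hence the fixup to the end of zones[1] before
 * stepping back by one superblock.
 */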

int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (ret != BTRFS_NR_SB_LOG_ZONES)
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}

int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	/*
	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as a regular filesystem. Doing so, the super
	 * block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	 */
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
				  int mirror)
{
	u32 zone_num;

	if (!zinfo)
		return false;

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return false;

	if (!test_bit(zone_num, zinfo->seq_zones))
		return false;

	return true;
}

void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;

	if (!is_sb_log_zone(zinfo, mirror))
		return;

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	if (zone->cond != BLK_ZONE_COND_FULL) {
		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

		if (zone->wp == zone->start + zone->len)
			zone->cond = BLK_ZONE_COND_FULL;

		return;
	}

	zone++;
	ASSERT(zone->cond != BLK_ZONE_COND_FULL);
	if (zone->cond == BLK_ZONE_COND_EMPTY)
		zone->cond = BLK_ZONE_COND_IMP_OPEN;

	zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

	if (zone->wp == zone->start + zone->len)
		zone->cond = BLK_ZONE_COND_FULL;
}
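
/*
 * Each superblock write advances the cached write pointer by
 * BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT == 4096 / 512 == 8 sectors, so
 * e.g. a 1GiB log zone holds 262144 superblock copies before the pair
 * wraps to its sibling zone.
 */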

int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	sector_t zone_sectors;
	sector_t nr_sectors;
	u8 zone_sectors_shift;
	u32 sb_zone;
	u32 nr_zones;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
				zone_start_sector(sb_zone, bdev),
				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}

/**
 * btrfs_find_allocatable_zones - find allocatable zones within a given region
 *
 * @device:	the device to allocate a region on
 * @hole_start:	the position of the hole to allocate the region
 * @hole_end:	the end of the hole
 * @num_bytes:	size of the wanted region
 *
 * Return: position of allocatable zones
 *
 * An allocatable region must not contain any superblock locations.
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = zone_start_physical(
					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}
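
/*
 * For example, a candidate region overlapping the primary superblock log
 * (zones 0 and 1 for mirror 0) is pushed forward to start at zone
 * BTRFS_NR_SB_LOG_ZONES and rechecked, so allocations can never land on a
 * superblock log zone or a regular superblock copy.
 */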

int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	int ret;

	*bytes = 0;
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
			       GFP_NOFS);
	if (ret)
		return ret;

	*bytes = length;
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}

int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long end = (start + size) >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (end > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones are conventional */
	if (find_next_bit(zinfo->seq_zones, end, begin) == end)
		return 0;

	/* All the zones are sequential and empty */
	if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
	    find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
		return 0;

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn_in_rcu(
			device->fs_info,
		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
			rcu_str_deref(device->name), device->devid, pos >> shift);
		WARN_ON_ONCE(1);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. It points to the end
 * of the highest addressed extent in the block group as the allocation
 * offset.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	u64 length;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */
	if (!ret)
		ret = -EUCLEAN;
	if (ret < 0)
		goto out;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		if (ret == 1) {
			ret = 0;
			*offset_ret = 0;
		}
		goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (!(found_key.objectid >= cache->start &&
	      found_key.objectid + length <= cache->start + cache->length)) {
		ret = -EUCLEAN;
		goto out;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}
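
/*
 * Example: for a conventional block group starting at 10GiB whose highest
 * extent item is [10GiB + 768MiB, 4MiB], the recovered allocation offset
 * is 772MiB, and allocation resumes from there after a mount.
 */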

int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *device;
	u64 logical = cache->start;
	u64 length = cache->length;
	u64 physical = 0;
	int ret;
	int i;
	unsigned int nofs_flag;
	u64 *alloc_offsets = NULL;
	u64 last_alloc = 0;
	u32 num_sequential = 0, num_conventional = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (!IS_ALIGNED(length, fs_info->zone_size)) {
		btrfs_err(fs_info,
		"zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);
		return -EIO;
	}

	/* Get the chunk mapping */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em)
		return -EINVAL;

	map = em->map_lookup;

	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
	if (!alloc_offsets) {
		free_extent_map(em);
		return -ENOMEM;
	}

	for (i = 0; i < map->num_stripes; i++) {
		bool is_sequential;
		struct blk_zone zone;
		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
		int dev_replace_is_ongoing = 0;

		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;

		if (device->bdev == NULL) {
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		}

		is_sequential = btrfs_dev_is_sequential(device, physical);
		if (is_sequential)
			num_sequential++;
		else
			num_conventional++;

		if (!is_sequential) {
			alloc_offsets[i] = WP_CONVENTIONAL;
			continue;
		}

		/*
		 * This zone will be used for allocation, so mark this zone
		 * non-empty.
		 */
		btrfs_dev_clear_zone_empty(device, physical);

		down_read(&dev_replace->rwsem);
		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
		up_read(&dev_replace->rwsem);

		/*
		 * The group is mapped to a sequential zone. Get the zone write
		 * pointer to determine the allocation offset within the zone.
		 */
		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
		nofs_flag = memalloc_nofs_save();
		ret = btrfs_get_dev_zone(device, physical, &zone);
		memalloc_nofs_restore(nofs_flag);
		if (ret == -EIO || ret == -EOPNOTSUPP) {
			ret = 0;
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		} else if (ret) {
			goto out;
		}

		if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
			btrfs_err_in_rcu(fs_info,
	"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
				zone.start << SECTOR_SHIFT,
				rcu_str_deref(device->name), device->devid);
			ret = -EIO;
			goto out;
		}

		switch (zone.cond) {
		case BLK_ZONE_COND_OFFLINE:
		case BLK_ZONE_COND_READONLY:
			btrfs_err(fs_info,
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
				  physical >> device->zone_info->zone_size_shift,
				  rcu_str_deref(device->name), device->devid);
			alloc_offsets[i] = WP_MISSING_DEV;
			break;
		case BLK_ZONE_COND_EMPTY:
			alloc_offsets[i] = 0;
			break;
		case BLK_ZONE_COND_FULL:
			alloc_offsets[i] = fs_info->zone_size;
			break;
		default:
			/* Partially used zone */
			alloc_offsets[i] =
					((zone.wp - zone.start) << SECTOR_SHIFT);
			break;
		}
	}

	if (num_sequential > 0)
		cache->seq_zone = true;

	if (num_conventional > 0) {
		/*
		 * Avoid calling calculate_alloc_pointer() for a new BG. It is
		 * of no use for a new BG; it must always be 0.
		 *
		 * Also, we have a lock chain of extent buffer lock -> chunk
		 * mutex. For a new BG, this function is called from
		 * btrfs_make_block_group(), which already holds the chunk
		 * mutex. Thus, we cannot call calculate_alloc_pointer(), which
		 * takes extent buffer locks, without risking deadlock.
		 */
		if (new) {
			cache->alloc_offset = 0;
			goto out;
		}
		ret = calculate_alloc_pointer(cache, &last_alloc);
		if (ret || map->num_stripes == num_conventional) {
			if (!ret)
				cache->alloc_offset = last_alloc;
			else
				btrfs_err(fs_info,
			"zoned: failed to determine allocation offset of bg %llu",
					  cache->start);
			goto out;
		}
	}

	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
	case 0: /* single */
		if (alloc_offsets[0] == WP_MISSING_DEV) {
			btrfs_err(fs_info,
			"zoned: cannot recover write pointer for zone %llu",
				  physical);
			ret = -EIO;
			goto out;
		}
		cache->alloc_offset = alloc_offsets[0];
		break;
	case BTRFS_BLOCK_GROUP_DUP:
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID0:
	case BTRFS_BLOCK_GROUP_RAID10:
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
		/* non-single profiles are not supported yet */
	default:
		btrfs_err(fs_info, "zoned: profile %s not yet supported",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
		goto out;
	}

out:
	if (cache->alloc_offset > fs_info->zone_size) {
		btrfs_err(fs_info,
			"zoned: invalid write pointer %llu in block group %llu",
			cache->alloc_offset, cache->start);
		ret = -EIO;
	}

	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		btrfs_err(fs_info,
			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
			  logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

	if (!ret)
		cache->meta_write_pointer = cache->alloc_offset + cache->start;

	kfree(alloc_offsets);
	free_extent_map(em);

	return ret;
}

void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
	u64 unusable, free;

	if (!btrfs_is_zoned(cache->fs_info))
		return;

	WARN_ON(cache->bytes_super != 0);
	unusable = cache->alloc_offset - cache->used;
	free = cache->length - cache->alloc_offset;

	/* We only need ->free_space in ALLOC_SEQ block groups */
	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->free_space_ctl->free_space = free;
	cache->zone_unusable = unusable;

	/* Should not have any excluded extents. Just in case, though */
	btrfs_free_excluded_extents(cache);
}
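
/*
 * Worked example: length == 1GiB, alloc_offset == 512MiB and used ==
 * 300MiB gives zone_unusable == 212MiB and free_space == 512MiB; bytes
 * freed behind the write pointer stay unusable until the zone is reclaimed
 * and reset.
 */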

void btrfs_redirty_list_add(struct btrfs_transaction *trans,
			    struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	if (!btrfs_is_zoned(fs_info) ||
	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
	    !list_empty(&eb->release_list))
		return;

	set_extent_buffer_dirty(eb);
	set_extent_bits_nowait(&trans->dirty_pages, eb->start,
			       eb->start + eb->len - 1, EXTENT_DIRTY);
	memzero_extent_buffer(eb, 0, eb->len);
	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);

	spin_lock(&trans->releasing_ebs_lock);
	list_add_tail(&eb->release_list, &trans->releasing_ebs);
	spin_unlock(&trans->releasing_ebs_lock);
	atomic_inc(&eb->refs);
}

void btrfs_free_redirty_list(struct btrfs_transaction *trans)
{
	spin_lock(&trans->releasing_ebs_lock);
	while (!list_empty(&trans->releasing_ebs)) {
		struct extent_buffer *eb;

		eb = list_first_entry(&trans->releasing_ebs,
				      struct extent_buffer, release_list);
		list_del_init(&eb->release_list);
		free_extent_buffer(eb);
	}
	spin_unlock(&trans->releasing_ebs_lock);
}

bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_group *cache;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!is_data_inode(&inode->vfs_inode))
		return false;

	/*
	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
	 * extent layout the relocation code has.
	 * Furthermore we have set aside our own block group from which only
	 * the relocation "process" can allocate, and we make sure only one
	 * process at a time can add pages to an extent that gets relocated,
	 * so it's safe to use regular REQ_OP_WRITE for this special case.
	 */
	if (btrfs_is_data_reloc_root(inode->root))
		return false;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);
	if (!cache)
		return false;

	ret = cache->seq_zone;
	btrfs_put_block_group(cache);

	return ret;
}

void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
				 struct bio *bio)
{
	struct btrfs_ordered_extent *ordered;
	const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	if (bio_op(bio) != REQ_OP_ZONE_APPEND)
		return;

	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
	if (WARN_ON(!ordered))
		return;

	ordered->physical = physical;
	ordered->bdev = bio->bi_bdev;

	btrfs_put_ordered_extent(ordered);
}

void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_ordered_sum *sum;
	u64 orig_logical = ordered->disk_bytenr;
	u64 *logical = NULL;
	int nr, stripe_len;

	/* Zoned devices should not have partitions. So, we can assume it is 0 */
	ASSERT(!bdev_is_partition(ordered->bdev));
	if (WARN_ON(!ordered->bdev))
		return;

	if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
				     ordered->physical, &logical, &nr,
				     &stripe_len)))
		goto out;

	WARN_ON(nr != 1);

	if (orig_logical == *logical)
		goto out;

	ordered->disk_bytenr = *logical;

	em_tree = &inode->extent_tree;
	write_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, ordered->file_offset,
				   ordered->num_bytes);
	em->block_start = *logical;
	free_extent_map(em);
	write_unlock(&em_tree->lock);

	list_for_each_entry(sum, &ordered->list, list) {
		if (*logical < orig_logical)
			sum->bytenr -= orig_logical - *logical;
		else
			sum->bytenr += *logical - orig_logical;
	}

out:
	kfree(logical);
}
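
/*
 * This is the flip side of REQ_OP_ZONE_APPEND: the device, not the
 * filesystem, picks the final LBA, so once the bio completes the ordered
 * extent and its checksum records are shifted from the logical address
 * reserved at submit time to where the append actually landed.
 */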

bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb,
				    struct btrfs_block_group **cache_ret)
{
	struct btrfs_block_group *cache;
	bool ret = true;

	if (!btrfs_is_zoned(fs_info))
		return true;

	cache = *cache_ret;

	if (cache && (eb->start < cache->start ||
		      cache->start + cache->length <= eb->start)) {
		btrfs_put_block_group(cache);
		cache = NULL;
		*cache_ret = NULL;
	}

	if (!cache)
		cache = btrfs_lookup_block_group(fs_info, eb->start);

	if (cache) {
		if (cache->meta_write_pointer != eb->start) {
			btrfs_put_block_group(cache);
			cache = NULL;
			ret = false;
		} else {
			cache->meta_write_pointer = eb->start + eb->len;
		}

		*cache_ret = cache;
	}

	return ret;
}

void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
				     struct extent_buffer *eb)
{
	if (!btrfs_is_zoned(eb->fs_info) || !cache)
		return;

	ASSERT(cache->meta_write_pointer == eb->start + eb->len);
	cache->meta_write_pointer = eb->start;
}

int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
{
	if (!btrfs_dev_is_sequential(device, physical))
		return -EOPNOTSUPP;

	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
}

static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
			  struct blk_zone *zone)
{
	struct btrfs_io_context *bioc = NULL;
	u64 mapped_length = PAGE_SIZE;
	unsigned int nofs_flag;
	int nmirrors;
	int i, ret;

	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			       &mapped_length, &bioc);
	if (ret || !bioc || mapped_length < PAGE_SIZE) {
		ret = -EIO;
		goto out_put_bioc;
	}

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EINVAL;
		goto out_put_bioc;
	}

	nofs_flag = memalloc_nofs_save();
	nmirrors = (int)bioc->num_stripes;
	for (i = 0; i < nmirrors; i++) {
		u64 physical = bioc->stripes[i].physical;
		struct btrfs_device *dev = bioc->stripes[i].dev;

		/* Missing device */
		if (!dev->bdev)
			continue;

		ret = btrfs_get_dev_zone(dev, physical, zone);
		/* Failing device */
		if (ret == -EIO || ret == -EOPNOTSUPP)
			continue;
		break;
	}
	memalloc_nofs_restore(nofs_flag);
out_put_bioc:
	btrfs_put_bioc(bioc);
	return ret;
}

/*
 * Synchronize the write pointer in a zone at @physical_start on @tgt_dev,
 * by filling zeros from @physical_pos up to the write pointer of the
 * dev-replace source device.
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				  u64 physical_start, u64 physical_pos)
{
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;
	u64 wp;
	u64 length;
	int ret;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
		return 0;

	ret = read_zone_info(fs_info, logical, &zone);
	if (ret)
		return ret;

	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;

	if (physical_pos > wp)
		return -EUCLEAN;

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}

struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
					    u64 logical, u64 length)
{
	struct btrfs_device *device;
	struct extent_map *em;
	struct map_lookup *map;

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return ERR_CAST(em);

	map = em->map_lookup;
	/* We only support single profile for now */
	ASSERT(map->num_stripes == 1);
	device = map->stripes[0].dev;

	free_extent_map(em);

	return device;
}

void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->relocation_bg_lock);
	if (fs_info->data_reloc_bg == bg->start)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
}

void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	if (!btrfs_is_zoned(fs_info))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->zone_info) {
			vfree(device->zone_info->zone_cache);
			device->zone_info->zone_cache = NULL;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 length)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	/* This function should be called on a previous data relocation block group. */
	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));

	spin_lock(&block_group->lock);
	if (!block_group->zoned_data_reloc_ongoing)
		goto out;

	/* All relocation extents are written. */
	if (block_group->start + block_group->alloc_offset == logical + length) {
		/* Now, release this block group for further allocations. */
		block_group->zoned_data_reloc_ongoing = 0;
	}

out:
	spin_unlock(&block_group->lock);
	btrfs_put_block_group(block_group);
}