fs/gfs2/bmap.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   4  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
   5  */
   6
   7 #include <linux/spinlock.h>
   8 #include <linux/completion.h>
   9 #include <linux/buffer_head.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/gfs2_ondisk.h>
  12 #include <linux/crc32.h>
  13 #include <linux/iomap.h>
  14 #include <linux/ktime.h>
  15
  16 #include "gfs2.h"
  17 #include "incore.h"
  18 #include "bmap.h"
  19 #include "glock.h"
  20 #include "inode.h"
  21 #include "meta_io.h"
  22 #include "quota.h"
  23 #include "rgrp.h"
  24 #include "log.h"
  25 #include "super.h"
  26 #include "trans.h"
  27 #include "dir.h"
  28 #include "util.h"
  29 #include "aops.h"
  30 #include "trace_gfs2.h"
  31
  32 /* Path through an inode's metadata tree, one slot per height.
  33  *
  34  * mp_list[] entries index pointers within a single block.  The maximum
  35  * number of 64 bit pointers in a 4k block is 512, so __u16 is wide enough
     * for them; keeping the type small saves stack space.
     */
  36 struct metapath {
  37         struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; /* buffer at each height; [0] is the dinode buffer */
  38         __u16 mp_list[GFS2_MAX_META_HEIGHT]; /* pointer index to follow at each height */
  39         int mp_fheight; /* find_metapath height */
  40         int mp_aheight; /* actual height (lookup height) */
  41 };
  42
  43 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length); /* forward declaration; the body is not part of this section */
  44
  45 /**
  46  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  47  * @ip: the inode
  48  * @dibh: the dinode buffer
  49  * @block: the block number that was allocated
  50  * @page: The (optional) page. This is looked up if @page is NULL
  51  *
  52  * Returns: errno
  53  */
  54
  55 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
  56                                u64 block, struct page *page)
  57 {
  58         struct inode *inode = &ip->i_inode;
  59
  60         if (!PageUptodate(page)) {
  61                 void *kaddr = kmap(page);
  62                 u64 dsize = i_size_read(inode);
  63
  64                 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
  65                 memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
  66                 kunmap(page);
  67
  68                 SetPageUptodate(page);
  69         }
  70
  71         if (gfs2_is_jdata(ip)) {
  72                 struct buffer_head *bh;
  73
  74                 if (!page_has_buffers(page))
  75                         create_empty_buffers(page, BIT(inode->i_blkbits),
  76                                              BIT(BH_Uptodate));
  77
  78                 bh = page_buffers(page);
  79                 if (!buffer_mapped(bh))
  80                         map_bh(bh, inode->i_sb, block);
  81
  82                 set_buffer_uptodate(bh);
  83                 gfs2_trans_add_data(ip->i_gl, bh);
  84         } else {
  85                 set_page_dirty(page);
  86                 gfs2_ordered_add_inode(ip);
  87         }
  88
  89         return 0;
  90 }
  91
  92 static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
  93 {
  94         struct buffer_head *bh, *dibh;
  95         struct gfs2_dinode *di;
  96         u64 block = 0;
  97         int isdir = gfs2_is_dir(ip);
  98         int error;
  99
 100         error = gfs2_meta_inode_buffer(ip, &dibh);
 101         if (error)
 102                 return error;
 103
 104         if (i_size_read(&ip->i_inode)) {
 105                 /* Get a free block, fill it with the stuffed data,
 106                    and write it out to disk */
 107
 108                 unsigned int n = 1;
 109                 error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 110                 if (error)
 111                         goto out_brelse;
 112                 if (isdir) {
 113                         gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
 114                         error = gfs2_dir_get_new_buffer(ip, block, &bh);
 115                         if (error)
 116                                 goto out_brelse;
 117                         gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 118                                               dibh, sizeof(struct gfs2_dinode));
 119                         brelse(bh);
 120                 } else {
 121                         error = gfs2_unstuffer_page(ip, dibh, block, page);
 122                         if (error)
 123                                 goto out_brelse;
 124                 }
 125         }
 126
 127         /*  Set up the pointer to the new block  */
 128
 129         gfs2_trans_add_meta(ip->i_gl, dibh);
 130         di = (struct gfs2_dinode *)dibh->b_data;
 131         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 132
 133         if (i_size_read(&ip->i_inode)) {
 134                 *(__be64 *)(di + 1) = cpu_to_be64(block);
 135                 gfs2_add_inode_blocks(&ip->i_inode, 1);
 136                 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 137         }
 138
 139         ip->i_height = 1;
 140         di->di_height = cpu_to_be16(1);
 141
 142 out_brelse:
 143         brelse(dibh);
 144         return error;
 145 }
 146
 147 /**
 148  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 149  * @ip: The GFS2 inode to unstuff
 150  *
 151  * This routine unstuffs a dinode and returns it to a "normal" state such
 152  * that the height can be grown in the traditional way.
 153  *
 154  * Returns: errno
 155  */
 156
 157 int gfs2_unstuff_dinode(struct gfs2_inode *ip)
 158 {
 159         struct inode *inode = &ip->i_inode;
 160         struct page *page;
 161         int error;
 162
 163         down_write(&ip->i_rw_mutex);
 164         page = grab_cache_page(inode->i_mapping, 0);
 165         error = -ENOMEM;
 166         if (!page)
 167                 goto out;
 168         error = __gfs2_unstuff_inode(ip, page);
 169         unlock_page(page);
 170         put_page(page);
 171 out:
 172         up_write(&ip->i_rw_mutex);
 173         return error;
 174 }
 175
 176 /**
 177  * find_metapath - Find path through the metadata tree
 178  * @sdp: The superblock
 179  * @block: The disk block to look up
 180  * @mp: The metapath to return the result in
 181  * @height: The pre-calculated height of the metadata tree
 182  *
 183  *   This routine returns a struct metapath structure that defines a path
 184  *   through the metadata of inode "ip" to get to block "block".
 185  *
 186  *   Example:
 187  *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 188  *   filesystem with a blocksize of 4096.
 189  *
 190  *   find_metapath() would return a struct metapath structure set to:
 191  *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 192  *
 193  *   That means that in order to get to the block containing the byte at
 194  *   offset 101342453, we would load the indirect block pointed to by pointer
 195  *   0 in the dinode.  We would then load the indirect block pointed to by
 196  *   pointer 48 in that indirect block.  We would then load the data block
 197  *   pointed to by pointer 165 in that indirect block.
 198  *
 199  *             ----------------------------------------
 200  *             | Dinode |                             |
 201  *             |        |                            4|
 202  *             |        |0 1 2 3 4 5                 9|
 203  *             |        |                            6|
 204  *             ----------------------------------------
 205  *                       |
 206  *                       |
 207  *                       V
 208  *             ----------------------------------------
 209  *             | Indirect Block                       |
 210  *             |                                     5|
 211  *             |            4 4 4 4 4 5 5            1|
 212  *             |0           5 6 7 8 9 0 1            2|
 213  *             ----------------------------------------
 214  *                                |
 215  *                                |
 216  *                                V
 217  *             ----------------------------------------
 218  *             | Indirect Block                       |
 219  *             |                         1 1 1 1 1   5|
 220  *             |                         6 6 6 6 6   1|
 221  *             |0                        3 4 5 6 7   2|
 222  *             ----------------------------------------
 223  *                                           |
 224  *                                           |
 225  *                                           V
 226  *             ----------------------------------------
 227  *             | Data block containing offset         |
 228  *             |            101342453                 |
 229  *             |                                      |
 230  *             |                                      |
 231  *             ----------------------------------------
 232  *
 233  */
 234
 235 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 236                           struct metapath *mp, unsigned int height)
 237 {
 238         unsigned int i;
 239
 240         mp->mp_fheight = height;
 241         for (i = height; i--;)
 242                 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 243 }
 244
 245 static inline unsigned int metapath_branch_start(const struct metapath *mp)
 246 {
 247         if (mp->mp_list[0] == 0)
 248                 return 2;
 249         return 1;
 250 }
 251
 252 /**
 253  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 254  * @height: The metadata height (0 = dinode)
 255  * @mp: The metapath
 256  */
 257 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
 258 {
 259         struct buffer_head *bh = mp->mp_bh[height];
 260         if (height == 0)
 261                 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
 262         return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
 263 }
 264
 265 /**
 266  * metapointer - Return pointer to start of metadata in a buffer
 267  * @height: The metadata height (0 = dinode)
 268  * @mp: The metapath
 269  *
 270  * Return a pointer to the block number of the next height of the metadata
 271  * tree given a buffer containing the pointer to the current height of the
 272  * metadata tree.
 273  */
 274
 275 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 276 {
 277         __be64 *p = metaptr1(height, mp);
 278         return p + mp->mp_list[height];
 279 }
 280
 281 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
 282 {
 283         const struct buffer_head *bh = mp->mp_bh[height];
 284         return (const __be64 *)(bh->b_data + bh->b_size);
 285 }
 286
 287 static void clone_metapath(struct metapath *clone, struct metapath *mp)
 288 {
 289         unsigned int hgt;
 290
 291         *clone = *mp;
 292         for (hgt = 0; hgt < mp->mp_aheight; hgt++)
 293                 get_bh(clone->mp_bh[hgt]);
 294 }
 295
 296 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 297 {
 298         const __be64 *t;
 299
 300         for (t = start; t < end; t++) {
 301                 struct buffer_head *rabh;
 302
 303                 if (!*t)
 304                         continue;
 305
 306                 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
 307                 if (trylock_buffer(rabh)) {
 308                         if (!buffer_uptodate(rabh)) {
 309                                 rabh->b_end_io = end_buffer_read_sync;
 310                                 submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
 311                                           REQ_PRIO, rabh);
 312                                 continue;
 313                         }
 314                         unlock_buffer(rabh);
 315                 }
 316                 brelse(rabh);
 317         }
 318 }
 319
 320 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 321                              unsigned int x, unsigned int h)
 322 {
 323         for (; x < h; x++) {
 324                 __be64 *ptr = metapointer(x, mp);
 325                 u64 dblock = be64_to_cpu(*ptr);
 326                 int ret;
 327
 328                 if (!dblock)
 329                         break;
 330                 ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
 331                 if (ret)
 332                         return ret;
 333         }
 334         mp->mp_aheight = x + 1;
 335         return 0;
 336 }
 337
 338 /**
 339  * lookup_metapath - Walk the metadata tree to a specific point
 340  * @ip: The inode
 341  * @mp: The metapath
 342  *
 343  * Assumes that the inode's buffer has already been looked up and
 344  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 345  * by find_metapath().
 346  *
 347  * If this function encounters part of the tree which has not been
 348  * allocated, it returns the current height of the tree at the point
 349  * at which it found the unallocated block. Blocks which are found are
 350  * added to the mp->mp_bh[] list.
 351  *
 352  * Returns: error
 353  */
 354
 355 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 356 {
 357         return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
 358 }
 359
 360 /**
 361  * fillup_metapath - fill up buffers for the metadata path to a specific height
 362  * @ip: The inode
 363  * @mp: The metapath
 364  * @h: The height to which it should be mapped
 365  *
 366  * Similar to lookup_metapath, but does lookups for a range of heights
 367  *
 368  * Returns: error or the number of buffers filled
 369  */
 370
 371 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 372 {
 373         unsigned int x = 0;
 374         int ret;
 375
 376         if (h) {
 377                 /* find the first buffer we need to look up. */
 378                 for (x = h - 1; x > 0; x--) {
 379                         if (mp->mp_bh[x])
 380                                 break;
 381                 }
 382         }
 383         ret = __fillup_metapath(ip, mp, x, h);
 384         if (ret)
 385                 return ret;
 386         return mp->mp_aheight - x - 1;
 387 }
 388
 389 static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
 390 {
 391         sector_t factor = 1, block = 0;
 392         int hgt;
 393
 394         for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
 395                 if (hgt < mp->mp_aheight)
 396                         block += mp->mp_list[hgt] * factor;
 397                 factor *= sdp->sd_inptrs;
 398         }
 399         return block;
 400 }
 401
 402 static void release_metapath(struct metapath *mp)
 403 {
 404         int i;
 405
 406         for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
 407                 if (mp->mp_bh[i] == NULL)
 408                         break;
 409                 brelse(mp->mp_bh[i]);
 410                 mp->mp_bh[i] = NULL;
 411         }
 412 }
 413
 414 /**
 415  * gfs2_extent_length - Returns length of an extent of blocks
 416  * @bh: The metadata block
 417  * @ptr: Current position in @bh
 418  * @limit: Max extent length to return
 419  * @eob: Set to 1 if we hit "end of block"
 420  *
 421  * Returns: The length of the extent (minimum of one block)
 422  */
 423
 424 static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
 425 {
 426         const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
 427         const __be64 *first = ptr;
 428         u64 d = be64_to_cpu(*ptr);
 429
 430         *eob = 0;
 431         do {
 432                 ptr++;
 433                 if (ptr >= end)
 434                         break;
 435                 d++;
 436         } while(be64_to_cpu(*ptr) == d);
 437         if (ptr >= end)
 438                 *eob = 1;
 439         return ptr - first;
 440 }
 441
 442 enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE }; /* verdict a walker hands back to gfs2_walk_metadata() */
 443
 444 /*
 445  * gfs2_metadata_walker - walk an indirect block
 446  * @mp: Metapath to indirect block
 447  * @ptrs: Number of pointers to look at
 448  *
      * The return value tells gfs2_walk_metadata() how to proceed:
      *   WALK_STOP     - end the walk; gfs2_walk_metadata() returns 1
      *   WALK_FOLLOW   - descend into the indirect block selected in @mp
      *   WALK_CONTINUE - nothing of interest here; move on in the tree
      *
 449  * When returning WALK_FOLLOW, the walker must update @mp to point at the right
 450  * indirect block to follow.
 451  */
 452 typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
 453                                                    unsigned int ptrs);
 454
 455 /*
 456  * gfs2_walk_metadata - walk a tree of indirect blocks
 457  * @inode: The inode
 458  * @mp: Starting point of walk
 459  * @max_len: Maximum number of blocks to walk
 460  * @walker: Called during the walk
 461  *
      * @mp is modified during the walk: buffers above the current position are
      * released and new ones are read in as the walk moves through the tree.
      *
 462  * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
 463  * past the end of metadata, and a negative error code otherwise.
 464  */
 465
 466 static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
 467                 u64 max_len, gfs2_metadata_walker walker)
 468 {
 469         struct gfs2_inode *ip = GFS2_I(inode);
 470         struct gfs2_sbd *sdp = GFS2_SB(inode);
 471         u64 factor = 1; /* number of blocks covered by one pointer at height hgt */
 472         unsigned int hgt;
 473         int ret;
 474
 475         /*
 476          * The walk starts in the lowest allocated indirect block, which may be
 477          * before the position indicated by @mp.  Adjust @max_len accordingly
 478          * to avoid a short walk.
 479          */
 480         for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
 481                 max_len += mp->mp_list[hgt] * factor;
 482                 mp->mp_list[hgt] = 0;
 483                 factor *= sdp->sd_inptrs;
 484         }
             /*
              * hgt is now mp->mp_aheight - 1, the height of the lowest buffer
              * present in @mp (assuming 1 <= mp_aheight <= mp_fheight).
              */
 485
 486         for (;;) {
 487                 u16 start = mp->mp_list[hgt]; /* pointer index this step begins at */
 488                 enum walker_status status;
 489                 unsigned int ptrs;
 490                 u64 len;
 491
 492                 /* Walk indirect block. */
                     /* Height 0 is the dinode (sd_diptrs pointers); others have sd_inptrs. */
 493                 ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
 494                 len = ptrs * factor;
                     /* Only show the walker as many pointers as @max_len still allows. */
 495                 if (len > max_len)
 496                         ptrs = DIV_ROUND_UP_ULL(max_len, factor);
 497                 status = walker(mp, ptrs);
 498                 switch (status) {
 499                 case WALK_STOP:
 500                         return 1;
 501                 case WALK_FOLLOW:
                             /* There is nothing to descend into at the bottom height. */
 502                         BUG_ON(mp->mp_aheight == mp->mp_fheight);
                             /* Only the pointers the walker skipped count as walked. */
 503                         ptrs = mp->mp_list[hgt] - start;
 504                         len = ptrs * factor;
 505                         break;
 506                 case WALK_CONTINUE:
                             /* len keeps the value computed above: the rest of this block. */
 507                         break;
 508                 }
                     /* Went past @max_len: leave the loop and return 0. */
 509                 if (len >= max_len)
 510                         break;
 511                 max_len -= len;
 512                 if (status == WALK_FOLLOW)
 513                         goto fill_up_metapath;
 514
 515 lower_metapath:
 516                 /* Decrease height of metapath. */
 517                 brelse(mp->mp_bh[hgt]);
 518                 mp->mp_bh[hgt] = NULL;
 519                 mp->mp_list[hgt] = 0;
                     /* The dinode itself was exhausted: end of metadata. */
 520                 if (!hgt)
 521                         break;
 522                 hgt--;
 523                 factor *= sdp->sd_inptrs;
 524
 525                 /* Advance in metadata tree. */
 526                 (mp->mp_list[hgt])++;
 527                 if (hgt) {
                             /* Ran off the end of this indirect block: go up another height. */
 528                         if (mp->mp_list[hgt] >= sdp->sd_inptrs)
 529                                 goto lower_metapath;
 530                 } else {
 531                         if (mp->mp_list[hgt] >= sdp->sd_diptrs)
 532                                 break;
 533                 }
 534
 535 fill_up_metapath:
 536                 /* Increase height of metapath. */
 537                 ret = fillup_metapath(ip, mp, ip->i_height - 1);
 538                 if (ret < 0)
 539                         return ret;
                     /* ret is the number of heights gained; shrink factor once per height. */
 540                 hgt += ret;
 541                 for (; ret; ret--)
 542                         do_div(factor, sdp->sd_inptrs);
 543                 mp->mp_aheight = hgt + 1;
 544         }
 545         return 0;
 546 }
 547
 548 static enum walker_status gfs2_hole_walker(struct metapath *mp,
 549                                            unsigned int ptrs)
 550 {
 551         const __be64 *start, *ptr, *end;
 552         unsigned int hgt;
 553
 554         hgt = mp->mp_aheight - 1;
 555         start = metapointer(hgt, mp);
 556         end = start + ptrs;
 557
 558         for (ptr = start; ptr < end; ptr++) {
 559                 if (*ptr) {
 560                         mp->mp_list[hgt] += ptr - start;
 561                         if (mp->mp_aheight == mp->mp_fheight)
 562                                 return WALK_STOP;
 563                         return WALK_FOLLOW;
 564                 }
 565         }
 566         return WALK_CONTINUE;
 567 }
 568
 569 /**
 570  * gfs2_hole_size - figure out the size of a hole
 571  * @inode: The inode
 572  * @lblock: The logical starting block number
 573  * @len: How far to look (in blocks)
 574  * @mp: The metapath at lblock
 575  * @iomap: The iomap to store the hole size in
 576  *
 577  * This function modifies @mp.
 578  *
 579  * Returns: errno on error
 580  */
 581 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
 582                           struct metapath *mp, struct iomap *iomap)
 583 {
 584         struct metapath clone;
 585         u64 hole_size;
 586         int ret;
 587
 588         clone_metapath(&clone, mp);
 589         ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
 590         if (ret < 0)
 591                 goto out;
 592
 593         if (ret == 1)
 594                 hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
 595         else
 596                 hole_size = len;
 597         iomap->length = hole_size << inode->i_blkbits;
 598         ret = 0;
 599
 600 out:
 601         release_metapath(&clone);
 602         return ret;
 603 }
 604
 605 static inline void gfs2_indirect_init(struct metapath *mp,
 606                                       struct gfs2_glock *gl, unsigned int i,
 607                                       unsigned offset, u64 bn)
 608 {
 609         __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
 610                        ((i > 1) ? sizeof(struct gfs2_meta_header) :
 611                                  sizeof(struct gfs2_dinode)));
 612         BUG_ON(i < 1);
 613         BUG_ON(mp->mp_bh[i] != NULL);
 614         mp->mp_bh[i] = gfs2_meta_new(gl, bn);
 615         gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 616         gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 617         gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 618         ptr += offset;
 619         *ptr = cpu_to_be64(bn);
 620 }
 621
 622 enum alloc_state { /* phases of the state machine in __gfs2_iomap_alloc() */
 623         ALLOC_DATA = 0, /* bottom indirect block exists; data blocks are added */
 624         ALLOC_GROW_DEPTH = 1, /* writing into existing tree: extend the tree down */
 625         ALLOC_GROW_HEIGHT = 2, /* building up tree height */
 626         /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 627 };
 628
 629 /**
 630  * __gfs2_iomap_alloc - Build a metadata tree of the requested height
 631  * @inode: The GFS2 inode
 632  * @iomap: The iomap structure
 633  * @mp: The metapath, with proper height information calculated
 634  *
 635  * In this routine we may have to alloc:
 636  *   i) Indirect blocks to grow the metadata tree height
 637  *  ii) Indirect blocks to fill in lower part of the metadata tree
 638  * iii) Data blocks
 639  *
 640  * This function is called after __gfs2_iomap_get, which works out the
 641  * total number of blocks which we need via gfs2_alloc_size.
 642  *
 643  * We then do the actual allocation asking for an extent at a time (if
 644  * enough contiguous free blocks are available, there will only be one
 645  * allocation request per call) and uses the state machine to initialise
 646  * the blocks in order.
 647  *
 648  * Right now, this function will allocate at most one indirect block
 649  * worth of data -- with a default block size of 4K, that's slightly
 650  * less than 2M.  If this limitation is ever removed to allow huge
 651  * allocations, we would probably still want to limit the iomap size we
 652  * return to avoid stalling other tasks during huge writes; the next
 653  * iomap iteration would then find the blocks already allocated.
 654  *
      * On entry, @iomap->length gives the number of data blocks wanted (in
      * bytes).  On success, @iomap->addr, @iomap->length, @iomap->type and
      * @iomap->flags describe the data extent that was allocated, and the
      * inode height and block count have been written to the dinode buffer.
      * ip->i_rw_mutex is held for writing while the blocks are allocated.
      *
 655  * Returns: errno on error
 656  */
 657
 658 static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
 659                               struct metapath *mp)
 660 {
 661         struct gfs2_inode *ip = GFS2_I(inode);
 662         struct gfs2_sbd *sdp = GFS2_SB(inode);
 663         struct buffer_head *dibh = mp->mp_bh[0];
 664         u64 bn; /* next unused block of the extent returned by gfs2_alloc_blocks() */
             /*
              * n: blocks of the current extent not yet used, i: height being
              * built, blks: total blocks wanted, alloced: blocks allocated so
              * far, iblks: indirect blocks wanted.
              */
 665         unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
 666         size_t dblks = iomap->length >> inode->i_blkbits; /* data blocks wanted */
 667         const unsigned end_of_metadata = mp->mp_fheight - 1; /* bottom metadata height */
 668         int ret;
 669         enum alloc_state state;
 670         __be64 *ptr;
 671         __be64 zero_bn = 0; /* saved copy of the dinode's first pointer */
 672
 673         BUG_ON(mp->mp_aheight < 1);
 674         BUG_ON(dibh == NULL);
 675         BUG_ON(dblks < 1);
 676
 677         gfs2_trans_add_meta(ip->i_gl, dibh);
 678
 679         down_write(&ip->i_rw_mutex);
 680
             /* Work out the initial state and how many indirect blocks are needed. */
 681         if (mp->mp_fheight == mp->mp_aheight) {
 682                 /* Bottom indirect block exists */
 683                 state = ALLOC_DATA;
 684         } else {
 685                 /* Need to allocate indirect blocks */
 686                 if (mp->mp_fheight == ip->i_height) {
 687                         /* Writing into existing tree, extend tree down */
 688                         iblks = mp->mp_fheight - mp->mp_aheight;
 689                         state = ALLOC_GROW_DEPTH;
 690                 } else {
 691                         /* Building up tree height */
 692                         state = ALLOC_GROW_HEIGHT;
 693                         iblks = mp->mp_fheight - ip->i_height;
 694                         branch_start = metapath_branch_start(mp);
 695                         iblks += (mp->mp_fheight - branch_start);
 696                 }
 697         }
 698
 699         /* start of the second part of the function (state machine) */
 700
 701         blks = dblks + iblks;
 702         i = mp->mp_aheight;
             /*
              * Each pass asks for all blocks still missing.  n is passed by
              * pointer and read back afterwards, so a pass may get fewer
              * blocks than it asked for.  The loop ends once ALLOC_DATA has
              * set iomap->addr.
              */
 703         do {
 704                 n = blks - alloced;
 705                 ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
 706                 if (ret)
 707                         goto out;
 708                 alloced += n;
                     /* Done for indirect blocks always, for data blocks only with jdata. */
 709                 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
 710                         gfs2_trans_remove_revoke(sdp, bn, n);
 711                 switch (state) {
 712                 /* Growing height of tree */
 713                 case ALLOC_GROW_HEIGHT:
 714                         if (i == 1) {
                                     /*
                                      * gfs2_indirect_init() below overwrites pointer 0
                                      * of the dinode; remember its old value first.
                                      */
 715                                 ptr = (__be64 *)(dibh->b_data +
 716                                                  sizeof(struct gfs2_dinode));
 717                                 zero_bn = *ptr;
 718                         }
                             /* Chain the new top-level indirect blocks, each at slot 0 of its parent. */
 719                         for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
 720                              i++, n--)
 721                                 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
 722                         if (i - 1 == mp->mp_fheight - ip->i_height) {
                                     /*
                                      * All new heights exist: move the old dinode
                                      * pointers into the lowest new indirect block,
                                      * keep only the first pointer in the dinode, and
                                      * restore the saved pointer 0 in the new block.
                                      */
 723                                 i--;
 724                                 gfs2_buffer_copy_tail(mp->mp_bh[i],
 725                                                 sizeof(struct gfs2_meta_header),
 726                                                 dibh, sizeof(struct gfs2_dinode));
 727                                 gfs2_buffer_clear_tail(dibh,
 728                                                 sizeof(struct gfs2_dinode) +
 729                                                 sizeof(__be64));
 730                                 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
 731                                         sizeof(struct gfs2_meta_header));
 732                                 *ptr = zero_bn;
 733                                 state = ALLOC_GROW_DEPTH;
                                     /* Drop the buffers from branch_start down; they are set up again below. */
 734                                 for(i = branch_start; i < mp->mp_fheight; i++) {
 735                                         if (mp->mp_bh[i] == NULL)
 736                                                 break;
 737                                         brelse(mp->mp_bh[i]);
 738                                         mp->mp_bh[i] = NULL;
 739                                 }
 740                                 i = branch_start;
 741                         }
                             /* Extent used up: allocate more before going on. */
 742                         if (n == 0)
 743                                 break;
 744                         fallthrough;    /* To branching from existing tree */
 745                 case ALLOC_GROW_DEPTH:
 746                         if (i > 1 && i < mp->mp_fheight)
 747                                 gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
                             /* Add indirect blocks downwards along the path given by mp_list[]. */
 748                         for (; i < mp->mp_fheight && n > 0; i++, n--)
 749                                 gfs2_indirect_init(mp, ip->i_gl, i,
 750                                                    mp->mp_list[i-1], bn++);
 751                         if (i == mp->mp_fheight)
 752                                 state = ALLOC_DATA;
 753                         if (n == 0)
 754                                 break;
 755                         fallthrough;    /* To tree complete, adding data blocks */
 756                 case ALLOC_DATA:
 757                         BUG_ON(n > dblks);
 758                         BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
 759                         gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
                             /* The mapping covers only the data blocks of this extent. */
 760                         dblks = n;
 761                         ptr = metapointer(end_of_metadata, mp);
 762                         iomap->addr = bn << inode->i_blkbits;
 763                         iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
                             /* Record the data blocks in the bottom metadata block. */
 764                         while (n-- > 0)
 765                                 *ptr++ = cpu_to_be64(bn++);
 766                         break;
 767                 }
 768         } while (iomap->addr == IOMAP_NULL_ADDR);
 769
 770         iomap->type = IOMAP_MAPPED;
 771         iomap->length = (u64)dblks << inode->i_blkbits;
 772         ip->i_height = mp->mp_fheight;
 773         gfs2_add_inode_blocks(&ip->i_inode, alloced);
 774         gfs2_dinode_out(ip, dibh->b_data);
             /* ret is 0 here: the loop is only left after a successful allocation. */
 775 out:
 776         up_write(&ip->i_rw_mutex);
 777         return ret;
 778 }
 779
 780 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE /* gfs2's name for the IOMAP_F_PRIVATE flag bit; its users are not in this section */
 781
 782 /**
 783  * gfs2_alloc_size - Work out how large the next allocation may be
 784  * @inode: The inode
 785  * @mp: The metapath
 786  * @size: Requested size in blocks
 787  *
 788  * Limit @size to what can be allocated in one go starting at @mp.
 789  *
 790  * Returns: size in blocks
 791  */
 792 static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
 793 {
 794         struct gfs2_inode *ip = GFS2_I(inode);
 795         struct gfs2_sbd *sdp = GFS2_SB(inode);
 796         const __be64 *start, *stop, *cur;
 797         unsigned int room;
 798
 799         /*
 800          * Writes to stuffed files reach this function twice via
 801          * __gfs2_iomap_get: before and after unstuffing.  The first result
 802          * must be large enough to get the reservation and allocation sizes
 803          * right; the second must be exact or else __gfs2_iomap_alloc won't
 804          * do the right thing.
 805          */
 806         if (!gfs2_is_stuffed(ip) && mp->mp_fheight == mp->mp_aheight) {
 807                 /* Count the zero pointers from @mp on, up to @size. */
 808                 start = metapointer(ip->i_height - 1, mp);
 809                 stop = metaend(ip->i_height - 1, mp);
 810                 if (stop - start > size)
 811                         stop = start + size;
 812                 cur = start;
 813                 while (cur < stop && !*cur)
 814                         cur++;
 815                 return cur - start;
 816         }
 817
 818         /* Clamp @size to the slots left after the current index. */
 819         room = mp->mp_fheight > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
 820         room -= mp->mp_list[mp->mp_fheight - 1];
 821         if (size > room)
 822                 size = room;
 823         return size;
 824 }
 825
 826 /**
 827  * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 828  * @inode: The inode
 829  * @pos: Starting position in bytes
 830  * @length: Length to map, in bytes
 831  * @flags: iomap flags
 832  * @iomap: The iomap structure
 833  * @mp: The metapath
 834  *
      * Describes what backs the range at @pos in @iomap: IOMAP_INLINE for data
      * held in a stuffed dinode, IOMAP_MAPPED for an allocated extent, or
      * IOMAP_HOLE.  The lookup runs under a read hold on ip->i_rw_mutex.  The
      * dinode buffer is stored in @mp->mp_bh[0] and is not released here; the
      * callers in this file follow up with release_metapath().
      *
      * A zero @length fails with -EINVAL.  With IOMAP_REPORT, -ENOENT is
      * returned when @pos is at or beyond i_size and nothing is mapped there.
      *
 835  * Returns: errno
 836  */
 837 static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 838                             unsigned flags, struct iomap *iomap,
 839                             struct metapath *mp)
 840 {
 841         struct gfs2_inode *ip = GFS2_I(inode);
 842         struct gfs2_sbd *sdp = GFS2_SB(inode);
 843         loff_t size = i_size_read(inode);
 844         __be64 *ptr;
 845         sector_t lblock;
 846         sector_t lblock_stop;
 847         int ret;
 848         int eob;
 849         u64 len;
 850         struct buffer_head *dibh = NULL, *bh;
 851         u8 height;
 852
 853         if (!length)
 854                 return -EINVAL;
 855
 856         down_read(&ip->i_rw_mutex);
 857
 858         ret = gfs2_meta_inode_buffer(ip, &dibh);
 859         if (ret)
 860                 goto unlock;
             /* Kept in the metapath from here on; not brelse()d in this function. */
 861         mp->mp_bh[0] = dibh;
 862
 863         if (gfs2_is_stuffed(ip)) {
 864                 if (flags & IOMAP_WRITE) {
 865                         loff_t max_size = gfs2_max_stuffed_size(ip);
 866
                             /*
                              * The write does not fit in the dinode: map it
                              * as block-based (ends up in do_alloc below).
                              */
 867                         if (pos + length > max_size)
 868                                 goto unstuff;
 869                         iomap->length = max_size;
 870                 } else {
 871                         if (pos >= size) {
 872                                 if (flags & IOMAP_REPORT) {
 873                                         ret = -ENOENT;
 874                                         goto unlock;
 875                                 } else {
 876                                         iomap->offset = pos;
 877                                         iomap->length = length;
 878                                         goto hole_found;
 879                                 }
 880                         }
 881                         iomap->length = size;
 882                 }
                     /* Inline data starts right after the on-disk dinode header. */
 883                 iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
 884                               sizeof(struct gfs2_dinode);
 885                 iomap->type = IOMAP_INLINE;
 886                 iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
 887                 goto out;
 888         }
 889
 890 unstuff:
             /* Round the request out to whole blocks: [lblock, lblock_stop]. */
 891         lblock = pos >> inode->i_blkbits;
 892         iomap->offset = lblock << inode->i_blkbits;
 893         lblock_stop = (pos + length - 1) >> inode->i_blkbits;
 894         len = lblock_stop - lblock + 1;
 895         iomap->length = len << inode->i_blkbits;
 896
             /*
              * Raise @height until sd_heightsize[height] covers the end of
              * @lblock; a result above ip->i_height means the existing tree
              * cannot hold a pointer for this block.
              */
 897         height = ip->i_height;
 898         while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
 899                 height++;
 900         find_metapath(sdp, lblock, mp, height);
 901         if (height > ip->i_height || gfs2_is_stuffed(ip))
 902                 goto do_alloc;
 903
 904         ret = lookup_metapath(ip, mp);
 905         if (ret)
 906                 goto unlock;
 907
             /* The lookup stopped short of the full height: nothing mapped. */
 908         if (mp->mp_aheight != ip->i_height)
 909                 goto do_alloc;
 910
 911         ptr = metapointer(ip->i_height - 1, mp);
 912         if (*ptr == 0)
 913                 goto do_alloc;
 914
 915         bh = mp->mp_bh[ip->i_height - 1];
 916         len = gfs2_extent_length(bh, ptr, len, &eob);
 917
 918         iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
 919         iomap->length = len << inode->i_blkbits;
 920         iomap->type = IOMAP_MAPPED;
 921         iomap->flags |= IOMAP_F_MERGED;
 922         if (eob)
 923                 iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
 924
 925 out:
 926         iomap->bdev = inode->i_sb->s_bdev;
 927 unlock:
 928         up_read(&ip->i_rw_mutex);
 929         return ret;
 930
     /*
      * Reached only by goto: no block is mapped at @lblock.  This sizes the
      * hole (or the allocation a write will need) and then exits through
      * the out label above.
      */
 931 do_alloc:
 932         if (flags & IOMAP_REPORT) {
 933                 if (pos >= size)
 934                         ret = -ENOENT;
 935                 else if (height == ip->i_height)
 936                         ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 937                 else
 938                         iomap->length = size - iomap->offset;
 939         } else if (flags & IOMAP_WRITE) {
 940                 u64 alloc_size;
 941
 942                 if (flags & IOMAP_DIRECT)
 943                         goto out;  /* (see gfs2_file_direct_write) */
 944
                     /* Shrink the mapping to what one allocation can cover. */
 945                 len = gfs2_alloc_size(inode, mp, len);
 946                 alloc_size = len << inode->i_blkbits;
 947                 if (alloc_size < iomap->length)
 948                         iomap->length = alloc_size;
 949         } else {
 950                 if (pos < size && height == ip->i_height)
 951                         ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 952         }
 953 hole_found:
 954         iomap->addr = IOMAP_NULL_ADDR;
 955         iomap->type = IOMAP_HOLE;
 956         goto out;
 957 }
 958
     /*
      * gfs2_iomap_get_folio - ->get_folio hook for buffered writes
      *
      * Opens a transaction sized for the dinode plus every block the range
      * [pos, pos + len) touches, then looks up the folio.  On success the
      * transaction is left open; on failure it is ended before returning.
      */
 959 static struct folio *
 960 gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
 961 {
 962         struct inode *inode = iter->inode;
 963         struct gfs2_sbd *sdp = GFS2_SB(inode);
 964         unsigned int mask = i_blocksize(inode) - 1;
 965         unsigned int nblocks;
 966         struct folio *folio;
 967         int err;
 968
 969         nblocks = ((pos & mask) + len + mask) >> inode->i_blkbits;
 970         err = gfs2_trans_begin(sdp, RES_DINODE + nblocks, 0);
 971         if (err)
 972                 return ERR_PTR(err);
 973         folio = iomap_get_folio(iter, pos);
 974         if (!IS_ERR(folio))
 975                 return folio;
             /* No folio to write into, so drop the transaction again. */
 976         gfs2_trans_end(sdp);
 977         return folio;
 978 }
 979
     /*
      * gfs2_iomap_put_folio - ->put_folio hook, counterpart of
      * gfs2_iomap_get_folio()
      * @inode: The inode being written
      * @pos: File position of the write
      * @copied: Length handed on to gfs2_trans_add_databufs()
      * @folio: The folio to unlock and release
      *
      * For inodes that are not stuffed, the written range is passed to
      * gfs2_trans_add_databufs().  The folio is then unlocked and put, the
      * inode is marked I_DIRTY_DATASYNC if the transaction recorded new
      * buffers (tr_num_buf_new), and the transaction that
      * gfs2_iomap_get_folio() left open is ended.
      */
 980 static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
 981                                  unsigned copied, struct folio *folio)
 982 {
             /* Fetched up front; read again only before gfs2_trans_end(). */
 983         struct gfs2_trans *tr = current->journal_info;
 984         struct gfs2_inode *ip = GFS2_I(inode);
 985         struct gfs2_sbd *sdp = GFS2_SB(inode);
 986
 987         if (!gfs2_is_stuffed(ip))
 988                 gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos),
 989                                         copied);
 990
 991         folio_unlock(folio);
 992         folio_put(folio);
 993
 994         if (tr->tr_num_buf_new)
 995                 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 996
 997         gfs2_trans_end(sdp);
 998 }
 999
     /*
      * Folio hooks that wrap each folio write in its own transaction.
      * gfs2_iomap_begin_write() installs them for stuffed and jdata inodes.
      */
1000 static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
1001         .put_folio = gfs2_iomap_put_folio,
1002         .get_folio = gfs2_iomap_get_folio,
1003 };
1004
     /*
      * gfs2_iomap_begin_write - prepare a mapping for a buffered write
      * @inode: The inode
      * @pos: Starting position in bytes
      * @length: Length of the write, in bytes
      * @flags: iomap flags
      * @iomap: Mapping produced by __gfs2_iomap_get(); updated in place
      * @mp: The metapath belonging to @iomap
      *
      * When the write needs the inode unstuffed, or lands in a hole, this takes
      * the quota lock, makes an in-place reservation and, inside a transaction,
      * unstuffs the dinode and/or allocates blocks for @iomap.  On success the
      * quota lock and the reservation are still held on return; gfs2_iomap_end()
      * contains the matching gfs2_inplace_release() and gfs2_quota_unlock().
      * On failure everything taken here is undone before returning.
      *
      * Returns: errno
      */
1005 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
1006                                   loff_t length, unsigned flags,
1007                                   struct iomap *iomap,
1008                                   struct metapath *mp)
1009 {
1010         struct gfs2_inode *ip = GFS2_I(inode);
1011         struct gfs2_sbd *sdp = GFS2_SB(inode);
1012         bool unstuff;
1013         int ret;
1014
             /* Stuffed inode, and the write ends beyond what the dinode holds. */
1015         unstuff = gfs2_is_stuffed(ip) &&
1016                   pos + length > gfs2_max_stuffed_size(ip);
1017
1018         if (unstuff || iomap->type == IOMAP_HOLE) {
1019                 unsigned int data_blocks, ind_blocks;
1020                 struct gfs2_alloc_parms ap = {};
1021                 unsigned int rblocks;
1022                 struct gfs2_trans *tr;
1023
1024                 gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
1025                                        &ind_blocks);
1026                 ap.target = data_blocks + ind_blocks;
1027                 ret = gfs2_quota_lock_check(ip, &ap);
1028                 if (ret)
1029                         return ret;
1030
1031                 ret = gfs2_inplace_reserve(ip, &ap);
1032                 if (ret)
1033                         goto out_qunlock;
1034
                     /* Add up the journal blocks the transaction has to reserve. */
1035                 rblocks = RES_DINODE + ind_blocks;
1036                 if (gfs2_is_jdata(ip))
1037                         rblocks += data_blocks;
1038                 if (ind_blocks || data_blocks)
1039                         rblocks += RES_STATFS + RES_QUOTA;
1040                 if (inode == sdp->sd_rindex)
1041                         rblocks += 2 * RES_STATFS;
1042                 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
1043
1044                 ret = gfs2_trans_begin(sdp, rblocks,
1045                                        iomap->length >> inode->i_blkbits);
1046                 if (ret)
1047                         goto out_trans_fail;
1048
1049                 if (unstuff) {
1050                         ret = gfs2_unstuff_dinode(ip);
1051                         if (ret)
1052                                 goto out_trans_end;
                             /*
                              * The inode layout just changed: drop the old
                              * metapath and map the same range again.
                              */
1053                         release_metapath(mp);
1054                         ret = __gfs2_iomap_get(inode, iomap->offset,
1055                                                iomap->length, flags, iomap, mp);
1056                         if (ret)
1057                                 goto out_trans_end;
1058                 }
1059
1060                 if (iomap->type == IOMAP_HOLE) {
1061                         ret = __gfs2_iomap_alloc(inode, iomap, mp);
1062                         if (ret) {
                                     /*
                                      * Close the transaction and release the
                                      * reservation first, then punch out the
                                      * range; punch_hole()'s result is ignored.
                                      */
1063                                 gfs2_trans_end(sdp);
1064                                 gfs2_inplace_release(ip);
1065                                 punch_hole(ip, iomap->offset, iomap->length);
1066                                 goto out_qunlock;
1067                         }
1068                 }
1069
1070                 tr = current->journal_info;
1071                 if (tr->tr_num_buf_new)
1072                         __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1073
1074                 gfs2_trans_end(sdp);
1075         }
1076
             /* Stuffed and jdata inodes write each folio inside a transaction. */
1077         if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
1078                 iomap->folio_ops = &gfs2_iomap_folio_ops;
1079         return 0;
1080
     /* Error unwinding, in reverse order of acquisition. */
1081 out_trans_end:
1082         gfs2_trans_end(sdp);
1083 out_trans_fail:
1084         gfs2_inplace_release(ip);
1085 out_qunlock:
1086         gfs2_quota_unlock(ip);
1087         return ret;
1088 }
1089
     /*
      * gfs2_iomap_begin - ->iomap_begin callback of gfs2_iomap_ops
      * @inode: The inode
      * @pos: Starting position in bytes
      * @length: Length to map, in bytes
      * @flags: iomap flags
      * @iomap: The mapping to fill in
      * @srcmap: Not used by this function
      *
      * Looks the range up with __gfs2_iomap_get().  Buffered writes, and zeroing
      * of ranges that are not holes, then go on to gfs2_iomap_begin_write().
      * Direct writes return -ENOTBLK unless the range is IOMAP_MAPPED.  All other
      * requests return the looked-up mapping as is.  The metapath is released on
      * every path.
      *
      * Returns: errno
      */
1090 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1091                             unsigned flags, struct iomap *iomap,
1092                             struct iomap *srcmap)
1093 {
1094         struct gfs2_inode *ip = GFS2_I(inode);
1095         struct metapath mp = { .mp_aheight = 1, };
1096         int ret;
1097
1098         if (gfs2_is_jdata(ip))
1099                 iomap->flags |= IOMAP_F_BUFFER_HEAD;
1100
1101         trace_gfs2_iomap_start(ip, pos, length, flags);
1102         ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1103         if (ret)
1104                 goto out_unlock;
1105
1106         switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1107         case IOMAP_WRITE:
1108                 if (flags & IOMAP_DIRECT) {
1109                         /*
1110                          * Silently fall back to buffered I/O for stuffed files
1111                          * or if we've got a hole (see gfs2_file_direct_write).
1112                          */
1113                         if (iomap->type != IOMAP_MAPPED)
1114                                 ret = -ENOTBLK;
1115                         goto out_unlock;
1116                 }
1117                 break;
1118         case IOMAP_ZERO:
                     /* Nothing to prepare when zeroing a hole. */
1119                 if (iomap->type == IOMAP_HOLE)
1120                         goto out_unlock;
1121                 break;
1122         default:
1123                 goto out_unlock;
1124         }
1125
1126         ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1127
1128 out_unlock:
1129         release_metapath(&mp);
1130         trace_gfs2_iomap_end(ip, iomap, ret);
1131         return ret;
1132 }
1133
     /*
      * gfs2_iomap_end - ->iomap_end callback of gfs2_iomap_ops
      * @inode: The inode
      * @pos: Starting position in bytes
      * @length: Length that was mapped, in bytes
      * @written: Number of bytes actually processed
      * @flags: iomap flags
      * @iomap: The mapping handed out by gfs2_iomap_begin()
      *
      * Does nothing for direct writes, for zeroing of holes, and for requests
      * that are neither writes nor zeroing; this mirrors the cases in which
      * gfs2_iomap_begin() skips gfs2_iomap_begin_write().  Otherwise it releases
      * the in-place reservation and the quota lock, and punches out newly
      * allocated (IOMAP_F_NEW) blocks that a short write left unused.
      *
      * Returns: always 0
      */
1134 static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1135                           ssize_t written, unsigned flags, struct iomap *iomap)
1136 {
1137         struct gfs2_inode *ip = GFS2_I(inode);
1138         struct gfs2_sbd *sdp = GFS2_SB(inode);
1139
1140         switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1141         case IOMAP_WRITE:
1142                 if (flags & IOMAP_DIRECT)
1143                         return 0;
1144                 break;
1145         case IOMAP_ZERO:
1146                  if (iomap->type == IOMAP_HOLE)
1147                          return 0;
1148                  break;
1149         default:
1150                  return 0;
1151         }
1152
1153         if (!gfs2_is_stuffed(ip))
1154                 gfs2_ordered_add_inode(ip);
1155
1156         if (inode == sdp->sd_rindex)
1157                 adjust_fs_space(inode);
1158
1159         gfs2_inplace_release(ip);
1160
             /* Only unlock if quota data is actually held for this inode. */
1161         if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1162                 gfs2_quota_unlock(ip);
1163
1164         if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1165                 /* Deallocate blocks that were just allocated. */
                     /* From the first block boundary past the written data ... */
1166                 loff_t hstart = round_up(pos + written, i_blocksize(inode));
                     /* ... to the end of the mapping. */
1167                 loff_t hend = iomap->offset + iomap->length;
1168
1169                 if (hstart < hend) {
1170                         truncate_pagecache_range(inode, hstart, hend - 1);
1171                         punch_hole(ip, hstart, hend - hstart);
1172                 }
1173         }
1174
1175         if (unlikely(!written))
1176                 return 0;
1177
1178         if (iomap->flags & IOMAP_F_SIZE_CHANGED)
1179                 mark_inode_dirty(inode);
1180         set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
1181         return 0;
1182 }
1183
     /*
      * iomap operations for GFS2 inodes: gfs2_iomap_begin() maps (and, for
      * buffered writes, prepares) a range, gfs2_iomap_end() cleans up after it.
      */
1184 const struct iomap_ops gfs2_iomap_ops = {
1185         .iomap_end = gfs2_iomap_end,
1186         .iomap_begin = gfs2_iomap_begin,
1187 };
1188
1189 /**
1190  * gfs2_block_map - Map one or more blocks of an inode to a disk block
1191  * @inode: The inode
1192  * @lblock: The logical block number
1193  * @bh_map: The bh to be mapped
1194  * @create: True if it is ok to allocate blocks to satisfy the request
1195  *
1196  * The caller passes the size of the wanted mapping in bh_map->b_size.
1197  *
1198  * buffer_mapped(), buffer_new() and buffer_boundary() are cleared on
1199  * entry.  On success bh_map->b_size is set to the size of what was found,
1200  * never more than was asked for, and bh_map is mapped only when @lblock
1201  * has a disk address.  On error bh_map->b_size is left unchanged.
1202  *
1203  * Sets buffer_boundary() if a read of metadata will be required
1204  * before the next block can be mapped. Sets buffer_new() if new
1205  * blocks were allocated.
1206  *
1207  * Returns: errno
1208  */
1209
1210 int gfs2_block_map(struct inode *inode, sector_t lblock,
1211                    struct buffer_head *bh_map, int create)
1212 {
1213         struct gfs2_inode *ip = GFS2_I(inode);
1214         loff_t start = (loff_t)lblock << inode->i_blkbits;
1215         loff_t wanted = bh_map->b_size;
1216         struct iomap map = { };
1217         int err;
1218
1219         clear_buffer_mapped(bh_map);
1220         clear_buffer_new(bh_map);
1221         clear_buffer_boundary(bh_map);
1222         trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
1223
1224         if (create)
1225                 err = gfs2_iomap_alloc(inode, start, wanted, &map);
1226         else
1227                 err = gfs2_iomap_get(inode, start, wanted, &map);
1228         if (err)
1229                 goto trace;
1230
             /* A trimmed mapping no longer ends where the extent does. */
1231         if (map.length > bh_map->b_size) {
1232                 map.length = bh_map->b_size;
1233                 map.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1234         }
1235         if (map.addr != IOMAP_NULL_ADDR)
1236                 map_bh(bh_map, inode->i_sb, map.addr >> inode->i_blkbits);
1237         bh_map->b_size = map.length;
1238         if (map.flags & IOMAP_F_NEW)
1239                 set_buffer_new(bh_map);
1240         if (map.flags & IOMAP_F_GFS2_BOUNDARY)
1241                 set_buffer_boundary(bh_map);
1242
1243 trace:
1244         trace_gfs2_bmap(ip, bh_map, lblock, create, err);
1245         return err;
1246 }
1247
     /**
      * gfs2_get_extent - Look up the extent starting at a logical block
      * @inode: The inode
      * @lblock: The logical block number to start at
      * @dblock: Set to the disk block that @lblock maps to
      * @extlen: In: wanted extent length in blocks.  Out: lowered to the length
      *          of the mapped extent if that is shorter
      *
      * Nothing is allocated.  @dblock and @extlen are only written on success.
      *
      * Returns: 0, the errno from gfs2_iomap_get(), or -EIO if @lblock is not
      *          backed by a mapped extent
      */
1248 int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
1249                     unsigned int *extlen)
1250 {
1251         unsigned int blkbits = inode->i_blkbits;
1252         struct iomap iomap = { };
1253         u64 len;
1254         int ret;
1255
             /*
              * Widen *extlen before shifting: as an unsigned int the byte count
              * would wrap once the request reaches 4 GiB.
              */
1256         ret = gfs2_iomap_get(inode, lblock << blkbits,
1257                              (loff_t)*extlen << blkbits, &iomap);
1258         if (ret)
1259                 return ret;
1260         if (iomap.type != IOMAP_MAPPED)
1261                 return -EIO;
1262         *dblock = iomap.addr >> blkbits;
             /* Kept as u64 so a long mapping is not truncated before the compare. */
1263         len = iomap.length >> blkbits;
1264         if (len < *extlen)
1265                 *extlen = len;
1266         return 0;
1267 }
1268
     /**
      * gfs2_alloc_extent - Look up or allocate the extent at a logical block
      * @inode: The inode
      * @lblock: The logical block number to start at
      * @dblock: Set to the disk block that @lblock maps to
      * @extlen: In: wanted extent length in blocks.  Out: lowered to the length
      *          of the resulting extent if that is shorter
      * @new: Set to true if the mapping carries IOMAP_F_NEW
      *
      * Uses gfs2_iomap_alloc(), so a hole at @lblock gets blocks allocated.
      * The output parameters are only written on success.
      *
      * Returns: 0, the errno from gfs2_iomap_alloc(), or -EIO if the result is
      *          not a mapped extent
      */
1269 int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
1270                       unsigned int *extlen, bool *new)
1271 {
1272         unsigned int blkbits = inode->i_blkbits;
1273         struct iomap iomap = { };
1274         u64 len;
1275         int ret;
1276
             /*
              * Widen *extlen before shifting: as an unsigned int the byte count
              * would wrap once the request reaches 4 GiB.
              */
1277         ret = gfs2_iomap_alloc(inode, lblock << blkbits,
1278                                (loff_t)*extlen << blkbits, &iomap);
1279         if (ret)
1280                 return ret;
1281         if (iomap.type != IOMAP_MAPPED)
1282                 return -EIO;
1283         *dblock = iomap.addr >> blkbits;
             /* Kept as u64 so a long mapping is not truncated before the compare. */
1284         len = iomap.length >> blkbits;
1285         if (len < *extlen)
1286                 *extlen = len;
1287         *new = iomap.flags & IOMAP_F_NEW;
1288         return 0;
1289 }
1290
1291 /*
1292  * NOTE: gfs2_block_zero_range must never run with a transaction open.
1293  * It zeroes through the iomap write path, whose callbacks begin their own
1294  * transactions (iomap_begin, get_folio, etc.)
1295  */
1296 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1297                                  unsigned int length)
1298 {
1299         BUG_ON(current->journal_info != NULL);
1300         return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
1301 }
1302
     /*
      * Number of revokes requested from gfs2_trans_begin() for each transaction
      * used while truncating a jdata inode.  gfs2_journaled_truncate() also
      * multiplies it by the block size to bound how much is truncated per step.
      */
1303 #define GFS2_JTRUNC_REVOKES 8192
1304
1305 /**
1306  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1307  * @inode: The inode being truncated
1308  * @oldsize: The original (larger) size
1309  * @newsize: The new smaller size
1310  *
1311  * With jdata files, we have to journal a revoke for each block which is
1312  * truncated. As a result, we need to split this into separate transactions
1313  * if the number of pages being truncated gets too large.
      *
      * Expects a transaction to be open on entry (current->journal_info is
      * dereferenced).  If starting a replacement transaction fails, the error
      * is returned with no transaction open.
      *
      * Returns: errno
1314  */
1315
1316 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1317 {
1318         struct gfs2_sbd *sdp = GFS2_SB(inode);
1319         u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1320         u64 chunk;
1321         int error;
1322
             /* Work down from @oldsize towards @newsize, one chunk at a time. */
1323         while (oldsize != newsize) {
1324                 struct gfs2_trans *tr;
1325                 unsigned int offs;
1326
1327                 chunk = oldsize - newsize;
1328                 if (chunk > max_chunk)
1329                         chunk = max_chunk;
1330
                     /*
                      * If @oldsize is not page aligned, trim the chunk so that
                      * oldsize - chunk falls on a page boundary.
                      */
1331                 offs = oldsize & ~PAGE_MASK;
1332                 if (offs && chunk > PAGE_SIZE)
1333                         chunk = offs + ((chunk - offs) & PAGE_MASK);
1334
1335                 truncate_pagecache(inode, oldsize - chunk);
1336                 oldsize -= chunk;
1337
                     /* An untouched transaction can be reused for the next chunk. */
1338                 tr = current->journal_info;
1339                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1340                         continue;
1341
1342                 gfs2_trans_end(sdp);
1343                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1344                 if (error)
1345                         return error;
1346         }
1347
1348         return 0;
1349 }
1350
     /*
      * trunc_start - first step of shrinking an inode to @newsize
      * @inode: The inode being truncated
      * @newsize: The new, smaller size in bytes
      *
      * For inodes that are not stuffed, zeroes the rest of the block containing
      * @newsize first; gfs2_block_zero_range() must run before a transaction is
      * open.  Then, in one transaction, the dinode is updated: stuffed inodes
      * have the data after @newsize cleared in the dinode buffer, other inodes
      * get GFS2_DIF_TRUNC_IN_PROG set.  The new size and timestamps are written
      * to the dinode and the page cache is truncated.
      *
      * Returns: errno
      */
1351 static int trunc_start(struct inode *inode, u64 newsize)
1352 {
1353         struct gfs2_inode *ip = GFS2_I(inode);
1354         struct gfs2_sbd *sdp = GFS2_SB(inode);
1355         struct buffer_head *dibh = NULL;
1356         int journaled = gfs2_is_jdata(ip);
1357         u64 oldsize = inode->i_size;
1358         int error;
1359
1360         if (!gfs2_is_stuffed(ip)) {
1361                 unsigned int blocksize = i_blocksize(inode);
1362                 unsigned int offs = newsize & (blocksize - 1);
                     /* Zero from @newsize to the end of its block, if unaligned. */
1363                 if (offs) {
1364                         error = gfs2_block_zero_range(inode, newsize,
1365                                                       blocksize - offs);
1366                         if (error)
1367                                 return error;
1368                 }
1369         }
1370         if (journaled)
1371                 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1372         else
1373                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1374         if (error)
1375                 return error;
1376
1377         error = gfs2_meta_inode_buffer(ip, &dibh);
1378         if (error)
1379                 goto out;
1380
1381         gfs2_trans_add_meta(ip->i_gl, dibh);
1382
1383         if (gfs2_is_stuffed(ip))
1384                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1385         else
1386                 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1387
1388         i_size_write(inode, newsize);
1389         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1390         gfs2_dinode_out(ip, dibh->b_data);
1391
1392         if (journaled)
1393                 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1394         else
1395                 truncate_pagecache(inode, newsize);
1396
1397 out:
1398         brelse(dibh);
             /*
              * gfs2_journaled_truncate() can return with no transaction open
              * (when it fails to start a new one), hence the check.
              */
1399         if (current->journal_info)
1400                 gfs2_trans_end(sdp);
1401         return error;
1402 }
1403
     /*
      * gfs2_iomap_get - look up the mapping at @pos without allocating
      *
      * Runs __gfs2_iomap_get() with no iomap flags on a local metapath and
      * releases that metapath before returning.
      *
      * Returns: errno
      */
1404 int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
1405                    struct iomap *iomap)
1406 {
1407         struct metapath path = { .mp_aheight = 1, };
1408         int err = __gfs2_iomap_get(inode, pos, length, 0, iomap, &path);
1409
1410         /* Released whether or not the lookup succeeded. */
1411         release_metapath(&path);
1412         return err;
1413 }
1414
     /*
      * gfs2_iomap_alloc - look up the mapping at @pos, allocating if it is a hole
      *
      * Runs __gfs2_iomap_get() with IOMAP_WRITE and, when that reports a hole,
      * hands the result to __gfs2_iomap_alloc().  The local metapath is released
      * before returning.
      *
      * Returns: errno
      */
1415 int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
1416                      struct iomap *iomap)
1417 {
1418         struct metapath path = { .mp_aheight = 1, };
1419         int err;
1420
1421         err = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &path);
1422         if (err == 0 && iomap->type == IOMAP_HOLE)
1423                 err = __gfs2_iomap_alloc(inode, iomap, &path);
1424
1425         release_metapath(&path);
1426         return err;
     }
1427
1428 /**
1429  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1430  * @ip: inode
1431  * @rd_gh: holder of resource group glock
1432  * @bh: buffer head to sweep
1433  * @start: starting point in bh
1434  * @end: end point in bh
1435  * @meta: true if bh points to metadata (rather than data)
1436  * @btotal: place to keep count of total blocks freed
1437  *
1438  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1439  * free, and free them all. However, we do it one rgrp at a time. If this
1440  * block has references to multiple rgrps, we break it into individual
1441  * transactions. This allows other processes to use the rgrps while we're
1442  * focused on a single one, for better concurrency / performance.
1443  * At every transaction boundary, we rewrite the inode into the journal.
1444  * That way the bitmaps are kept consistent with the inode and we can recover
1445  * if we're interrupted by power-outages.
1446  *
      * When this function starts a transaction it also takes ip->i_rw_mutex for
      * writing.  Both are only dropped here when another pass over the buffer
      * is needed, so on the final return they, and the rgrp glock in @rd_gh,
      * may still be held (NOTE(review): presumably released by the caller,
      * which is not visible here).
      *
1447  * Returns: 0, or return code if an error occurred.
1448  *          *btotal has the total number of blocks freed
1449  */
1450 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1451                               struct buffer_head *bh, __be64 *start, __be64 *end,
1452                               bool meta, u32 *btotal)
1453 {
1454         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1455         struct gfs2_rgrpd *rgd;
1456         struct gfs2_trans *tr;
1457         __be64 *p;
1458         int blks_outside_rgrp;
1459         u64 bn, bstart, isize_blks;
1460         s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1461         int ret = 0;
1462         bool buf_in_tr = false; /* buffer was added to transaction */
1463
     /* Start of one pass; each pass frees the pointers that fall in one rgrp. */
1464 more_rgrps:
1465         rgd = NULL;
             /* Continue with the rgrp whose glock @rd_gh already holds, if any. */
1466         if (gfs2_holder_initialized(rd_gh)) {
1467                 rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1468                 gfs2_assert_withdraw(sdp,
1469                              gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1470         }
1471         blks_outside_rgrp = 0;
             /* bstart/blen describe the run of consecutive blocks gathered so far. */
1472         bstart = 0;
1473         blen = 0;
1474
1475         for (p = start; p < end; p++) {
1476                 if (!*p)
1477                         continue;
1478                 bn = be64_to_cpu(*p);
1479
1480                 if (rgd) {
                             /* Belongs to another rgrp: leave it for a later pass. */
1481                         if (!rgrp_contains_block(rgd, bn)) {
1482                                 blks_outside_rgrp++;
1483                                 continue;
1484                         }
1485                 } else {
                             /* First pointer of this pass picks the rgrp to lock. */
1486                         rgd = gfs2_blk2rgrpd(sdp, bn, true);
1487                         if (unlikely(!rgd)) {
1488                                 ret = -EIO;
1489                                 goto out;
1490                         }
1491                         ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1492                                                  LM_FLAG_NODE_SCOPE, rd_gh);
1493                         if (ret)
1494                                 goto out;
1495
1496                         /* Must be done with the rgrp glock held: */
1497                         if (gfs2_rs_active(&ip->i_res) &&
1498                             rgd == ip->i_res.rs_rgd)
1499                                 gfs2_rs_deltree(&ip->i_res);
1500                 }
1501
1502                 /* The size of our transactions will be unknown until we
1503                    actually process all the metadata blocks that relate to
1504                    the rgrp. So we estimate. We know it can't be more than
1505                    the dinode's i_blocks and we don't want to exceed the
1506                    journal flush threshold, sd_log_thresh2. */
1507                 if (current->journal_info == NULL) {
1508                         unsigned int jblocks_rqsted, revokes;
1509
1510                         jblocks_rqsted = rgd->rd_length + RES_DINODE +
1511                                 RES_INDIRECT;
1512                         isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1513                         if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1514                                 jblocks_rqsted +=
1515                                         atomic_read(&sdp->sd_log_thresh2);
1516                         else
1517                                 jblocks_rqsted += isize_blks;
1518                         revokes = jblocks_rqsted;
1519                         if (meta)
1520                                 revokes += end - start;
1521                         else if (ip->i_depth)
1522                                 revokes += sdp->sd_inptrs;
1523                         ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
                             /* With ret set, out_unlock falls straight through to out. */
1524                         if (ret)
1525                                 goto out_unlock;
1526                         down_write(&ip->i_rw_mutex);
1527                 }
1528                 /* check if we will exceed the transaction blocks requested */
1529                 tr = current->journal_info;
1530                 if (tr->tr_num_buf_new + RES_STATFS +
1531                     RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1532                         /* We set blks_outside_rgrp to ensure the loop will
1533                            be repeated for the same rgrp, but with a new
1534                            transaction. */
1535                         blks_outside_rgrp++;
1536                         /* This next part is tricky. If the buffer was added
1537                            to the transaction, we've already set some block
1538                            pointers to 0, so we better follow through and free
1539                            them, or we will introduce corruption (so break).
1540                            This may be impossible, or at least rare, but I
1541                            decided to cover the case regardless.
1542
1543                            If the buffer was not added to the transaction
1544                            (this call), doing so would exceed our transaction
1545                            size, so we need to end the transaction and start a
1546                            new one (so goto). */
1547
1548                         if (buf_in_tr)
1549                                 break;
1550                         goto out_unlock;
1551                 }
1552
1553                 gfs2_trans_add_meta(ip->i_gl, bh);
1554                 buf_in_tr = true;
1555                 *p = 0;
                     /* Extend the current run when this block follows it directly. */
1556                 if (bstart + blen == bn) {
1557                         blen++;
1558                         continue;
1559                 }
                     /* Otherwise free the finished run and start a new one at bn. */
1560                 if (bstart) {
1561                         __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1562                         (*btotal) += blen;
1563                         gfs2_add_inode_blocks(&ip->i_inode, -blen);
1564                 }
1565                 bstart = bn;
1566                 blen = 1;
1567         }
             /* Free whatever run is still pending when the loop ends. */
1568         if (bstart) {
1569                 __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1570                 (*btotal) += blen;
1571                 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1572         }
1573 out_unlock:
1574         if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1575                                             outside the rgrp we just processed,
1576                                             do it all over again. */
1577                 if (current->journal_info) {
1578                         struct buffer_head *dibh;
1579
1580                         ret = gfs2_meta_inode_buffer(ip, &dibh);
1581                         if (ret)
1582                                 goto out;
1583
1584                         /* Every transaction boundary, we rewrite the dinode
1585                            to keep its di_blocks current in case of failure. */
1586                         ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1587                                 current_time(&ip->i_inode);
1588                         gfs2_trans_add_meta(ip->i_gl, dibh);
1589                         gfs2_dinode_out(ip, dibh->b_data);
1590                         brelse(dibh);
1591                         up_write(&ip->i_rw_mutex);
1592                         gfs2_trans_end(sdp);
1593                         buf_in_tr = false;
1594                 }
                     /* Give up this rgrp's glock before starting the next pass. */
1595                 gfs2_glock_dq_uninit(rd_gh);
1596                 cond_resched();
1597                 goto more_rgrps;
1598         }
1599 out:
1600         return ret;
1601 }
1602
1603 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1604 {
1605         if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1606                 return false;
1607         return true;
1608 }
1609
1610 /**
1611  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1612  * @sdp: The superblock
1613  * @mp: starting metapath
1614  * @h: desired height to search
1615  * @end_list: See punch_hole().
1616  * @end_aligned: See punch_hole().
1617  *
1618  * Assumes the metapath is valid (with buffers) out to height h.
1619  * Returns: true if a non-null pointer was found in the metapath buffer
1620  *          false if all remaining pointers are NULL in the buffer
1621  */
1622 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1623                              unsigned int h,
1624                              __u16 *end_list, unsigned int end_aligned)
1625 {
1626         struct buffer_head *bh = mp->mp_bh[h];
1627         __be64 *first, *ptr, *end;
1628
1629         first = metaptr1(h, mp);
1630         ptr = first + mp->mp_list[h];
1631         end = (__be64 *)(bh->b_data + bh->b_size);
1632         if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1633                 bool keep_end = h < end_aligned;
1634                 end = first + end_list[h] + keep_end;
1635         }
1636
1637         while (ptr < end) {
1638                 if (*ptr) { /* if we have a non-null pointer */
1639                         mp->mp_list[h] = ptr - first;
1640                         h++;
1641                         if (h < GFS2_MAX_META_HEIGHT)
1642                                 mp->mp_list[h] = 0;
1643                         return true;
1644                 }
1645                 ptr++;
1646         }
1647         return false;
1648 }
1649
/* States of the deallocation state machine driven by punch_hole(). */
enum dealloc_states {
	/* Strip a metapath with all buffers read in */
	DEALLOC_MP_FULL,
	/* lower the metapath strip height */
	DEALLOC_MP_LOWER,
	/* Fill in the metapath to the given height. */
	DEALLOC_FILL_MP,
	/* process complete */
	DEALLOC_DONE,
};
1656
1657 static inline void
1658 metapointer_range(struct metapath *mp, int height,
1659                   __u16 *start_list, unsigned int start_aligned,
1660                   __u16 *end_list, unsigned int end_aligned,
1661                   __be64 **start, __be64 **end)
1662 {
1663         struct buffer_head *bh = mp->mp_bh[height];
1664         __be64 *first;
1665
1666         first = metaptr1(height, mp);
1667         *start = first;
1668         if (mp_eq_to_hgt(mp, start_list, height)) {
1669                 bool keep_start = height < start_aligned;
1670                 *start = first + start_list[height] + keep_start;
1671         }
1672         *end = (__be64 *)(bh->b_data + bh->b_size);
1673         if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1674                 bool keep_end = height < end_aligned;
1675                 *end = first + end_list[height] + keep_end;
1676         }
1677 }
1678
1679 static inline bool walk_done(struct gfs2_sbd *sdp,
1680                              struct metapath *mp, int height,
1681                              __u16 *end_list, unsigned int end_aligned)
1682 {
1683         __u16 end;
1684
1685         if (end_list) {
1686                 bool keep_end = height < end_aligned;
1687                 if (!mp_eq_to_hgt(mp, end_list, height))
1688                         return false;
1689                 end = end_list[height] + keep_end;
1690         } else
1691                 end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1692         return mp->mp_list[height] >= end;
1693 }
1694
1695 /**
1696  * punch_hole - deallocate blocks in a file
1697  * @ip: inode to truncate
1698  * @offset: the start of the hole
1699  * @length: the size of the hole (or 0 for truncate)
1700  *
1701  * Punch a hole into a file or truncate a file at a given position.  This
1702  * function operates in whole blocks (@offset and @length are rounded
1703  * accordingly); partially filled blocks must be cleared otherwise.
1704  *
1705  * This function works from the bottom up, and from the right to the left. In
1706  * other words, it strips off the highest layer (data) before stripping any of
1707  * the metadata. Doing it this way is best in case the operation is interrupted
1708  * by power failure, etc.  The dinode is rewritten in every transaction to
1709  * guarantee integrity.
1710  */
1711 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1712 {
1713         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1714         u64 maxsize = sdp->sd_heightsize[ip->i_height];
1715         struct metapath mp = {};
1716         struct buffer_head *dibh, *bh;
1717         struct gfs2_holder rd_gh;
1718         unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1719         u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1720         __u16 start_list[GFS2_MAX_META_HEIGHT];
1721         __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1722         unsigned int start_aligned, end_aligned;
1723         unsigned int strip_h = ip->i_height - 1;
1724         u32 btotal = 0;
1725         int ret, state;
1726         int mp_h; /* metapath buffers are read in to this height */
1727         u64 prev_bnr = 0;
1728         __be64 *start, *end;
1729
1730         if (offset >= maxsize) {
1731                 /*
1732                  * The starting point lies beyond the allocated metadata;
1733                  * there are no blocks to deallocate.
1734                  */
1735                 return 0;
1736         }
1737
1738         /*
1739          * The start position of the hole is defined by lblock, start_list, and
1740          * start_aligned.  The end position of the hole is defined by lend,
1741          * end_list, and end_aligned.
1742          *
1743          * start_aligned and end_aligned define down to which height the start
1744          * and end positions are aligned to the metadata tree (i.e., the
1745          * position is a multiple of the metadata granularity at the height
1746          * above).  This determines at which heights additional meta pointers
1747          * needs to be preserved for the remaining data.
1748          */
1749
1750         if (length) {
1751                 u64 end_offset = offset + length;
1752                 u64 lend;
1753
1754                 /*
1755                  * Clip the end at the maximum file size for the given height:
1756                  * that's how far the metadata goes; files bigger than that
1757                  * will have additional layers of indirection.
1758                  */
1759                 if (end_offset > maxsize)
1760                         end_offset = maxsize;
1761                 lend = end_offset >> bsize_shift;
1762
1763                 if (lblock >= lend)
1764                         return 0;
1765
1766                 find_metapath(sdp, lend, &mp, ip->i_height);
1767                 end_list = __end_list;
1768                 memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1769
1770                 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1771                         if (end_list[mp_h])
1772                                 break;
1773                 }
1774                 end_aligned = mp_h;
1775         }
1776
1777         find_metapath(sdp, lblock, &mp, ip->i_height);
1778         memcpy(start_list, mp.mp_list, sizeof(start_list));
1779
1780         for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1781                 if (start_list[mp_h])
1782                         break;
1783         }
1784         start_aligned = mp_h;
1785
1786         ret = gfs2_meta_inode_buffer(ip, &dibh);
1787         if (ret)
1788                 return ret;
1789
1790         mp.mp_bh[0] = dibh;
1791         ret = lookup_metapath(ip, &mp);
1792         if (ret)
1793                 goto out_metapath;
1794
1795         /* issue read-ahead on metadata */
1796         for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1797                 metapointer_range(&mp, mp_h, start_list, start_aligned,
1798                                   end_list, end_aligned, &start, &end);
1799                 gfs2_metapath_ra(ip->i_gl, start, end);
1800         }
1801
1802         if (mp.mp_aheight == ip->i_height)
1803                 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1804         else
1805                 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1806
1807         ret = gfs2_rindex_update(sdp);
1808         if (ret)
1809                 goto out_metapath;
1810
1811         ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1812         if (ret)
1813                 goto out_metapath;
1814         gfs2_holder_mark_uninitialized(&rd_gh);
1815
1816         mp_h = strip_h;
1817
1818         while (state != DEALLOC_DONE) {
1819                 switch (state) {
1820                 /* Truncate a full metapath at the given strip height.
1821                  * Note that strip_h == mp_h in order to be in this state. */
1822                 case DEALLOC_MP_FULL:
1823                         bh = mp.mp_bh[mp_h];
1824                         gfs2_assert_withdraw(sdp, bh);
1825                         if (gfs2_assert_withdraw(sdp,
1826                                                  prev_bnr != bh->b_blocknr)) {
1827                                 fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u,"
1828                                          "s_h:%u, mp_h:%u\n",
1829                                        (unsigned long long)ip->i_no_addr,
1830                                        prev_bnr, ip->i_height, strip_h, mp_h);
1831                         }
1832                         prev_bnr = bh->b_blocknr;
1833
1834                         if (gfs2_metatype_check(sdp, bh,
1835                                                 (mp_h ? GFS2_METATYPE_IN :
1836                                                         GFS2_METATYPE_DI))) {
1837                                 ret = -EIO;
1838                                 goto out;
1839                         }
1840
1841                         /*
1842                          * Below, passing end_aligned as 0 gives us the
1843                          * metapointer range excluding the end point: the end
1844                          * point is the first metapath we must not deallocate!
1845                          */
1846
1847                         metapointer_range(&mp, mp_h, start_list, start_aligned,
1848                                           end_list, 0 /* end_aligned */,
1849                                           &start, &end);
1850                         ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1851                                                  start, end,
1852                                                  mp_h != ip->i_height - 1,
1853                                                  &btotal);
1854
1855                         /* If we hit an error or just swept dinode buffer,
1856                            just exit. */
1857                         if (ret || !mp_h) {
1858                                 state = DEALLOC_DONE;
1859                                 break;
1860                         }
1861                         state = DEALLOC_MP_LOWER;
1862                         break;
1863
1864                 /* lower the metapath strip height */
1865                 case DEALLOC_MP_LOWER:
1866                         /* We're done with the current buffer, so release it,
1867                            unless it's the dinode buffer. Then back up to the
1868                            previous pointer. */
1869                         if (mp_h) {
1870                                 brelse(mp.mp_bh[mp_h]);
1871                                 mp.mp_bh[mp_h] = NULL;
1872                         }
1873                         /* If we can't get any lower in height, we've stripped
1874                            off all we can. Next step is to back up and start
1875                            stripping the previous level of metadata. */
1876                         if (mp_h == 0) {
1877                                 strip_h--;
1878                                 memcpy(mp.mp_list, start_list, sizeof(start_list));
1879                                 mp_h = strip_h;
1880                                 state = DEALLOC_FILL_MP;
1881                                 break;
1882                         }
1883                         mp.mp_list[mp_h] = 0;
1884                         mp_h--; /* search one metadata height down */
1885                         mp.mp_list[mp_h]++;
1886                         if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1887                                 break;
1888                         /* Here we've found a part of the metapath that is not
1889                          * allocated. We need to search at that height for the
1890                          * next non-null pointer. */
1891                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1892                                 state = DEALLOC_FILL_MP;
1893                                 mp_h++;
1894                         }
1895                         /* No more non-null pointers at this height. Back up
1896                            to the previous height and try again. */
1897                         break; /* loop around in the same state */
1898
1899                 /* Fill the metapath with buffers to the given height. */
1900                 case DEALLOC_FILL_MP:
1901                         /* Fill the buffers out to the current height. */
1902                         ret = fillup_metapath(ip, &mp, mp_h);
1903                         if (ret < 0)
1904                                 goto out;
1905
1906                         /* On the first pass, issue read-ahead on metadata. */
1907                         if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1908                                 unsigned int height = mp.mp_aheight - 1;
1909
1910                                 /* No read-ahead for data blocks. */
1911                                 if (mp.mp_aheight - 1 == strip_h)
1912                                         height--;
1913
1914                                 for (; height >= mp.mp_aheight - ret; height--) {
1915                                         metapointer_range(&mp, height,
1916                                                           start_list, start_aligned,
1917                                                           end_list, end_aligned,
1918                                                           &start, &end);
1919                                         gfs2_metapath_ra(ip->i_gl, start, end);
1920                                 }
1921                         }
1922
1923                         /* If buffers found for the entire strip height */
1924                         if (mp.mp_aheight - 1 == strip_h) {
1925                                 state = DEALLOC_MP_FULL;
1926                                 break;
1927                         }
1928                         if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1929                                 mp_h = mp.mp_aheight - 1;
1930
1931                         /* If we find a non-null block pointer, crawl a bit
1932                            higher up in the metapath and try again, otherwise
1933                            we need to look lower for a new starting point. */
1934                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1935                                 mp_h++;
1936                         else
1937                                 state = DEALLOC_MP_LOWER;
1938                         break;
1939                 }
1940         }
1941
1942         if (btotal) {
1943                 if (current->journal_info == NULL) {
1944                         ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1945                                                RES_QUOTA, 0);
1946                         if (ret)
1947                                 goto out;
1948                         down_write(&ip->i_rw_mutex);
1949                 }
1950                 gfs2_statfs_change(sdp, 0, +btotal, 0);
1951                 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1952                                   ip->i_inode.i_gid);
1953                 ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1954                 gfs2_trans_add_meta(ip->i_gl, dibh);
1955                 gfs2_dinode_out(ip, dibh->b_data);
1956                 up_write(&ip->i_rw_mutex);
1957                 gfs2_trans_end(sdp);
1958         }
1959
1960 out:
1961         if (gfs2_holder_initialized(&rd_gh))
1962                 gfs2_glock_dq_uninit(&rd_gh);
1963         if (current->journal_info) {
1964                 up_write(&ip->i_rw_mutex);
1965                 gfs2_trans_end(sdp);
1966                 cond_resched();
1967         }
1968         gfs2_quota_unhold(ip);
1969 out_metapath:
1970         release_metapath(&mp);
1971         return ret;
1972 }
1973
1974 static int trunc_end(struct gfs2_inode *ip)
1975 {
1976         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1977         struct buffer_head *dibh;
1978         int error;
1979
1980         error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1981         if (error)
1982                 return error;
1983
1984         down_write(&ip->i_rw_mutex);
1985
1986         error = gfs2_meta_inode_buffer(ip, &dibh);
1987         if (error)
1988                 goto out;
1989
1990         if (!i_size_read(&ip->i_inode)) {
1991                 ip->i_height = 0;
1992                 ip->i_goal = ip->i_no_addr;
1993                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1994                 gfs2_ordered_del_inode(ip);
1995         }
1996         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1997         ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1998
1999         gfs2_trans_add_meta(ip->i_gl, dibh);
2000         gfs2_dinode_out(ip, dibh->b_data);
2001         brelse(dibh);
2002
2003 out:
2004         up_write(&ip->i_rw_mutex);
2005         gfs2_trans_end(sdp);
2006         return error;
2007 }
2008
2009 /**
2010  * do_shrink - make a file smaller
2011  * @inode: the inode
2012  * @newsize: the size to make the file
2013  *
2014  * Called with an exclusive lock on @inode. The @size must
2015  * be equal to or smaller than the current inode size.
2016  *
2017  * Returns: errno
2018  */
2019
2020 static int do_shrink(struct inode *inode, u64 newsize)
2021 {
2022         struct gfs2_inode *ip = GFS2_I(inode);
2023         int error;
2024
2025         error = trunc_start(inode, newsize);
2026         if (error < 0)
2027                 return error;
2028         if (gfs2_is_stuffed(ip))
2029                 return 0;
2030
2031         error = punch_hole(ip, newsize, 0);
2032         if (error == 0)
2033                 error = trunc_end(ip);
2034
2035         return error;
2036 }
2037
2038 /**
2039  * do_grow - Touch and update inode size
2040  * @inode: The inode
2041  * @size: The new size
2042  *
2043  * This function updates the timestamps on the inode and
2044  * may also increase the size of the inode. This function
2045  * must not be called with @size any smaller than the current
2046  * inode size.
2047  *
2048  * Although it is not strictly required to unstuff files here,
2049  * earlier versions of GFS2 have a bug in the stuffed file reading
2050  * code which will result in a buffer overrun if the size is larger
2051  * than the max stuffed file size. In order to prevent this from
2052  * occurring, such files are unstuffed, but in other cases we can
2053  * just update the inode size directly.
2054  *
2055  * Returns: 0 on success, or -ve on error
2056  */
2057
2058 static int do_grow(struct inode *inode, u64 size)
2059 {
2060         struct gfs2_inode *ip = GFS2_I(inode);
2061         struct gfs2_sbd *sdp = GFS2_SB(inode);
2062         struct gfs2_alloc_parms ap = { .target = 1, };
2063         struct buffer_head *dibh;
2064         int error;
2065         int unstuff = 0;
2066
2067         if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2068                 error = gfs2_quota_lock_check(ip, &ap);
2069                 if (error)
2070                         return error;
2071
2072                 error = gfs2_inplace_reserve(ip, &ap);
2073                 if (error)
2074                         goto do_grow_qunlock;
2075                 unstuff = 1;
2076         }
2077
2078         error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2079                                  (unstuff &&
2080                                   gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2081                                  (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2082                                   0 : RES_QUOTA), 0);
2083         if (error)
2084                 goto do_grow_release;
2085
2086         if (unstuff) {
2087                 error = gfs2_unstuff_dinode(ip);
2088                 if (error)
2089                         goto do_end_trans;
2090         }
2091
2092         error = gfs2_meta_inode_buffer(ip, &dibh);
2093         if (error)
2094                 goto do_end_trans;
2095
2096         truncate_setsize(inode, size);
2097         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2098         gfs2_trans_add_meta(ip->i_gl, dibh);
2099         gfs2_dinode_out(ip, dibh->b_data);
2100         brelse(dibh);
2101
2102 do_end_trans:
2103         gfs2_trans_end(sdp);
2104 do_grow_release:
2105         if (unstuff) {
2106                 gfs2_inplace_release(ip);
2107 do_grow_qunlock:
2108                 gfs2_quota_unlock(ip);
2109         }
2110         return error;
2111 }
2112
2113 /**
2114  * gfs2_setattr_size - make a file a given size
2115  * @inode: the inode
2116  * @newsize: the size to make the file
2117  *
2118  * The file size can grow, shrink, or stay the same size. This
2119  * is called holding i_rwsem and an exclusive glock on the inode
2120  * in question.
2121  *
2122  * Returns: errno
2123  */
2124
2125 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2126 {
2127         struct gfs2_inode *ip = GFS2_I(inode);
2128         int ret;
2129
2130         BUG_ON(!S_ISREG(inode->i_mode));
2131
2132         ret = inode_newsize_ok(inode, newsize);
2133         if (ret)
2134                 return ret;
2135
2136         inode_dio_wait(inode);
2137
2138         ret = gfs2_qa_get(ip);
2139         if (ret)
2140                 goto out;
2141
2142         if (newsize >= inode->i_size) {
2143                 ret = do_grow(inode, newsize);
2144                 goto out;
2145         }
2146
2147         ret = do_shrink(inode, newsize);
2148 out:
2149         gfs2_rs_delete(ip);
2150         gfs2_qa_put(ip);
2151         return ret;
2152 }
2153
2154 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2155 {
2156         int error;
2157         error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2158         if (!error)
2159                 error = trunc_end(ip);
2160         return error;
2161 }
2162
/*
 * gfs2_file_dealloc - deallocate all blocks of a file
 * @ip: the inode
 *
 * Truncates from offset 0 (a length of 0 means "truncate" to punch_hole()).
 *
 * Returns: errno
 */
int gfs2_file_dealloc(struct gfs2_inode *ip)
{
	int rc = punch_hole(ip, 0, 0);

	return rc;
}
2167
2168 /**
2169  * gfs2_free_journal_extents - Free cached journal bmap info
2170  * @jd: The journal
2171  *
2172  */
2173
2174 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2175 {
2176         struct gfs2_journal_extent *jext;
2177
2178         while(!list_empty(&jd->extent_list)) {
2179                 jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2180                 list_del(&jext->list);
2181                 kfree(jext);
2182         }
2183 }
2184
2185 /**
2186  * gfs2_add_jextent - Add or merge a new extent to extent cache
2187  * @jd: The journal descriptor
2188  * @lblock: The logical block at start of new extent
2189  * @dblock: The physical block at start of new extent
2190  * @blocks: Size of extent in fs blocks
2191  *
2192  * Returns: 0 on success or -ENOMEM
2193  */
2194
2195 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2196 {
2197         struct gfs2_journal_extent *jext;
2198
2199         if (!list_empty(&jd->extent_list)) {
2200                 jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2201                 if ((jext->dblock + jext->blocks) == dblock) {
2202                         jext->blocks += blocks;
2203                         return 0;
2204                 }
2205         }
2206
2207         jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2208         if (jext == NULL)
2209                 return -ENOMEM;
2210         jext->dblock = dblock;
2211         jext->lblock = lblock;
2212         jext->blocks = blocks;
2213         list_add_tail(&jext->list, &jd->extent_list);
2214         jd->nr_extents++;
2215         return 0;
2216 }
2217
2218 /**
2219  * gfs2_map_journal_extents - Cache journal bmap info
2220  * @sdp: The super block
2221  * @jd: The journal to map
2222  *
2223  * Create a reusable "extent" mapping from all logical
2224  * blocks to all physical blocks for the given journal.  This will save
2225  * us time when writing journal blocks.  Most journals will have only one
2226  * extent that maps all their logical blocks.  That's because gfs2.mkfs
2227  * arranges the journal blocks sequentially to maximize performance.
2228  * So the extent would map the first block for the entire file length.
2229  * However, gfs2_jadd can happen while file activity is happening, so
2230  * those journals may not be sequential.  Less likely is the case where
2231  * the users created their own journals by mounting the metafs and
2232  * laying it out.  But it's still possible.  These journals might have
2233  * several extents.
2234  *
2235  * Returns: 0 on success, or error on failure
2236  */
2237
2238 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2239 {
2240         u64 lblock = 0;
2241         u64 lblock_stop;
2242         struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2243         struct buffer_head bh;
2244         unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2245         u64 size;
2246         int rc;
2247         ktime_t start, end;
2248
2249         start = ktime_get();
2250         lblock_stop = i_size_read(jd->jd_inode) >> shift;
2251         size = (lblock_stop - lblock) << shift;
2252         jd->nr_extents = 0;
2253         WARN_ON(!list_empty(&jd->extent_list));
2254
2255         do {
2256                 bh.b_state = 0;
2257                 bh.b_blocknr = 0;
2258                 bh.b_size = size;
2259                 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2260                 if (rc || !buffer_mapped(&bh))
2261                         goto fail;
2262                 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2263                 if (rc)
2264                         goto fail;
2265                 size -= bh.b_size;
2266                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2267         } while(size > 0);
2268
2269         end = ktime_get();
2270         fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2271                 jd->nr_extents, ktime_ms_delta(end, start));
2272         return 0;
2273
2274 fail:
2275         fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2276                 rc, jd->jd_jid,
2277                 (unsigned long long)(i_size_read(jd->jd_inode) - size),
2278                 jd->nr_extents);
2279         fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2280                 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2281                 bh.b_state, (unsigned long long)bh.b_size);
2282         gfs2_free_journal_extents(jd);
2283         return rc;
2284 }
2285
2286 /**
2287  * gfs2_write_alloc_required - figure out if a write will require an allocation
2288  * @ip: the file being written to
2289  * @offset: the offset to write to
2290  * @len: the number of bytes being written
2291  *
2292  * Returns: 1 if an alloc is required, 0 otherwise
2293  */
2294
2295 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2296                               unsigned int len)
2297 {
2298         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2299         struct buffer_head bh;
2300         unsigned int shift;
2301         u64 lblock, lblock_stop, size;
2302         u64 end_of_file;
2303
2304         if (!len)
2305                 return 0;
2306
2307         if (gfs2_is_stuffed(ip)) {
2308                 if (offset + len > gfs2_max_stuffed_size(ip))
2309                         return 1;
2310                 return 0;
2311         }
2312
2313         shift = sdp->sd_sb.sb_bsize_shift;
2314         BUG_ON(gfs2_is_dir(ip));
2315         end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2316         lblock = offset >> shift;
2317         lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2318         if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2319                 return 1;
2320
2321         size = (lblock_stop - lblock) << shift;
2322         do {
2323                 bh.b_state = 0;
2324                 bh.b_size = size;
2325                 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2326                 if (!buffer_mapped(&bh))
2327                         return 1;
2328                 size -= bh.b_size;
2329                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2330         } while(size > 0);
2331
2332         return 0;
2333 }
2334
2335 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2336 {
2337         struct gfs2_inode *ip = GFS2_I(inode);
2338         struct buffer_head *dibh;
2339         int error;
2340
2341         if (offset >= inode->i_size)
2342                 return 0;
2343         if (offset + length > inode->i_size)
2344                 length = inode->i_size - offset;
2345
2346         error = gfs2_meta_inode_buffer(ip, &dibh);
2347         if (error)
2348                 return error;
2349         gfs2_trans_add_meta(ip->i_gl, dibh);
2350         memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2351                length);
2352         brelse(dibh);
2353         return 0;
2354 }
2355
2356 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2357                                          loff_t length)
2358 {
2359         struct gfs2_sbd *sdp = GFS2_SB(inode);
2360         loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2361         int error;
2362
2363         while (length) {
2364                 struct gfs2_trans *tr;
2365                 loff_t chunk;
2366                 unsigned int offs;
2367
2368                 chunk = length;
2369                 if (chunk > max_chunk)
2370                         chunk = max_chunk;
2371
2372                 offs = offset & ~PAGE_MASK;
2373                 if (offs && chunk > PAGE_SIZE)
2374                         chunk = offs + ((chunk - offs) & PAGE_MASK);
2375
2376                 truncate_pagecache_range(inode, offset, chunk);
2377                 offset += chunk;
2378                 length -= chunk;
2379
2380                 tr = current->journal_info;
2381                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2382                         continue;
2383
2384                 gfs2_trans_end(sdp);
2385                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2386                 if (error)
2387                         return error;
2388         }
2389         return 0;
2390 }
2391
2392 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2393 {
2394         struct inode *inode = file_inode(file);
2395         struct gfs2_inode *ip = GFS2_I(inode);
2396         struct gfs2_sbd *sdp = GFS2_SB(inode);
2397         unsigned int blocksize = i_blocksize(inode);
2398         loff_t start, end;
2399         int error;
2400
2401         if (!gfs2_is_stuffed(ip)) {
2402                 unsigned int start_off, end_len;
2403
2404                 start_off = offset & (blocksize - 1);
2405                 end_len = (offset + length) & (blocksize - 1);
2406                 if (start_off) {
2407                         unsigned int len = length;
2408                         if (length > blocksize - start_off)
2409                                 len = blocksize - start_off;
2410                         error = gfs2_block_zero_range(inode, offset, len);
2411                         if (error)
2412                                 goto out;
2413                         if (start_off + length < blocksize)
2414                                 end_len = 0;
2415                 }
2416                 if (end_len) {
2417                         error = gfs2_block_zero_range(inode,
2418                                 offset + length - end_len, end_len);
2419                         if (error)
2420                                 goto out;
2421                 }
2422         }
2423
2424         start = round_down(offset, blocksize);
2425         end = round_up(offset + length, blocksize) - 1;
2426         error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2427         if (error)
2428                 return error;
2429
2430         if (gfs2_is_jdata(ip))
2431                 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2432                                          GFS2_JTRUNC_REVOKES);
2433         else
2434                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2435         if (error)
2436                 return error;
2437
2438         if (gfs2_is_stuffed(ip)) {
2439                 error = stuffed_zero_range(inode, offset, length);
2440                 if (error)
2441                         goto out;
2442         }
2443
2444         if (gfs2_is_jdata(ip)) {
2445                 BUG_ON(!current->journal_info);
2446                 gfs2_journaled_truncate_range(inode, offset, length);
2447         } else
2448                 truncate_pagecache_range(inode, offset, offset + length - 1);
2449
2450         file_update_time(file);
2451         mark_inode_dirty(inode);
2452
2453         if (current->journal_info)
2454                 gfs2_trans_end(sdp);
2455
2456         if (!gfs2_is_stuffed(ip))
2457                 error = punch_hole(ip, offset, length);
2458
2459 out:
2460         if (current->journal_info)
2461                 gfs2_trans_end(sdp);
2462         return error;
2463 }
2464
2465 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2466                 loff_t offset)
2467 {
2468         int ret;
2469
2470         if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2471                 return -EIO;
2472
2473         if (offset >= wpc->iomap.offset &&
2474             offset < wpc->iomap.offset + wpc->iomap.length)
2475                 return 0;
2476
2477         memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2478         ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
2479         return ret;
2480 }
2481
/* iomap writeback operations; block mapping is done by gfs2_map_blocks(). */
const struct iomap_writeback_ops gfs2_writeback_ops = {
        .map_blocks             = gfs2_map_blocks,
};