2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
10 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12 #include <linux/slab.h>
13 #include <linux/spinlock.h>
14 #include <linux/completion.h>
15 #include <linux/buffer_head.h>
17 #include <linux/gfs2_ondisk.h>
18 #include <linux/prefetch.h>
19 #include <linux/blkdev.h>
20 #include <linux/rbtree.h>
21 #include <linux/random.h>
36 #include "trace_gfs2.h"
39 #define BFITNOENT ((u32)~0)
40 #define NO_BLOCK ((u64)~0)
42 #if BITS_PER_LONG == 32
43 #define LBITMASK (0x55555555UL)
44 #define LBITSKIP55 (0x55555555UL)
45 #define LBITSKIP00 (0x00000000UL)
47 #define LBITMASK (0x5555555555555555UL)
48 #define LBITSKIP55 (0x5555555555555555UL)
49 #define LBITSKIP00 (0x0000000000000000UL)
53 * These routines are used by the resource group routines (rgrp.c)
54 * to keep track of block allocation. Each block is represented by two
55 * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks.
58 * 1 = Used (not metadata)
59 * 2 = Unlinked (still in use) inode
/* Table of legal two-bit block-state transitions, indexed by
 * new_state * 4 + cur_state (see gfs2_setbit). Initializer values
 * are elided from this listing. */
68 static const char valid_change[16] = {
/* Forward declaration: bitmap search helper used by the allocation paths below. */
76 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
77 const struct gfs2_inode *ip, bool nowrap);
81 * gfs2_setbit - Set a bit in the bitmaps
82 * @rbm: The position of the bit to set
83 * @do_clone: Also set the clone bitmap, if it exists
84 * @new_state: the new state of the block
88 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
89 unsigned char new_state)
91 unsigned char *byte1, *byte2, *end, cur_state;
92 struct gfs2_bitmap *bi = rbm_bi(rbm);
93 unsigned int buflen = bi->bi_len;
/* Each block is GFS2_BIT_SIZE (2) bits; compute the bit offset within its byte. */
94 const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
96 byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
97 end = bi->bi_bh->b_data + bi->bi_offset + buflen;
/* Current two-bit state of the target block. */
101 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
/* Disallowed transition: dump diagnostics and mark the rgrp inconsistent. */
103 if (unlikely(!valid_change[new_state * 4 + cur_state])) {
104 pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
105 rbm->offset, cur_state, new_state);
106 pr_warn("rgrp=0x%llx bi_start=0x%x\n",
107 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
108 pr_warn("bi_offset=0x%x bi_len=0x%x\n",
109 bi->bi_offset, bi->bi_len);
111 gfs2_consist_rgrpd(rbm->rgd);
/* XOR flips only the bits that differ within the 2-bit field. */
114 *byte1 ^= (cur_state ^ new_state) << bit;
/* Mirror the change into the clone bitmap, when requested and present. */
116 if (do_clone && bi->bi_clone) {
117 byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
118 cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
119 *byte2 ^= (cur_state ^ new_state) << bit;
124 * gfs2_testbit - test a bit in the bitmaps
125 * @rbm: The bit to test
126 * @use_clone: If true, test the clone bitmap, not the official bitmap.
128 * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
129 * not the "real" bitmaps, to avoid allocating recently freed blocks.
131 * Returns: The two bit block state of the requested bit
134 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
136 struct gfs2_bitmap *bi = rbm_bi(rbm);
/* Prefer the clone bitmap when asked for and one exists; fall back to b_data. */
141 if (use_clone && bi->bi_clone)
142 buffer = bi->bi_clone;
144 buffer = bi->bi_bh->b_data;
145 buffer += bi->bi_offset;
146 byte = buffer + (rbm->offset / GFS2_NBBY);
147 bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
149 return (*byte >> bit) & GFS2_BIT_MASK;
154 * @ptr: Pointer to bitmap data
155 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
156 * @state: The state we are searching for
158 * We xor the bitmap data with a pattern which is the bitwise opposite
159 * of what we are looking for, this gives rise to a pattern of ones
160 * wherever there is a match. Since we have two bits per entry, we
161 * take this pattern, shift it down by one place and then and it with
162 * the original. All the even bit positions (0,2,4, etc) then represent
163 * successful matches, so we mask with 0x55555..... to remove the unwanted
166 * This allows searching of a whole u64 at once (32 blocks) with a
167 * single test (on 64 bit arches).
170 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
/* Per-state XOR pattern: the bitwise opposite of the 2-bit state we seek,
 * replicated across the whole 64-bit word. */
173 static const u64 search[] = {
174 [0] = 0xffffffffffffffffULL,
175 [1] = 0xaaaaaaaaaaaaaaaaULL,
176 [2] = 0x5555555555555555ULL,
177 [3] = 0x0000000000000000ULL,
/* Ones appear wherever a bit of *ptr matches the sought state. */
179 tmp = le64_to_cpu(*ptr) ^ search[state];
186 * rs_cmp - multi-block reservation range compare
187 * @blk: absolute file system block number of the new reservation
188 * @len: number of blocks in the new reservation
189 * @rs: existing reservation to compare against
191 * returns: 1 if the block range is beyond the reach of the reservation
192 * -1 if the block range is before the start of the reservation
193 * 0 if the block range overlaps with the reservation
195 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
197 u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
/* Range starts at or past the end of the reservation. */
199 if (blk >= startblk + rs->rs_free)
/* Range ends before the reservation starts. */
201 if (blk + len - 1 < startblk)
207 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
208 * a block in a given allocation state.
209 * @buf: the buffer that holds the bitmaps
210 * @len: the length (in bytes) of the buffer
211 * @goal: start search at this block's bit-pair (within @buffer)
212 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
214 * Scope of @goal and returned block number is only within this bitmap buffer,
215 * not entire rgrp or filesystem. @buffer will be offset from the actual
216 * beginning of a bitmap block buffer, skipping any header structures, but
217 * headers are always a multiple of 64 bits long so that the buffer is
218 * always aligned to a 64 bit boundary.
220 * The size of the buffer is in bytes, but it is assumed that it is
221 * always ok to read a complete multiple of 64 bits at the end
222 * of the block in case the end is not aligned to a natural boundary.
224 * Return: the block number (bitmap buffer scope) that was found
227 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
/* spoint: bit offset of @goal within its containing 64-bit word. */
230 u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
/* goal >> 5: 32 blocks (2 bits each) per u64 word. */
231 const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
232 const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
234 u64 mask = 0x5555555555555555ULL;
237 /* Mask off bits we don't care about at the start of the search */
239 tmp = gfs2_bit_search(ptr, mask, state);
241 while(tmp == 0 && ptr < end) {
242 tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
245 /* Mask off any bits which are more than len bytes from the start */
246 if (ptr == end && (len & (sizeof(u64) - 1)))
247 tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
248 /* Didn't find anything, so return */
253 bit /= 2; /* two bits per entry in the bitmap */
254 return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
258 * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
259 * @rbm: The rbm with rgd already set correctly
260 * @block: The block number (filesystem relative)
262 * This sets the bi and offset members of an rbm based on a
263 * resource group and a filesystem relative block number. The
264 * resource group must be set in the rbm on entry, the bi and
265 * offset members will be set by this function.
267 * Returns: 0 on success, or an error code
270 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
/* Offset of @block relative to the first data block of the rgrp. */
272 u64 rblock = block - rbm->rgd->rd_data0;
274 if (WARN_ON_ONCE(rblock > UINT_MAX))
276 if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
280 rbm->offset = (u32)(rblock);
281 /* Check if the block is within the first block */
282 if (rbm->offset < rbm_bi(rbm)->bi_blocks)
285 /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
286 rbm->offset += (sizeof(struct gfs2_rgrp) -
287 sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
/* Split the adjusted offset into a bitmap index plus offset within it. */
288 rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
289 rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
294 * gfs2_rbm_incr - increment an rbm structure
295 * @rbm: The rbm with rgd already set correctly
297 * This function takes an existing rbm structure and increments it to the next
298 * viable block offset.
300 * Returns: If incrementing the offset would cause the rbm to go past the
301 * end of the rgrp, true is returned, otherwise false.
305 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
307 if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
311 if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
320 * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
321 * @rbm: Position to search (value/result)
322 * @n_unaligned: Number of unaligned blocks to check
323 * @len: Decremented for each block found (terminate on zero)
325 * Returns: true if a non-free block is encountered
328 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
333 for (n = 0; n < n_unaligned; n++) {
/* Test the clone bitmap so recently-freed blocks don't count as free. */
334 res = gfs2_testbit(rbm, true);
335 if (res != GFS2_BLKST_FREE)
/* Stop at the rgrp boundary. */
340 if (gfs2_rbm_incr(rbm))
348 * gfs2_free_extlen - Return extent length of free blocks
349 * @rrbm: Starting position
350 * @len: Max length to check
352 * Starting at the block specified by the rbm, see how many free blocks
353 * there are, not reading more than len blocks ahead. This can be done
354 * using memchr_inv when the blocks are byte aligned, but has to be done
355 * on a block by block basis in case of unaligned blocks. Also this
356 * function can cope with bitmap boundaries (although it must stop on
357 * a resource group boundary)
359 * Returns: Number of free blocks in the extent
362 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
/* Work on a local copy so the caller's rbm is left untouched. */
364 struct gfs2_rbm rbm = *rrbm;
/* Blocks before the next byte boundary (4 blocks per byte). */
365 u32 n_unaligned = rbm.offset & 3;
369 u8 *ptr, *start, *end;
371 struct gfs2_bitmap *bi;
/* Step block-by-block until byte aligned; bail on a non-free block. */
374 gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
377 n_unaligned = len & 3;
378 /* Start is now byte aligned */
/* Prefer the clone bitmap when present (see gfs2_unaligned_extlen). */
381 start = bi->bi_bh->b_data;
383 start = bi->bi_clone;
384 start += bi->bi_offset;
385 end = start + bi->bi_len;
386 BUG_ON(rbm.offset & 3);
387 start += (rbm.offset / GFS2_NBBY);
388 bytes = min_t(u32, len / GFS2_NBBY, (end - start));
/* A zero byte means four free blocks; first non-zero byte ends the run. */
389 ptr = memchr_inv(start, 0, bytes);
390 chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
391 chunk_size *= GFS2_NBBY;
392 BUG_ON(len < chunk_size);
/* Advance the rbm past the free chunk; stop at the rgrp boundary. */
394 block = gfs2_rbm_to_block(&rbm);
395 if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
403 n_unaligned = len & 3;
406 /* Deal with any bits left over at the end */
408 gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
414 * gfs2_bitcount - count the number of bits in a certain state
415 * @rgd: the resource group descriptor
416 * @buffer: the buffer that holds the bitmaps
417 * @buflen: the length (in bytes) of the buffer
418 * @state: the state of the block we're looking for
420 * Returns: The number of bits
423 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
424 unsigned int buflen, u8 state)
426 const u8 *byte = buffer;
427 const u8 *end = buffer + buflen;
/* Pre-shift the sought state to each of the four 2-bit positions in a byte. */
428 const u8 state1 = state << 2;
429 const u8 state2 = state << 4;
430 const u8 state3 = state << 6;
/* Compare every bit-pair of every byte against the sought state. */
433 for (; byte < end; byte++) {
434 if (((*byte) & 0x03) == state)
436 if (((*byte) & 0x0C) == state1)
438 if (((*byte) & 0x30) == state2)
440 if (((*byte) & 0xC0) == state3)
448 * gfs2_rgrp_verify - Verify that a resource group is consistent
/* Cross-checks the bitmap population counts against the rgrp's summary
 * fields (rd_free, rd_dinodes) and reports any mismatch. */
453 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
455 struct gfs2_sbd *sdp = rgd->rd_sbd;
456 struct gfs2_bitmap *bi = NULL;
457 u32 length = rgd->rd_length;
461 memset(count, 0, 4 * sizeof(u32));
463 /* Count # blocks in each of 4 possible allocation states */
464 for (buf = 0; buf < length; buf++) {
465 bi = rgd->rd_bits + buf;
466 for (x = 0; x < 4; x++)
467 count[x] += gfs2_bitcount(rgd,
/* Free-block count must match the rgrp header. */
473 if (count[0] != rgd->rd_free) {
474 if (gfs2_consist_rgrpd(rgd))
475 fs_err(sdp, "free data mismatch: %u != %u\n",
476 count[0], rgd->rd_free);
/* Used (non-dinode) blocks = total - free - dinodes. */
480 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
481 if (count[1] != tmp) {
482 if (gfs2_consist_rgrpd(rgd))
483 fs_err(sdp, "used data mismatch: %u != %u\n",
/* Unlinked + dinode states together must equal the dinode count. */
488 if (count[2] + count[3] != rgd->rd_dinodes) {
489 if (gfs2_consist_rgrpd(rgd))
490 fs_err(sdp, "used metadata mismatch: %u != %u\n",
491 count[2] + count[3], rgd->rd_dinodes);
497 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
498 * @sdp: The GFS2 superblock
499 * @blk: The data block number
500 * @exact: True if this needs to be an exact match
502 * The @exact argument should be set to true by most callers. The exception
503 * is when we need to match blocks which are not represented by the rgrp
504 * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
505 * there for alignment purposes. Another way of looking at it is that @exact
506 * matches only valid data/metadata blocks, but with @exact false, it will
507 * match any block within the extent of the rgrp.
509 * Returns: The resource group, or NULL if not found
512 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
514 struct rb_node *n, *next;
515 struct gfs2_rgrpd *cur;
/* Binary search of the rindex rb-tree, protected by sd_rindex_spin. */
517 spin_lock(&sdp->sd_rindex_spin);
518 n = sdp->sd_rindex_tree.rb_node;
520 cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
522 if (blk < cur->rd_addr)
524 else if (blk >= cur->rd_data0 + cur->rd_data)
527 spin_unlock(&sdp->sd_rindex_spin);
/* With @exact, reject blocks outside the bitmap-covered data range. */
529 if (blk < cur->rd_addr)
531 if (blk >= cur->rd_data0 + cur->rd_data)
538 spin_unlock(&sdp->sd_rindex_spin);
544 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
545 * @sdp: The GFS2 superblock
547 * Returns: The first rgrp in the filesystem
550 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
552 const struct rb_node *n;
553 struct gfs2_rgrpd *rgd;
/* Leftmost node of the rindex tree is the lowest-addressed rgrp. */
555 spin_lock(&sdp->sd_rindex_spin);
556 n = rb_first(&sdp->sd_rindex_tree);
557 rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
558 spin_unlock(&sdp->sd_rindex_spin);
564 * gfs2_rgrpd_get_next - get the next RG
565 * @rgd: the resource group descriptor
567 * Returns: The next rgrp
570 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
572 struct gfs2_sbd *sdp = rgd->rd_sbd;
573 const struct rb_node *n;
575 spin_lock(&sdp->sd_rindex_spin);
576 n = rb_next(&rgd->rd_node);
/* Wrap around to the first rgrp when we fall off the end of the tree. */
578 n = rb_first(&sdp->sd_rindex_tree);
/* Single-rgrp filesystem: wrapping found ourselves again. */
580 if (unlikely(&rgd->rd_node == n)) {
581 spin_unlock(&sdp->sd_rindex_spin);
584 rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
585 spin_unlock(&sdp->sd_rindex_spin);
/*
 * check_and_update_goal - reset an inode's allocation goal block
 * @ip: the inode to check
 *
 * If the goal is unset, or no resource group contains it any more,
 * fall back to the inode's own address as the goal.
 */
589 void check_and_update_goal(struct gfs2_inode *ip)
591 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
592 if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
593 ip->i_goal = ip->i_no_addr;
/*
 * gfs2_free_clones - walk every bitmap of @rgd and release its clone buffer
 * (loop body partially elided in this listing; presumably frees bi->bi_clone)
 */
596 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
600 for (x = 0; x < rgd->rd_length; x++) {
601 struct gfs2_bitmap *bi = rgd->rd_bits + x;
608 * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
609 * plus a quota allocations data structure, if necessary
610 * @ip: the inode for this reservation
/* Thin wrapper: the quota-allocation helper does all the work. */
612 int gfs2_rsqa_alloc(struct gfs2_inode *ip)
614 return gfs2_qa_alloc(ip);
/*
 * dump_rs - print one multi-block reservation (inode address, start block,
 * bitmap offset and free count) to a debug seq_file.
 */
617 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
/* The reservation is embedded in the inode; recover the inode from it. */
619 struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
621 gfs2_print_dbg(seq, " B: n:%llu s:%llu b:%u f:%u\n",
622 (unsigned long long)ip->i_no_addr,
623 (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
624 rs->rs_rbm.offset, rs->rs_free);
628 * __rs_deltree - remove a multi-block reservation from the rgd tree
629 * @rs: The reservation to remove
/* NOTE(review): name suggests the caller holds rd_rsspin — see gfs2_rs_deltree. */
632 static void __rs_deltree(struct gfs2_blkreserv *rs)
634 struct gfs2_rgrpd *rgd;
636 if (!gfs2_rs_active(rs))
639 rgd = rs->rs_rbm.rgd;
640 trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
641 rb_erase(&rs->rs_node, &rgd->rd_rstree);
642 RB_CLEAR_NODE(&rs->rs_node);
645 struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
647 /* return reserved blocks to the rgrp */
648 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
649 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
650 /* The rgrp extent failure point is likely not to increase;
651 it will only do so if the freed blocks are somehow
652 contiguous with a span of free blocks that follows. Still,
653 it will force the number to be recalculated later. */
654 rgd->rd_extfail_pt += rs->rs_free;
/* Bitmap may have free space again; clear its "full" hint. */
656 clear_bit(GBF_FULL, &bi->bi_flags);
661 * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
662 * @rs: The reservation to remove
/* Locked wrapper around __rs_deltree: takes rd_rsspin for the removal. */
665 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
667 struct gfs2_rgrpd *rgd;
669 rgd = rs->rs_rbm.rgd;
671 spin_lock(&rgd->rd_rsspin);
674 spin_unlock(&rgd->rd_rsspin);
679 * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
680 * @ip: The inode for this reservation
681 * @wcount: The inode's write count, or NULL
684 void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
686 down_write(&ip->i_rw_mutex);
/* Only drop the reservation when no other writer still holds the inode. */
687 if ((wcount == NULL) || (atomic_read(wcount) <= 1))
688 gfs2_rs_deltree(&ip->i_res);
689 up_write(&ip->i_rw_mutex);
690 gfs2_qa_delete(ip, wcount);
694 * return_all_reservations - return all reserved blocks back to the rgrp.
695 * @rgd: the rgrp that needs its space back
697 * We previously reserved a bunch of blocks for allocation. Now we need to
698 * give them back. This leaves the reservation structures intact, but removes
699 * all of their corresponding "no-fly zones".
701 static void return_all_reservations(struct gfs2_rgrpd *rgd)
704 struct gfs2_blkreserv *rs;
/* Repeatedly remove the tree's first entry until it is empty. */
706 spin_lock(&rgd->rd_rsspin);
707 while ((n = rb_first(&rgd->rd_rstree))) {
708 rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
711 spin_unlock(&rgd->rd_rsspin);
/*
 * gfs2_clear_rgrpd - tear down every in-core resource group descriptor
 * @sdp: the filesystem
 *
 * Empties the rindex rb-tree, detaching each rgrp from its glock and
 * freeing its clone bitmaps, reservations and the descriptor itself.
 */
714 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
717 struct gfs2_rgrpd *rgd;
718 struct gfs2_glock *gl;
720 while ((n = rb_first(&sdp->sd_rindex_tree))) {
721 rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
724 rb_erase(n, &sdp->sd_rindex_tree);
727 glock_clear_object(gl, rgd);
731 gfs2_free_clones(rgd);
734 return_all_reservations(rgd);
735 kmem_cache_free(gfs2_rgrpd_cachep, rgd);
/* Debug helper: print an rgrp's rindex fields to the kernel log. */
739 static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
741 pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
742 pr_info("ri_length = %u\n", rgd->rd_length);
743 pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
744 pr_info("ri_data = %u\n", rgd->rd_data);
745 pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
749 * compute_bitstructs - Compute the bitmap sizes
750 * @rgd: The resource group descriptor
752 * Calculates bitmap descriptors, one for each block that contains bitmap data
757 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
759 struct gfs2_sbd *sdp = rgd->rd_sbd;
760 struct gfs2_bitmap *bi;
761 u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
762 u32 bytes_left, bytes;
768 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
772 bytes_left = rgd->rd_bitbytes;
774 for (x = 0; x < length; x++) {
775 bi = rgd->rd_bits + x;
778 /* small rgrp; bitmap stored completely in header block */
/* Header block carries a gfs2_rgrp structure before the bitmap data. */
781 bi->bi_offset = sizeof(struct gfs2_rgrp);
784 bi->bi_blocks = bytes * GFS2_NBBY;
/* First of several blocks: header block filled to the end. */
787 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
788 bi->bi_offset = sizeof(struct gfs2_rgrp);
791 bi->bi_blocks = bytes * GFS2_NBBY;
/* Last block: takes whatever bytes remain after a meta header. */
793 } else if (x + 1 == length) {
795 bi->bi_offset = sizeof(struct gfs2_meta_header);
796 bi->bi_start = rgd->rd_bitbytes - bytes_left;
798 bi->bi_blocks = bytes * GFS2_NBBY;
/* Middle blocks: full block minus the meta header. */
801 bytes = sdp->sd_sb.sb_bsize -
802 sizeof(struct gfs2_meta_header);
803 bi->bi_offset = sizeof(struct gfs2_meta_header);
804 bi->bi_start = rgd->rd_bitbytes - bytes_left;
806 bi->bi_blocks = bytes * GFS2_NBBY;
813 gfs2_consist_rgrpd(rgd);
/* Sanity check: the last bitmap must end exactly at rd_data blocks. */
816 bi = rgd->rd_bits + (length - 1);
817 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) {
818 if (gfs2_consist_rgrpd(rgd)) {
819 gfs2_rindex_print(rgd);
820 fs_err(sdp, "start=%u len=%u offset=%u\n",
821 bi->bi_start, bi->bi_len, bi->bi_offset);
830 * gfs2_ri_total - Total up the file system space, according to the rindex.
831 * @sdp: the filesystem
834 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
837 struct inode *inode = sdp->sd_rindex;
838 struct gfs2_inode *ip = GFS2_I(inode);
839 char buf[sizeof(struct gfs2_rindex)];
/* Read rindex entries sequentially until EOF, summing ri_data. */
842 for (rgrps = 0;; rgrps++) {
843 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
845 if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
847 error = gfs2_internal_read(ip, buf, &pos,
848 sizeof(struct gfs2_rindex));
/* Short read (or error) ends the scan. */
849 if (error != sizeof(struct gfs2_rindex))
851 total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
/*
 * rgd_insert - link @rgd into the rindex rb-tree, ordered by rd_addr.
 * Called with sd_rindex_spin held (see read_rindex_entry). Duplicate
 * handling is elided from this listing.
 */
856 static int rgd_insert(struct gfs2_rgrpd *rgd)
858 struct gfs2_sbd *sdp = rgd->rd_sbd;
859 struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
861 /* Figure out where to put new node */
863 struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
867 if (rgd->rd_addr < cur->rd_addr)
868 newn = &((*newn)->rb_left);
869 else if (rgd->rd_addr > cur->rd_addr)
870 newn = &((*newn)->rb_right);
875 rb_link_node(&rgd->rd_node, parent, newn);
876 rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
882 * read_rindex_entry - Pull in a new resource index entry from the disk
883 * @ip: Pointer to the rindex inode
885 * Returns: 0 on success, > 0 on EOF, error code otherwise
888 static int read_rindex_entry(struct gfs2_inode *ip)
890 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
891 const unsigned bsize = sdp->sd_sb.sb_bsize;
/* Next unread entry: one gfs2_rindex record per known rgrp so far. */
892 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
893 struct gfs2_rindex buf;
895 struct gfs2_rgrpd *rgd;
897 if (pos >= i_size_read(&ip->i_inode))
900 error = gfs2_internal_read(ip, (char *)&buf, &pos,
901 sizeof(struct gfs2_rindex));
/* A clean zero-length read means EOF (returned as 1). */
903 if (error != sizeof(struct gfs2_rindex))
904 return (error == 0) ? 1 : error;
906 rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
/* Decode the on-disk (big-endian) rindex entry into the descriptor. */
912 rgd->rd_addr = be64_to_cpu(buf.ri_addr);
913 rgd->rd_length = be32_to_cpu(buf.ri_length);
914 rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
915 rgd->rd_data = be32_to_cpu(buf.ri_data);
916 rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
917 spin_lock_init(&rgd->rd_rsspin);
919 error = compute_bitstructs(rgd);
923 error = gfs2_glock_get(sdp, rgd->rd_addr,
924 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
/* The glock's lock value block caches the rgrp summary. */
928 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
929 rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
930 if (rgd->rd_data > sdp->sd_max_rg_data)
931 sdp->sd_max_rg_data = rgd->rd_data;
932 spin_lock(&sdp->sd_rindex_spin);
933 error = rgd_insert(rgd);
934 spin_unlock(&sdp->sd_rindex_spin);
936 glock_set_object(rgd->rd_gl, rgd);
/* Page-aligned VM span covering the whole rgrp for address-space ops. */
937 rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
938 rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
939 rgd->rd_length) * bsize) - 1;
943 error = 0; /* someone else read in the rgrp; free it and ignore it */
944 gfs2_glock_put(rgd->rd_gl);
949 kmem_cache_free(gfs2_rgrpd_cachep, rgd);
954 * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
955 * @sdp: the GFS2 superblock
957 * The purpose of this function is to select a subset of the resource groups
958 * and mark them as PREFERRED. We do it in such a way that each node prefers
959 * to use a unique set of rgrps to minimize glock contention.
961 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
963 struct gfs2_rgrpd *rgd, *first;
966 /* Skip an initial number of rgrps, based on this node's journal ID.
967 That should start each node out on its own set. */
968 rgd = gfs2_rgrpd_get_first(sdp);
969 for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
970 rgd = gfs2_rgrpd_get_next(rgd);
/* Mark every sd_journals-th rgrp as preferred until we wrap around. */
974 rgd->rd_flags |= GFS2_RDF_PREFERRED;
975 for (i = 0; i < sdp->sd_journals; i++) {
976 rgd = gfs2_rgrpd_get_next(rgd);
977 if (!rgd || rgd == first)
980 } while (rgd && rgd != first);
984 * gfs2_ri_update - Pull in a new resource index from the disk
985 * @ip: pointer to the rindex inode
987 * Returns: 0 on successful update, error code otherwise
990 static int gfs2_ri_update(struct gfs2_inode *ip)
992 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
/* Keep reading entries until EOF (>0) or an error (<0). */
996 error = read_rindex_entry(ip);
997 } while (error == 0);
1002 set_rgrp_preferences(sdp);
1004 sdp->sd_rindex_uptodate = 1;
1009 * gfs2_rindex_update - Update the rindex if required
1010 * @sdp: The GFS2 superblock
1012 * We grab a lock on the rindex inode to make sure that it doesn't
1013 * change whilst we are performing an operation. We keep this lock
1014 * for quite long periods of time compared to other locks. This
1015 * doesn't matter, since it is shared and it is very, very rarely
1016 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1018 * This makes sure that we're using the latest copy of the resource index
1019 * special file, which might have been updated if someone expanded the
1020 * filesystem (via gfs2_grow utility), which adds new resource groups.
1022 * Returns: 0 on success, error code otherwise
1025 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1027 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1028 struct gfs2_glock *gl = ip->i_gl;
1029 struct gfs2_holder ri_gh;
1031 int unlock_required = 0;
1033 /* Read new copy from disk if we don't have the latest */
1034 if (!sdp->sd_rindex_uptodate) {
/* Take the rindex glock shared unless this task already holds it. */
1035 if (!gfs2_glock_is_locked_by_me(gl)) {
1036 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1039 unlock_required = 1;
/* Re-check under the lock: another task may have updated it already. */
1041 if (!sdp->sd_rindex_uptodate)
1042 error = gfs2_ri_update(ip);
1043 if (unlock_required)
1044 gfs2_glock_dq_uninit(&ri_gh);
/*
 * gfs2_rgrp_in - populate in-core rgrp fields from the on-disk gfs2_rgrp.
 * Flag bits reserved for in-core use (GFS2_RDF_MASK) are preserved.
 */
1050 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1052 const struct gfs2_rgrp *str = buf;
1055 rg_flags = be32_to_cpu(str->rg_flags);
1056 rg_flags &= ~GFS2_RDF_MASK;
1057 rgd->rd_flags &= GFS2_RDF_MASK;
1058 rgd->rd_flags |= rg_flags;
1059 rgd->rd_free = be32_to_cpu(str->rg_free);
1060 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1061 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1062 /* rd_data0, rd_data and rd_bitbytes already set from rindex */
/*
 * gfs2_rgrp_ondisk2lvb - copy on-disk rgrp summary fields into the lock
 * value block. Fields stay in big-endian form; only the magic is set here.
 */
1065 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1067 const struct gfs2_rgrp *str = buf;
1069 rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1070 rgl->rl_flags = str->rg_flags;
1071 rgl->rl_free = str->rg_free;
1072 rgl->rl_dinodes = str->rg_dinodes;
1073 rgl->rl_igeneration = str->rg_igeneration;
/*
 * gfs2_rgrp_out - serialize the in-core rgrp into its on-disk form in @buf,
 * computing rg_skip and rg_crc, and refresh the LVB copy from the result.
 */
1077 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1079 struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1080 struct gfs2_rgrp *str = buf;
/* In-core-only flag bits are masked out before hitting disk. */
1083 str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1084 str->rg_free = cpu_to_be32(rgd->rd_free);
1085 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
/* rg_skip: distance to the next rgrp, when there is one after us. */
1088 else if (next->rd_addr > rgd->rd_addr)
1089 str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1090 str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1091 str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1092 str->rg_data = cpu_to_be32(rgd->rd_data);
1093 str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
/* CRC over the whole structure (crc field presumably zeroed beforehand —
 * the zeroing line is elided from this listing). */
1095 crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1096 str->rg_crc = cpu_to_be32(crc);
1098 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1099 gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
/*
 * gfs2_rgrp_lvb_valid - check the lock value block against the on-disk
 * rgrp header (both big-endian, so fields compare directly).
 */
1102 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1104 struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1105 struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1107 if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free ||
1108 rgl->rl_dinodes != str->rg_dinodes ||
1109 rgl->rl_igeneration != str->rg_igeneration)
/*
 * count_unlinked - count blocks in the UNLINKED state across all bitmaps
 * of @rgd by repeated gfs2_bitfit scans.
 */
1114 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1116 struct gfs2_bitmap *bi;
1117 const u32 length = rgd->rd_length;
1118 const u8 *buffer = NULL;
1119 u32 i, goal, count = 0;
1121 for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1123 buffer = bi->bi_bh->b_data + bi->bi_offset;
1124 WARN_ON(!buffer_uptodate(bi->bi_bh));
/* bi_len bytes * GFS2_NBBY blocks per byte bounds the goal. */
1125 while (goal < bi->bi_len * GFS2_NBBY) {
1126 goal = gfs2_bitfit(buffer, bi->bi_len, goal,
1127 GFS2_BLKST_UNLINKED);
1128 if (goal == BFITNOENT)
1140 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1141 * @rgd: the struct gfs2_rgrpd describing the RG to read in
1143 * Read in all of a Resource Group's header and bitmap blocks.
1144 * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
1149 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1151 struct gfs2_sbd *sdp = rgd->rd_sbd;
1152 struct gfs2_glock *gl = rgd->rd_gl;
1153 unsigned int length = rgd->rd_length;
1154 struct gfs2_bitmap *bi;
/* Already read in: nothing to do. */
1158 if (rgd->rd_bits[0].bi_bh != NULL)
/* Issue reads for all bitmap blocks first ... */
1161 for (x = 0; x < length; x++) {
1162 bi = rgd->rd_bits + x;
1163 error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
/* ... then wait for completion in reverse order and verify metatypes
 * (block 0 is the RG header, the rest are RB bitmap blocks). */
1168 for (y = length; y--;) {
1169 bi = rgd->rd_bits + y;
1170 error = gfs2_meta_wait(sdp, bi->bi_bh);
1173 if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1174 GFS2_METATYPE_RG)) {
/* First read since the glock was acquired: refresh in-core state. */
1180 if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1181 for (x = 0; x < length; x++)
1182 clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1183 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1184 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1185 rgd->rd_free_clone = rgd->rd_free;
1186 /* max out the rgrp allocation failure point */
1187 rgd->rd_extfail_pt = rgd->rd_free;
/* LVB not yet initialized (no magic): populate it from the header. */
1189 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1190 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1191 gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1192 rgd->rd_bits[0].bi_bh->b_data);
1194 else if (sdp->sd_args.ar_rgrplvb) {
1195 if (!gfs2_rgrp_lvb_valid(rgd)){
1196 gfs2_consist_rgrpd(rgd);
1200 if (rgd->rd_rgl->rl_unlinked == 0)
1201 rgd->rd_flags &= ~GFS2_RDF_CHECK;
/* Error path: release buffers acquired so far. */
1207 bi = rgd->rd_bits + x;
1210 gfs2_assert_warn(sdp, !bi->bi_clone);
/*
 * update_rgrp_lvb - refresh in-core rgrp state from the lock value block,
 * avoiding a disk read; falls back to gfs2_rgrp_bh_get when the LVB has
 * no valid magic.
 */
1216 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1220 if (rgd->rd_flags & GFS2_RDF_UPTODATE)
/* No magic in the LVB: it is unusable, read the blocks instead. */
1223 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1224 return gfs2_rgrp_bh_get(rgd);
1226 rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1227 rl_flags &= ~GFS2_RDF_MASK;
1228 rgd->rd_flags &= GFS2_RDF_MASK;
1229 rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1230 if (rgd->rd_rgl->rl_unlinked == 0)
1231 rgd->rd_flags &= ~GFS2_RDF_CHECK;
1232 rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1233 rgd->rd_free_clone = rgd->rd_free;
1234 rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1235 rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
/*
 * gfs2_rgrp_go_lock - glock "go" callback: make rgrp data available after
 * the lock is acquired. With GL_SKIP + rgrplvb mount option the bitmap
 * read is skipped (LVB data is used instead).
 */
1239 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1241 struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1242 struct gfs2_sbd *sdp = rgd->rd_sbd;
1244 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1246 return gfs2_rgrp_bh_get(rgd);
1250 * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1251 * @rgd: The resource group
1255 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1257 int x, length = rgd->rd_length;
/* Drop each bitmap's buffer head (release details elided in this view). */
1259 for (x = 0; x < length; x++) {
1260 struct gfs2_bitmap *bi = rgd->rd_bits + x;
1270 * gfs2_rgrp_go_unlock - Unlock a rgrp glock
1271 * @gh: The glock holder for the resource group
1275 void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
1277 struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
/* Bitwise OR is intentional: evaluate both flag tests unconditionally. */
1278 int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
1279 test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
/* Only drop the cached bitmaps when another node wants the lock. */
1281 if (rgd && demote_requested)
1282 gfs2_rgrp_brelse(rgd);
/*
 * gfs2_rgrp_send_discards - issue discard (TRIM) requests for freed blocks
 * @sdp: the filesystem
 * @offset: starting fs block of this rgrp's data
 * @bh: buffer holding the "real" bitmap, or NULL to discard all free blocks
 * @bi: the bitmap descriptor being processed
 * @minlen: minimum extent length (blocks) worth discarding
 * @ptrimmed: if non-NULL, receives the number of blocks trimmed
 *
 * Walks the bitmap byte by byte, coalescing runs of free blocks into
 * extents and handing them to sb_issue_discard(). On a discard error,
 * discards are disabled for the filesystem.
 */
1285 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1286 struct buffer_head *bh,
1287 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1289 struct super_block *sb = sdp->sd_vfs;
1292 sector_t nr_blks = 0;
1298 for (x = 0; x < bi->bi_len; x++) {
1299 const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1300 clone += bi->bi_offset;
/* With @bh: discard only blocks newly freed in this cycle (set free in
 * the real bitmap AND in the clone). Without: discard every free block. */
1303 const u8 *orig = bh->b_data + bi->bi_offset + x;
1304 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1306 diff = ~(*clone | (*clone >> 1));
/* Convert the bitmap byte position to an absolute fs block number. */
1311 blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1315 goto start_new_extent;
/* Non-contiguous block: flush the current extent if long enough. */
1316 if ((start + nr_blks) != blk) {
1317 if (nr_blks >= minlen) {
1318 rv = sb_issue_discard(sb,
/* Flush any final pending extent. */
1335 if (nr_blks >= minlen) {
1336 rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1342 *ptrimmed = trimmed;
/* Discard failed: warn once and turn discards off for this fs. */
1346 if (sdp->sd_args.ar_discard)
1347 fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1348 sdp->sd_args.ar_discard = 0;
1353 * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1354 * @filp: Any file on the filesystem
1355 * @argp: Pointer to the arguments (also used to pass result)
1357 * Returns: 0 on success, otherwise error code
/*
 * Implements the FITRIM ioctl: walks each rgrp in the user-supplied range,
 * locks it exclusively, discards its free extents, and marks it trimmed.
 * NOTE(review): several error-path and loop-control lines are elided in this
 * extract; code left byte-identical.
 */
1360 int gfs2_fitrim(struct file *filp, void __user *argp)
1362 struct inode *inode = file_inode(filp);
1363 struct gfs2_sbd *sdp = GFS2_SB(inode);
1364 struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1365 struct buffer_head *bh;
1366 struct gfs2_rgrpd *rgd;
1367 struct gfs2_rgrpd *rgd_end;
1368 struct gfs2_holder gh;
1369 struct fstrim_range r;
1373 u64 start, end, minlen;
1375 unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1377 if (!capable(CAP_SYS_ADMIN))
1380 if (!blk_queue_discard(q))
1383 if (copy_from_user(&r, argp, sizeof(r)))
1386 ret = gfs2_rindex_update(sdp);
/* Convert the byte-based fstrim_range into filesystem blocks. */
1390 start = r.start >> bs_shift;
1391 end = start + (r.len >> bs_shift);
1392 minlen = max_t(u64, r.minlen,
1393 q->limits.discard_granularity) >> bs_shift;
1395 if (end <= start || minlen > sdp->sd_max_rg_data)
1398 rgd = gfs2_blk2rgrpd(sdp, start, 0);
1399 rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1401 if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1402 && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1403 return -EINVAL; /* start is beyond the end of the fs */
1407 ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1411 if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1412 /* Trim each bitmap in the rgrp */
1413 for (x = 0; x < rgd->rd_length; x++) {
1414 struct gfs2_bitmap *bi = rgd->rd_bits + x;
/* bh == NULL: discard all free blocks, not just newly-freed ones. */
1415 ret = gfs2_rgrp_send_discards(sdp,
1416 rgd->rd_data0, NULL, bi, minlen,
1419 gfs2_glock_dq_uninit(&gh);
1425 /* Mark rgrp as having been trimmed */
1426 ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1428 bh = rgd->rd_bits[0].bi_bh;
1429 rgd->rd_flags |= GFS2_RGF_TRIMMED;
1430 gfs2_trans_add_meta(rgd->rd_gl, bh);
1431 gfs2_rgrp_out(rgd, bh->b_data);
1432 gfs2_trans_end(sdp);
1435 gfs2_glock_dq_uninit(&gh);
1440 rgd = gfs2_rgrpd_get_next(rgd);
/* Report back how many bytes were actually trimmed. */
1444 r.len = trimmed << bs_shift;
1445 if (copy_to_user(argp, &r, sizeof(r)))
1452 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1453 * @ip: the inode structure
/*
 * The tree is keyed by block range (rs_cmp on the reservation's first
 * filesystem block and length).  rd_rsspin protects both the tree and the
 * rd_reserved accounting.  NOTE(review): the loop header and overlap-error
 * branch are elided in this extract; code left byte-identical.
 */
1456 static void rs_insert(struct gfs2_inode *ip)
1458 struct rb_node **newn, *parent = NULL;
1460 struct gfs2_blkreserv *rs = &ip->i_res;
1461 struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1462 u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
/* Caller must not insert a reservation that is already in a tree. */
1464 BUG_ON(gfs2_rs_active(rs));
1466 spin_lock(&rgd->rd_rsspin);
1467 newn = &rgd->rd_rstree.rb_node;
1469 struct gfs2_blkreserv *cur =
1470 rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1473 rc = rs_cmp(fsblock, rs->rs_free, cur);
1475 newn = &((*newn)->rb_right);
1477 newn = &((*newn)->rb_left);
/* rc == 0 would mean an overlapping reservation (error path elided). */
1479 spin_unlock(&rgd->rd_rsspin);
1485 rb_link_node(&rs->rs_node, parent, newn);
1486 rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1488 /* Do our rgrp accounting for the reservation */
1489 rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1490 spin_unlock(&rgd->rd_rsspin);
1491 trace_gfs2_rs(rs, TRACE_RS_INSERT);
1495 * rgd_free - return the number of free blocks we can allocate.
1496 * @rgd: the resource group
1498 * This function returns the number of free blocks for an rgrp.
1499 * That's the clone-free blocks (blocks that are free, not including those
1500 * still being used for unlinked files that haven't been deleted.)
1502 * It also subtracts any blocks reserved by someone else, but does not
1503 * include free blocks that are still part of our current reservation,
1504 * because obviously we can (and will) allocate them.
/*
 * Both WARN_ON branches clamp inconsistent accounting instead of going
 * negative (the u32 would wrap).  Return statements for the warn paths are
 * elided in this extract; code left byte-identical.
 */
1506 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1508 u32 tot_reserved, tot_free;
/* Our own reservation must be a subset of the rgrp's reserved count. */
1510 if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1512 tot_reserved = rgd->rd_reserved - rs->rs_free;
1514 if (rgd->rd_free_clone < tot_reserved)
1517 tot_free = rgd->rd_free_clone - tot_reserved;
1523 * rg_mblk_search - find a group of multiple free blocks to form a reservation
1524 * @rgd: the resource group descriptor
1525 * @ip: pointer to the inode for which we're reserving blocks
1526 * @ap: the allocation parameters
/*
 * Searches the rgrp for a free extent sized by the inode's size hint (at
 * least RGRP_RSRV_MINBLKS, at most the rgrp's free count) and, if found,
 * records it as the inode's block reservation via rs_insert().
 * NOTE(review): several lines (extlen for directories, goal assignment,
 * rs_insert call) are elided in this extract; code left byte-identical.
 */
1530 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1531 const struct gfs2_alloc_parms *ap)
1533 struct gfs2_rbm rbm = { .rgd = rgd, };
1535 struct gfs2_blkreserv *rs = &ip->i_res;
1537 u32 free_blocks = rgd_free(rgd, rs);
1539 struct inode *inode = &ip->i_inode;
/* Directories use a fixed-size reservation (branch body elided). */
1541 if (S_ISDIR(inode->i_mode))
1544 extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
1545 extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
1547 if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1550 /* Find bitmap block that contains bits for goal block */
1551 if (rgrp_contains_block(rgd, ip->i_goal))
1554 goal = rgd->rd_last_alloc + rgd->rd_data0;
1556 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1559 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1562 rs->rs_free = extlen;
/* If the search wrapped back to the start, reset the allocation cursor. */
1565 if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1566 rgd->rd_last_alloc = 0;
1571 * gfs2_next_unreserved_block - Return next block that is not reserved
1572 * @rgd: The resource group
1573 * @block: The starting block
1574 * @length: The required length
1575 * @ip: Ignore any reservations for this inode
1577 * If the block does not appear in any reservation, then return the
1578 * block number unchanged. If it does appear in the reservation, then
1579 * keep looking through the tree of reservations in order to find the
1580 * first block number which is not reserved.
/*
 * NOTE(review): the initial rb-tree descent branches and the loop's
 * rb_next advance are elided in this extract; code left byte-identical.
 */
1583 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1585 const struct gfs2_inode *ip)
1587 struct gfs2_blkreserv *rs;
1591 spin_lock(&rgd->rd_rsspin);
1592 n = rgd->rd_rstree.rb_node;
1594 rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1595 rc = rs_cmp(block, length, rs);
/* Skip past each overlapping reservation, except our own (@ip's). */
1605 while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1606 block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1610 rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1614 spin_unlock(&rgd->rd_rsspin);
1619 * gfs2_reservation_check_and_update - Check for reservations during block alloc
1620 * @rbm: The current position in the resource group
1621 * @ip: The inode for which we are searching for blocks
1622 * @minext: The minimum extent length
1623 * @maxext: A pointer to the maximum extent structure
1625 * This checks the current position in the rgrp to see whether there is
1626 * a reservation covering this block. If not then this function is a
1627 * no-op. If there is, then the position is moved to the end of the
1628 * contiguous reservation(s) so that we are pointing at the first
1629 * non-reserved block.
1631 * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
/*
 * NOTE(review): a few return statements and the E2BIG path are elided in
 * this extract; code left byte-identical.
 */
1634 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1635 const struct gfs2_inode *ip,
1637 struct gfs2_extent *maxext)
1639 u64 block = gfs2_rbm_to_block(rbm);
1645 * If we have a minimum extent length, then skip over any extent
1646 * which is less than the min extent length in size.
1649 extlen = gfs2_free_extlen(rbm, minext);
1650 if (extlen <= maxext->len)
1655 * Check the extent which has been found against the reservations
1656 * and skip if parts of it are already reserved
1658 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1659 if (nblock == block) {
/* Extent is unreserved and long enough: success (return 0, elided). */
1660 if (!minext || extlen >= minext)
/* Track the largest usable extent seen, as a fallback for the caller. */
1663 if (extlen > maxext->len) {
1664 maxext->len = extlen;
1668 nblock = block + extlen;
1670 ret = gfs2_rbm_from_block(rbm, nblock);
1677 * gfs2_rbm_find - Look for blocks of a particular state
1678 * @rbm: Value/result starting position and final position
1679 * @state: The state which we want to find
1680 * @minext: Pointer to the requested extent length (NULL for a single block)
1681 * This is updated to be the actual reservation size.
1682 * @ip: If set, check for reservations
1683 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1684 * around until we've reached the starting point.
1687 * - If looking for free blocks, we set GBF_FULL on each bitmap which
1688 * has no free blocks in it.
1689 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1690 * has come up short on a free block search.
1692 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
/*
 * Core bitmap search loop.  NOTE(review): this extract elides the main
 * while-loop header, several gotos/returns and the -ENOSPC tail; code left
 * byte-identical.
 */
1695 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1696 const struct gfs2_inode *ip, bool nowrap)
1698 struct buffer_head *bh;
1701 int first_bii = rbm->bii;
1702 u32 first_offset = rbm->offset;
1706 int iters = rbm->rgd->rd_length;
1708 struct gfs2_bitmap *bi;
1709 struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1711 /* If we are not starting at the beginning of a bitmap, then we
1712 * need to add one to the bitmap count to ensure that we search
1713 * the starting bitmap twice.
1715 if (rbm->offset != 0)
/* GBF_FULL bitmaps can be skipped outright on free-block searches,
 * unless we hold a reservation inside them. */
1720 if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
1721 test_bit(GBF_FULL, &bi->bi_flags) &&
1722 (state == GFS2_BLKST_FREE))
1726 buffer = bh->b_data + bi->bi_offset;
1727 WARN_ON(!buffer_uptodate(bh));
/* Allocation-state searches use the clone bitmap when one exists. */
1728 if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1729 buffer = bi->bi_clone + bi->bi_offset;
1730 initial_offset = rbm->offset;
1731 offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
1732 if (offset == BFITNOENT)
1734 rbm->offset = offset;
1738 initial_bii = rbm->bii;
1739 ret = gfs2_reservation_check_and_update(rbm, ip,
1740 minext ? *minext : 0,
/* ret == 1: position moved past a reservation; keep searching. */
1745 n += (rbm->bii - initial_bii);
1748 if (ret == -E2BIG) {
1751 n += (rbm->bii - initial_bii);
1752 goto res_covered_end_of_rgrp;
1756 bitmap_full: /* Mark bitmap as full and fall through */
1757 if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
1758 set_bit(GBF_FULL, &bi->bi_flags);
1760 next_bitmap: /* Find next bitmap in the rgrp */
1763 if (rbm->bii == rbm->rgd->rd_length)
1765 res_covered_end_of_rgrp:
1766 if ((rbm->bii == 0) && nowrap)
/* Search exhausted: only free-block extent searches update fail stats. */
1774 if (minext == NULL || state != GFS2_BLKST_FREE)
1777 /* If the extent was too small, and it's smaller than the smallest
1778 to have failed before, remember for future reference that it's
1779 useless to search this rgrp again for this amount or more. */
1780 if ((first_offset == 0) && (first_bii == 0) &&
1781 (*minext < rbm->rgd->rd_extfail_pt))
1782 rbm->rgd->rd_extfail_pt = *minext;
1784 /* If the maximum extent we found is big enough to fulfill the
1785 minimum requirements, use it anyway. */
1788 *minext = maxext.len;
1796 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1798 * @last_unlinked: block address of the last dinode we unlinked
1799 * @skip: block address we should explicitly not unlink
1801 * Returns: 0 if no error
1802 * The inode, if one has been found, in inode.
/*
 * Scans the rgrp for GFS2_BLKST_UNLINKED dinodes and queues iopen-glock
 * delete work for each, so their space can be reclaimed.  NOTE(review):
 * the loop header, skip test and glock-put lines are elided in this
 * extract; code left byte-identical.
 */
1805 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1808 struct gfs2_sbd *sdp = rgd->rd_sbd;
1809 struct gfs2_glock *gl;
1810 struct gfs2_inode *ip;
1813 struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
/* Hold off log flushes while scanning the unlinked bitmap state. */
1816 down_write(&sdp->sd_log_flush_lock);
1817 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1819 up_write(&sdp->sd_log_flush_lock);
1820 if (error == -ENOSPC)
1822 if (WARN_ON_ONCE(error))
1825 block = gfs2_rbm_to_block(&rbm);
1826 if (gfs2_rbm_from_block(&rbm, block + 1))
1828 if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1832 *last_unlinked = block;
1834 error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1838 /* If the inode is already in cache, we can ignore it here
1839 * because the existing inode disposal code will deal with
1840 * it when all refs have gone away. Accessing gl_object like
1841 * this is not safe in general. Here it is ok because we do
1842 * not dereference the pointer, and we only need an approx
1843 * answer to whether it is NULL or not.
1847 if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1852 /* Limit reclaim to sensible number of tasks */
1853 if (found > NR_CPUS)
/* Scan complete: no need to re-check this rgrp until flagged again. */
1857 rgd->rd_flags &= ~GFS2_RDF_CHECK;
1862 * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1863 * @rgd: The rgrp in question
1864 * @loops: An indication of how picky we can be (0=very, 1=less so)
1866 * This function uses the recently added glock statistics in order to
1867 * figure out whether a parciular resource group is suffering from
1868 * contention from multiple nodes. This is done purely on the basis
1869 * of timings, since this is the only data we have to work with and
1870 * our aim here is to reject a resource group which is highly contended
1871 * but (very important) not to do this too often in order to ensure that
1872 * we do not land up introducing fragmentation by changing resource
1873 * groups when not actually required.
1875 * The calculation is fairly simple, we want to know whether the SRTTB
1876 * (i.e. smoothed round trip time for blocking operations) to acquire
1877 * the lock for this rgrp's glock is significantly greater than the
1878 * time taken for resource groups on average. We introduce a margin in
1879 * the form of the variable @var which is computed as the sum of the two
1880 * respective variences, and multiplied by a factor depending on @loops
1881 * and whether we have a lot of data to base the decision on. This is
1882 * then tested against the square difference of the means in order to
1883 * decide whether the result is statistically significant or not.
1885 * Returns: A boolean verdict on the congestion status
/*
 * NOTE(review): preempt_disable/enable around the per-cpu access and the
 * var scaling lines are elided in this extract; code left byte-identical.
 */
1888 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1890 const struct gfs2_glock *gl = rgd->rd_gl;
1891 const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1892 struct gfs2_lkstats *st;
1893 u64 r_dcount, l_dcount;
1894 u64 l_srttb, a_srttb = 0;
1898 int cpu, nonzero = 0;
/* Average the rgrp SRTTB across CPUs that have recorded samples. */
1901 for_each_present_cpu(cpu) {
1902 st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1903 if (st->stats[GFS2_LKS_SRTTB]) {
1904 a_srttb += st->stats[GFS2_LKS_SRTTB];
1908 st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1910 do_div(a_srttb, nonzero);
1911 r_dcount = st->stats[GFS2_LKS_DCOUNT];
1912 var = st->stats[GFS2_LKS_SRTTVARB] +
1913 gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1916 l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1917 l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
/* Too little data to call it either way: not congested. */
1919 if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1922 srttb_diff = a_srttb - l_srttb;
1923 sqr_diff = srttb_diff * srttb_diff;
1926 if (l_dcount < 8 || r_dcount < 8)
/* Congested iff this glock is slower than average by a significant margin. */
1931 return ((srttb_diff < 0) && (sqr_diff > var));
1935 * gfs2_rgrp_used_recently
1936 * @rs: The block reservation with the rgrp to test
1937 * @msecs: The time limit in milliseconds
1939 * Returns: True if the rgrp glock has been used within the time limit
1941 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
/* Compare wall-clock time since the glock's last DLM exchange (gl_dstamp). */
1946 tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1947 rs->rs_rbm.rgd->rd_gl->gl_dstamp));
/* msecs -> ns; true when the elapsed time exceeds the limit.
 * NOTE(review): kernel-doc says "used within the limit" but the visible
 * expression is the opposite sense — elided lines prevent confirming. */
1949 return tdiff > (msecs * 1000 * 1000);
/*
 * gfs2_orlov_skip - pick a random number of rgrps to skip, used to spread
 * new directories across the filesystem (Orlov-style allocation).
 */
1952 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1954 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1957 get_random_bytes(&skip, sizeof(skip));
/* Uniform-ish choice in [0, sd_rgrps). */
1958 return skip % sdp->sd_rgrps;
/*
 * gfs2_select_rgrp - advance *pos to the next rgrp, wrapping to the first
 * when the end of the list is reached.  Returns true while the walk has
 * not come back around to @begin.  NOTE(review): the wrap test and the
 * *pos store are elided in this extract; code left byte-identical.
 */
1961 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1963 struct gfs2_rgrpd *rgd = *pos;
1964 struct gfs2_sbd *sdp = rgd->rd_sbd;
1966 rgd = gfs2_rgrpd_get_next(rgd);
1968 rgd = gfs2_rgrpd_get_first(sdp);
1970 if (rgd != begin) /* If we didn't wrap */
1976 * fast_to_acquire - determine if a resource group will be fast to acquire
1978 * If this is one of our preferred rgrps, it should be quicker to acquire,
1979 * because we tried to set ourselves up as dlm lock master.
1981 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1983 struct gfs2_glock *gl = rgd->rd_gl;
/* Fast if the glock is held locally with no holders and no demote pending. */
1985 if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1986 !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1987 !test_bit(GLF_DEMOTE, &gl->gl_flags))
/* Also fast if this node marked the rgrp as preferred (DLM lock master). */
1989 if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1995 * gfs2_inplace_reserve - Reserve space in the filesystem
1996 * @ip: the inode to reserve space for
1997 * @ap: the allocation parameters
1999 * We try our best to find an rgrp that has at least ap->target blocks
2000 * available. After a couple of passes (loops == 2), the prospects of finding
2001 * such an rgrp diminish. At this stage, we return the first rgrp that has
2002 * atleast ap->min_target blocks available. Either way, we set ap->allowed to
2003 * the number of blocks available in the chosen rgrp.
2005 * Returns: 0 on success,
2006 * -ENOMEM if a suitable rgrp can't be found
/*
 * Main rgrp-selection loop for allocation.  NOTE(review): the loop header,
 * several gotos/labels and the -ENOSPC tail are elided in this extract;
 * code left byte-identical.
 */
2010 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2012 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2013 struct gfs2_rgrpd *begin = NULL;
2014 struct gfs2_blkreserv *rs = &ip->i_res;
2015 int error = 0, rg_locked, flags = 0;
2016 u64 last_unlinked = NO_BLOCK;
2018 u32 free_blocks, skip = 0;
2020 if (sdp->sd_args.ar_rgrplvb)
2022 if (gfs2_assert_warn(sdp, ap->target))
/* Pick the starting rgrp: active reservation > goal-containing rgrp >
 * rgrp looked up from the (possibly updated) goal block. */
2024 if (gfs2_rs_active(rs)) {
2025 begin = rs->rs_rbm.rgd;
2026 } else if (rs->rs_rbm.rgd &&
2027 rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2028 begin = rs->rs_rbm.rgd;
2030 check_and_update_goal(ip);
2031 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2033 if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2034 skip = gfs2_orlov_skip(ip);
2035 if (rs->rs_rbm.rgd == NULL)
2041 if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
/* Without an active reservation, avoid rgrps that look slow/contended. */
2045 if (!gfs2_rs_active(rs)) {
2047 !fast_to_acquire(rs->rs_rbm.rgd))
2050 gfs2_rgrp_used_recently(rs, 1000) &&
2051 gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2054 error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2055 LM_ST_EXCLUSIVE, flags,
2057 if (unlikely(error))
2059 if (!gfs2_rs_active(rs) && (loops < 2) &&
2060 gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2062 if (sdp->sd_args.ar_rgrplvb) {
2063 error = update_rgrp_lvb(rs->rs_rbm.rgd);
2064 if (unlikely(error)) {
2065 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2071 /* Skip unuseable resource groups */
2072 if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2074 (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2077 if (sdp->sd_args.ar_rgrplvb)
2078 gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2080 /* Get a reservation if we don't already have one */
2081 if (!gfs2_rs_active(rs))
2082 rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2084 /* Skip rgrps when we can't get a reservation on first pass */
2085 if (!gfs2_rs_active(rs) && (loops < 1))
2088 /* If rgrp has enough free space, use it */
2089 free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2090 if (free_blocks >= ap->target ||
2091 (loops == 2 && ap->min_target &&
2092 free_blocks >= ap->min_target)) {
2093 ap->allowed = free_blocks;
2097 /* Check for unlinked inodes which can be reclaimed */
2098 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2099 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2102 /* Drop reservation, if we couldn't use reserved rgrp */
2103 if (gfs2_rs_active(rs))
2104 gfs2_rs_deltree(rs);
2106 /* Unlock rgrp if required */
2108 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2110 /* Find the next rgrp, and continue looking */
2111 if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2116 /* If we've scanned all the rgrps, but found no free blocks
2117 * then this checks for some less likely conditions before
2121 /* Check that fs hasn't grown if writing to rindex */
2122 if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2123 error = gfs2_ri_update(ip);
2127 /* Flushing the log may release space */
2129 gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2130 GFS2_LFC_INPLACE_RESERVE);
2137 * gfs2_inplace_release - release an inplace reservation
2138 * @ip: the inode the reservation was taken out on
2140 * Release a reservation made by gfs2_inplace_reserve().
2143 void gfs2_inplace_release(struct gfs2_inode *ip)
2145 struct gfs2_blkreserv *rs = &ip->i_res;
/* Drop the rgrp glock only if gfs2_inplace_reserve() actually took it. */
2147 if (gfs2_holder_initialized(&rs->rs_rgd_gh))
2148 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2152 * gfs2_alloc_extent - allocate an extent from a given bitmap
2153 * @rbm: the resource group information
2154 * @dinode: TRUE if the first block we allocate is for a dinode
2155 * @n: The extent length (value/result)
2157 * Add the bitmap buffer to the transaction.
2158 * Set the found bits to @new_state to change block's allocation state.
/*
 * Marks the first block (dinode or used), then extends the allocation over
 * following free blocks up to the requested length, updating *n to the
 * count actually allocated.  NOTE(review): the loop header and the final
 * *n store are elided in this extract; code left byte-identical.
 */
2160 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2163 struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2164 const unsigned int elen = *n;
2169 block = gfs2_rbm_to_block(rbm);
2170 gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
/* First block: dinode blocks get DINODE state, data blocks get USED. */
2171 gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2174 ret = gfs2_rbm_from_block(&pos, block);
/* Stop extending as soon as a non-free block is hit. */
2175 if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2177 gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2178 gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2185 * rgblk_free - Change alloc state of given block(s)
2186 * @sdp: the filesystem
2187 * @bstart: the start of a run of blocks to free
2188 * @blen: the length of the block run (all must lie within ONE RG!)
2189 * @new_state: GFS2_BLKST_XXX the after-allocation block state
2191 * Returns: Resource group containing the block(s)
/*
 * NOTE(review): the per-block loop header, bi/bi_prev updates and the
 * return statement are elided in this extract; code left byte-identical.
 */
2194 static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2195 u32 blen, unsigned char new_state)
2197 struct gfs2_rbm rbm;
2198 struct gfs2_bitmap *bi, *bi_prev = NULL;
2200 rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
/* No rgrp for this block: filesystem inconsistency. */
2202 if (gfs2_consist(sdp))
2203 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
2207 gfs2_rbm_from_block(&rbm, bstart);
/* Lazily create a clone of each bitmap the first time it is modified,
 * preserving the pre-transaction state for in-flight allocations. */
2210 if (bi != bi_prev) {
2211 if (!bi->bi_clone) {
2212 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2213 GFP_NOFS | __GFP_NOFAIL);
2214 memcpy(bi->bi_clone + bi->bi_offset,
2215 bi->bi_bh->b_data + bi->bi_offset,
2218 gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
/* do_clone = false: only the real bitmap changes, not the clone. */
2221 gfs2_setbit(&rbm, false, new_state);
2222 gfs2_rbm_incr(&rbm);
2229 * gfs2_rgrp_dump - print out an rgrp
2230 * @seq: The iterator
2231 * @gl: The glock in question
/* Debug helper: prints the rgrp summary plus every reservation in its tree. */
2235 void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2237 struct gfs2_rgrpd *rgd = gl->gl_object;
2238 struct gfs2_blkreserv *trs;
2239 const struct rb_node *n;
2243 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2244 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2245 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2246 rgd->rd_reserved, rgd->rd_extfail_pt);
/* rd_rsspin serializes against concurrent reservation insert/delete. */
2247 spin_lock(&rgd->rd_rsspin);
2248 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2249 trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2252 spin_unlock(&rgd->rd_rsspin);
/*
 * gfs2_rgrp_error - handle an on-disk inconsistency detected in an rgrp:
 * dump it, warn the admin to run fsck, and mark the rgrp read-only
 * (GFS2_RDF_ERROR) until the filesystem is unmounted.
 */
2255 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2257 struct gfs2_sbd *sdp = rgd->rd_sbd;
2258 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2259 (unsigned long long)rgd->rd_addr);
2260 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2261 gfs2_rgrp_dump(NULL, rgd->rd_gl);
2262 rgd->rd_flags |= GFS2_RDF_ERROR;
2266 * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2267 * @ip: The inode we have just allocated blocks for
2268 * @rbm: The start of the allocated blocks
2269 * @len: The extent length
2271 * Adjusts a reservation after an allocation has taken place. If the
2272 * reservation does not match the allocation, or if it is now empty
2273 * then it is removed.
/*
 * NOTE(review): the "goto out" / rs_deltree lines for the removal path are
 * elided in this extract; code left byte-identical.
 */
2276 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2277 const struct gfs2_rbm *rbm, unsigned len)
2279 struct gfs2_blkreserv *rs = &ip->i_res;
2280 struct gfs2_rgrpd *rgd = rbm->rgd;
2285 spin_lock(&rgd->rd_rsspin);
2286 if (gfs2_rs_active(rs)) {
/* Allocation started exactly at the reservation: shrink it in place. */
2287 if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2288 block = gfs2_rbm_to_block(rbm);
2289 ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2290 rlen = min(rs->rs_free, len);
2291 rs->rs_free -= rlen;
2292 rgd->rd_reserved -= rlen;
2293 trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2294 if (rs->rs_free && !ret)
2296 /* We used up our block reservation, so we should
2297 reserve more blocks next time. */
2298 atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
2303 spin_unlock(&rgd->rd_rsspin);
2307 * gfs2_set_alloc_start - Set starting point for block allocation
2308 * @rbm: The rbm which will be set to the required location
2309 * @ip: The gfs2 inode
2310 * @dinode: Flag to say if allocation includes a new inode
2312 * This sets the starting point from the reservation if one is active
2313 * otherwise it falls back to guessing a start point based on the
2314 * inode's goal block or the last allocation point in the rgrp.
/* NOTE(review): the goal-from-i_goal branch body and the return after the
 * reservation copy are elided in this extract; code left byte-identical. */
2317 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2318 const struct gfs2_inode *ip, bool dinode)
/* Active reservation: start exactly where the reservation points. */
2322 if (gfs2_rs_active(&ip->i_res)) {
2323 *rbm = ip->i_res.rs_rbm;
2327 if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
/* Otherwise fall back to the rgrp's last allocation cursor. */
2330 goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2332 gfs2_rbm_from_block(rbm, goal);
2336 * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2337 * @ip: the inode to allocate the block for
2338 * @bn: Used to return the starting block number
2339 * @nblocks: requested number of blocks/extent length (value/result)
2340 * @dinode: 1 if we're allocating a dinode block, else 0
2341 * @generation: the generation number of the inode
2343 * Returns: 0 or error
/*
 * NOTE(review): several lines (ndata computation, goal-update condition,
 * rgrp_error goto, final return) are elided in this extract; code left
 * byte-identical.
 */
2346 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2347 bool dinode, u64 *generation)
2349 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2350 struct buffer_head *dibh;
2351 struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2353 u64 block; /* block, within the file system scope */
2356 gfs2_set_alloc_start(&rbm, ip, dinode);
2357 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
/* Retry ignoring reservations before giving up. */
2359 if (error == -ENOSPC) {
2360 gfs2_set_alloc_start(&rbm, ip, dinode);
2361 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2364 /* Since all blocks are reserved in advance, this shouldn't happen */
2366 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2367 (unsigned long long)ip->i_no_addr, error, *nblocks,
2368 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2369 rbm.rgd->rd_extfail_pt);
2373 gfs2_alloc_extent(&rbm, dinode, nblocks);
2374 block = gfs2_rbm_to_block(&rbm);
2375 rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2376 if (gfs2_rs_active(&ip->i_res))
2377 gfs2_adjust_reservation(ip, &rbm, *nblocks);
/* Record the new goal in the on-disk dinode. */
2383 ip->i_goal = block + ndata - 1;
2384 error = gfs2_meta_inode_buffer(ip, &dibh);
2386 struct gfs2_dinode *di =
2387 (struct gfs2_dinode *)dibh->b_data;
2388 gfs2_trans_add_meta(ip->i_gl, dibh);
2389 di->di_goal_meta = di->di_goal_data =
2390 cpu_to_be64(ip->i_goal);
/* Accounting sanity check; error path (gfs2_rgrp_error) is elided here. */
2394 if (rbm.rgd->rd_free < *nblocks) {
2395 pr_warn("nblocks=%u\n", *nblocks);
2399 rbm.rgd->rd_free -= *nblocks;
2401 rbm.rgd->rd_dinodes++;
/* Generation 0 is reserved; bump past it. */
2402 *generation = rbm.rgd->rd_igeneration++;
2403 if (*generation == 0)
2404 *generation = rbm.rgd->rd_igeneration++;
2407 gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2408 gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2410 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2412 gfs2_trans_add_unrevoke(sdp, block, *nblocks);
2414 gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2416 rbm.rgd->rd_free_clone -= *nblocks;
2417 trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2418 dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2423 gfs2_rgrp_error(rbm.rgd);
2428 * __gfs2_free_blocks - free a contiguous run of block(s)
2429 * @ip: the inode these blocks are being freed from
2430 * @bstart: first block of a run of contiguous blocks
2431 * @blen: the length of the block run
2432 * @meta: 1 if the blocks represent metadata
2436 void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
2438 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2439 struct gfs2_rgrpd *rgd;
2441 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
2444 trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2445 rgd->rd_free += blen;
/* Freed blocks are candidates for a future FITRIM pass again. */
2446 rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2447 gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2448 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2450 /* Directories keep their data in the metadata address space */
2451 if (meta || ip->i_depth)
2452 gfs2_meta_wipe(ip, bstart, blen);
2456 * gfs2_free_meta - free a contiguous run of data block(s)
2457 * @ip: the inode these blocks are being freed from
2458 * @bstart: first block of a run of contiguous blocks
2459 * @blen: the length of the block run
/* Frees metadata blocks and updates statfs (+blen free) and quota (-blen). */
2463 void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
2465 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2467 __gfs2_free_blocks(ip, bstart, blen, 1);
2468 gfs2_statfs_change(sdp, 0, +blen, 0);
2469 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
/*
 * gfs2_unlink_di - mark an inode's dinode block as GFS2_BLKST_UNLINKED so
 * that its space can be reclaimed later (see try_rgrp_unlink).  Also
 * updates the rgrp header and its LVB unlinked counter.
 */
2472 void gfs2_unlink_di(struct inode *inode)
2474 struct gfs2_inode *ip = GFS2_I(inode);
2475 struct gfs2_sbd *sdp = GFS2_SB(inode);
2476 struct gfs2_rgrpd *rgd;
2477 u64 blkno = ip->i_no_addr;
2479 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
2482 trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2483 gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2484 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2485 be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
/*
 * gfs2_free_di - free an inode's dinode block: flips its bitmap state to
 * FREE, updates rgrp dinode/free counts, statfs and quota, and wipes the
 * block's metadata.  NOTE(review): the rd_dinodes--/rd_free++ lines are
 * elided in this extract; code left byte-identical.
 */
2488 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2490 struct gfs2_sbd *sdp = rgd->rd_sbd;
2491 struct gfs2_rgrpd *tmp_rgd;
2493 tmp_rgd = rgblk_free(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE);
/* The dinode must live in the rgrp the caller handed us. */
2496 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
2498 if (!rgd->rd_dinodes)
2499 gfs2_consist_rgrpd(rgd);
2503 gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2504 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2505 be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
/* One more free block, one fewer dinode. */
2507 gfs2_statfs_change(sdp, 0, +1, -1);
2508 trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2509 gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2510 gfs2_meta_wipe(ip, ip->i_no_addr, 1);
2514 * gfs2_check_blk_type - Check the type of a block
2515 * @sdp: The superblock
2516 * @no_addr: The block number to check
2517 * @type: The block type we are looking for
2519 * Returns: 0 if the block type matches the expected type
2520 * -ESTALE if it doesn't match
2521 * or -ve errno if something went wrong while checking
/* NOTE(review): the success assignment and -ESTALE path are elided in this
 * extract; code left byte-identical. */
2524 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2526 struct gfs2_rgrpd *rgd;
2527 struct gfs2_holder rgd_gh;
2528 struct gfs2_rbm rbm;
2529 int error = -EINVAL;
2531 rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
/* Shared lock is enough: we only read the bitmap. */
2535 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2540 error = gfs2_rbm_from_block(&rbm, no_addr);
2541 WARN_ON_ONCE(error != 0);
2543 if (gfs2_testbit(&rbm, false) != type)
2546 gfs2_glock_dq_uninit(&rgd_gh);
2552 * gfs2_rlist_add - add a RG to a list of RGs
2554 * @rlist: the list of resource groups
2557 * Figure out what RG a block belongs to and add that RG to the list
2559 * FIXME: Don't use NOFAIL
/*
 * NOTE(review): several control-flow lines (returns after the MRU hit,
 * loop exits) are elided in this extract; code left byte-identical.
 */
2563 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2566 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2567 struct gfs2_rgrpd *rgd;
2568 struct gfs2_rgrpd **tmp;
2569 unsigned int new_space;
/* Must not be called after the holders have been allocated. */
2572 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2576 * The resource group last accessed is kept in the last position.
/* MRU check: most calls hit the same rgrp as the previous block. */
2579 if (rlist->rl_rgrps) {
2580 rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2581 if (rgrp_contains_block(rgd, block))
2583 rgd = gfs2_blk2rgrpd(sdp, block, 1);
2585 rgd = ip->i_res.rs_rbm.rgd;
2586 if (!rgd || !rgrp_contains_block(rgd, block))
2587 rgd = gfs2_blk2rgrpd(sdp, block, 1);
2591 fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2592 (unsigned long long)block);
/* Already listed: move it to the MRU (last) slot and return. */
2596 for (x = 0; x < rlist->rl_rgrps; x++) {
2597 if (rlist->rl_rgd[x] == rgd) {
2598 swap(rlist->rl_rgd[x],
2599 rlist->rl_rgd[rlist->rl_rgrps - 1]);
/* Grow the array in chunks of 10 when full. */
2604 if (rlist->rl_rgrps == rlist->rl_space) {
2605 new_space = rlist->rl_space + 10;
2607 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2608 GFP_NOFS | __GFP_NOFAIL);
2610 if (rlist->rl_rgd) {
2611 memcpy(tmp, rlist->rl_rgd,
2612 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2613 kfree(rlist->rl_rgd);
2616 rlist->rl_space = new_space;
2617 rlist->rl_rgd = tmp;
2620 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2624 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2625 * and initialize an array of glock holders for them
2626 * @rlist: the list of resource groups
2627 * @state: the lock state to acquire the RG lock in
2629 * FIXME: Don't use NOFAIL
2633 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
/* One holder per rgrp; initialized here, enqueued later by the caller. */
2637 rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2638 sizeof(struct gfs2_holder),
2639 GFP_NOFS | __GFP_NOFAIL);
2640 for (x = 0; x < rlist->rl_rgrps; x++)
2641 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2647 * gfs2_rlist_free - free a resource group list
2648 * @rlist: the list of resource groups
2652 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2656 kfree(rlist->rl_rgd);
2658 if (rlist->rl_ghs) {
2659 for (x = 0; x < rlist->rl_rgrps; x++)
2660 gfs2_holder_uninit(&rlist->rl_ghs[x]);
2661 kfree(rlist->rl_ghs);
2662 rlist->rl_ghs = NULL;