1 // SPDX-License-Identifier: GPL-2.0
3 #include <linux/slab.h>
6 #include "btrfs_inode.h"
/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as a 64K page ensures that
 *   any nodesize fits inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and kernel have enforced that for a while, thus only ancient
 *   filesystems could have such a problem.  For such a case, do a graceful
 *   rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   Meaning reading one tree block will only trigger the read for the
 *   needed range, other unrelated ranges in the same page will not be touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *   Page offset
 *   0         16K         32K         48K        64K
 *   |/////////|           |///////////|
 *        \- Tree block A        \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we will have greatly reduced concurrency or even
 *   deadlocks (hold one tree lock while trying to lock another tree lock in
 *   the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */
66 int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
67 struct page *page, enum btrfs_subpage_type type)
69 struct btrfs_subpage *subpage = NULL;
73 * We have cases like a dummy extent buffer page, which is not mappped
74 * and doesn't need to be locked.
77 ASSERT(PageLocked(page));
78 /* Either not subpage, or the page already has private attached */
79 if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
82 ret = btrfs_alloc_subpage(fs_info, &subpage, type);
85 attach_page_private(page, subpage);
89 void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
92 struct btrfs_subpage *subpage;
94 /* Either not subpage, or already detached */
95 if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
98 subpage = (struct btrfs_subpage *)detach_page_private(page);
100 btrfs_free_subpage(subpage);
103 int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
104 struct btrfs_subpage **ret,
105 enum btrfs_subpage_type type)
107 if (fs_info->sectorsize == PAGE_SIZE)
110 *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
113 spin_lock_init(&(*ret)->lock);
114 if (type == BTRFS_SUBPAGE_METADATA) {
115 atomic_set(&(*ret)->eb_refs, 0);
117 atomic_set(&(*ret)->readers, 0);
118 atomic_set(&(*ret)->writers, 0);
/* Free a btrfs_subpage structure obtained from btrfs_alloc_subpage(). */
void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}
129 * Increase the eb_refs of current subpage.
131 * This is important for eb allocation, to prevent race with last eb freeing
133 * With the eb_refs increased before the eb inserted into radix tree,
134 * detach_extent_buffer_page() won't detach the page private while we're still
135 * allocating the extent buffer.
137 void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
140 struct btrfs_subpage *subpage;
142 if (fs_info->sectorsize == PAGE_SIZE)
145 ASSERT(PagePrivate(page) && page->mapping);
146 lockdep_assert_held(&page->mapping->private_lock);
148 subpage = (struct btrfs_subpage *)page->private;
149 atomic_inc(&subpage->eb_refs);
152 void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
155 struct btrfs_subpage *subpage;
157 if (fs_info->sectorsize == PAGE_SIZE)
160 ASSERT(PagePrivate(page) && page->mapping);
161 lockdep_assert_held(&page->mapping->private_lock);
163 subpage = (struct btrfs_subpage *)page->private;
164 ASSERT(atomic_read(&subpage->eb_refs));
165 atomic_dec(&subpage->eb_refs);
168 static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
169 struct page *page, u64 start, u32 len)
172 ASSERT(PagePrivate(page) && page->private);
173 ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
174 IS_ALIGNED(len, fs_info->sectorsize));
176 * The range check only works for mapped page, we can still have
177 * unmapped page like dummy extent buffer pages.
180 ASSERT(page_offset(page) <= start &&
181 start + len <= page_offset(page) + PAGE_SIZE);
184 void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
185 struct page *page, u64 start, u32 len)
187 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
188 const int nbits = len >> fs_info->sectorsize_bits;
190 btrfs_subpage_assert(fs_info, page, start, len);
192 atomic_add(nbits, &subpage->readers);
195 void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
196 struct page *page, u64 start, u32 len)
198 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
199 const int nbits = len >> fs_info->sectorsize_bits;
203 btrfs_subpage_assert(fs_info, page, start, len);
204 is_data = is_data_inode(page->mapping->host);
205 ASSERT(atomic_read(&subpage->readers) >= nbits);
206 last = atomic_sub_and_test(nbits, &subpage->readers);
209 * For data we need to unlock the page if the last read has finished.
211 * And please don't replace @last with atomic_sub_and_test() call
212 * inside if () condition.
213 * As we want the atomic_sub_and_test() to be always executed.
219 static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
221 u64 orig_start = *start;
224 *start = max_t(u64, page_offset(page), orig_start);
225 *len = min_t(u64, page_offset(page) + PAGE_SIZE,
226 orig_start + orig_len) - *start;
229 void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
230 struct page *page, u64 start, u32 len)
232 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
233 const int nbits = (len >> fs_info->sectorsize_bits);
236 btrfs_subpage_assert(fs_info, page, start, len);
238 ASSERT(atomic_read(&subpage->readers) == 0);
239 ret = atomic_add_return(nbits, &subpage->writers);
240 ASSERT(ret == nbits);
243 bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
244 struct page *page, u64 start, u32 len)
246 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
247 const int nbits = (len >> fs_info->sectorsize_bits);
249 btrfs_subpage_assert(fs_info, page, start, len);
251 ASSERT(atomic_read(&subpage->writers) >= nbits);
252 return atomic_sub_and_test(nbits, &subpage->writers);
256 * Lock a page for delalloc page writeback.
258 * Return -EAGAIN if the page is not properly initialized.
259 * Return 0 with the page locked, and writer counter updated.
261 * Even with 0 returned, the page still need extra check to make sure
262 * it's really the correct page, as the caller is using
263 * find_get_pages_contig(), which can race with page invalidating.
265 int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
266 struct page *page, u64 start, u32 len)
268 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
273 if (!PagePrivate(page) || !page->private) {
277 btrfs_subpage_clamp_range(page, &start, &len);
278 btrfs_subpage_start_writer(fs_info, page, start, len);
282 void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
283 struct page *page, u64 start, u32 len)
285 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
286 return unlock_page(page);
287 btrfs_subpage_clamp_range(page, &start, &len);
288 if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
293 * Convert the [start, start + len) range into a u16 bitmap
295 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
297 static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
298 struct page *page, u64 start, u32 len)
300 const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
301 const int nbits = len >> fs_info->sectorsize_bits;
303 btrfs_subpage_assert(fs_info, page, start, len);
306 * Here nbits can be 16, thus can go beyond u16 range. We make the
307 * first left shift to be calculate in unsigned long (at least u32),
308 * then truncate the result to u16.
310 return (u16)(((1UL << nbits) - 1) << bit_start);
313 void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
314 struct page *page, u64 start, u32 len)
316 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
317 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
320 spin_lock_irqsave(&subpage->lock, flags);
321 subpage->uptodate_bitmap |= tmp;
322 if (subpage->uptodate_bitmap == U16_MAX)
323 SetPageUptodate(page);
324 spin_unlock_irqrestore(&subpage->lock, flags);
327 void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
328 struct page *page, u64 start, u32 len)
330 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
331 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
334 spin_lock_irqsave(&subpage->lock, flags);
335 subpage->uptodate_bitmap &= ~tmp;
336 ClearPageUptodate(page);
337 spin_unlock_irqrestore(&subpage->lock, flags);
340 void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
341 struct page *page, u64 start, u32 len)
343 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
344 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
347 spin_lock_irqsave(&subpage->lock, flags);
348 subpage->error_bitmap |= tmp;
350 spin_unlock_irqrestore(&subpage->lock, flags);
353 void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
354 struct page *page, u64 start, u32 len)
356 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
357 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
360 spin_lock_irqsave(&subpage->lock, flags);
361 subpage->error_bitmap &= ~tmp;
362 if (subpage->error_bitmap == 0)
363 ClearPageError(page);
364 spin_unlock_irqrestore(&subpage->lock, flags);
367 void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
368 struct page *page, u64 start, u32 len)
370 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
371 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
374 spin_lock_irqsave(&subpage->lock, flags);
375 subpage->dirty_bitmap |= tmp;
376 spin_unlock_irqrestore(&subpage->lock, flags);
377 set_page_dirty(page);
381 * Extra clear_and_test function for subpage dirty bitmap.
383 * Return true if we're the last bits in the dirty_bitmap and clear the
385 * Return false otherwise.
387 * NOTE: Callers should manually clear page dirty for true case, as we have
388 * extra handling for tree blocks.
390 bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
391 struct page *page, u64 start, u32 len)
393 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
394 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
398 spin_lock_irqsave(&subpage->lock, flags);
399 subpage->dirty_bitmap &= ~tmp;
400 if (subpage->dirty_bitmap == 0)
402 spin_unlock_irqrestore(&subpage->lock, flags);
406 void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
407 struct page *page, u64 start, u32 len)
411 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
413 clear_page_dirty_for_io(page);
416 void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
417 struct page *page, u64 start, u32 len)
419 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
420 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
423 spin_lock_irqsave(&subpage->lock, flags);
424 subpage->writeback_bitmap |= tmp;
425 set_page_writeback(page);
426 spin_unlock_irqrestore(&subpage->lock, flags);
429 void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
430 struct page *page, u64 start, u32 len)
432 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
433 u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
436 spin_lock_irqsave(&subpage->lock, flags);
437 subpage->writeback_bitmap &= ~tmp;
438 if (subpage->writeback_bitmap == 0) {
439 ASSERT(PageWriteback(page));
440 end_page_writeback(page);
442 spin_unlock_irqrestore(&subpage->lock, flags);
445 void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
446 struct page *page, u64 start, u32 len)
448 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
449 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
452 spin_lock_irqsave(&subpage->lock, flags);
453 subpage->ordered_bitmap |= tmp;
454 SetPageOrdered(page);
455 spin_unlock_irqrestore(&subpage->lock, flags);
458 void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
459 struct page *page, u64 start, u32 len)
461 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
462 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
465 spin_lock_irqsave(&subpage->lock, flags);
466 subpage->ordered_bitmap &= ~tmp;
467 if (subpage->ordered_bitmap == 0)
468 ClearPageOrdered(page);
469 spin_unlock_irqrestore(&subpage->lock, flags);
472 * Unlike set/clear which is dependent on each page status, for test all bits
473 * are tested in the same way.
475 #define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
476 bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
477 struct page *page, u64 start, u32 len) \
479 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
480 const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
481 unsigned long flags; \
484 spin_lock_irqsave(&subpage->lock, flags); \
485 ret = ((subpage->name##_bitmap & tmp) == tmp); \
486 spin_unlock_irqrestore(&subpage->lock, flags); \
489 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
490 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
491 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
492 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
493 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
496 * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
497 * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
498 * back to regular sectorsize branch.
500 #define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
502 void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
503 struct page *page, u64 start, u32 len) \
505 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
506 set_page_func(page); \
509 btrfs_subpage_set_##name(fs_info, page, start, len); \
511 void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
512 struct page *page, u64 start, u32 len) \
514 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
515 clear_page_func(page); \
518 btrfs_subpage_clear_##name(fs_info, page, start, len); \
520 bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
521 struct page *page, u64 start, u32 len) \
523 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
524 return test_page_func(page); \
525 return btrfs_subpage_test_##name(fs_info, page, start, len); \
527 void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
528 struct page *page, u64 start, u32 len) \
530 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
531 set_page_func(page); \
534 btrfs_subpage_clamp_range(page, &start, &len); \
535 btrfs_subpage_set_##name(fs_info, page, start, len); \
537 void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
538 struct page *page, u64 start, u32 len) \
540 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
541 clear_page_func(page); \
544 btrfs_subpage_clamp_range(page, &start, &len); \
545 btrfs_subpage_clear_##name(fs_info, page, start, len); \
547 bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
548 struct page *page, u64 start, u32 len) \
550 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
551 return test_page_func(page); \
552 btrfs_subpage_clamp_range(page, &start, &len); \
553 return btrfs_subpage_test_##name(fs_info, page, start, len); \
555 IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
557 IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
558 IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
560 IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
562 IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
566 * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
569 void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
572 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
574 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
577 ASSERT(!PageDirty(page));
578 if (fs_info->sectorsize == PAGE_SIZE)
581 ASSERT(PagePrivate(page) && page->private);
582 ASSERT(subpage->dirty_bitmap == 0);