ext4: introduce lseek SEEK_DATA/SEEK_HOLE support
author Zheng Liu <wenqing.lz@taobao.com>
Fri, 9 Nov 2012 02:57:40 +0000 (21:57 -0500)
committer Theodore Ts'o <tytso@mit.edu>
Fri, 9 Nov 2012 02:57:40 +0000 (21:57 -0500)
This patch makes ext4 really support the SEEK_DATA/SEEK_HOLE flags.  Block-mapped
and extent-mapped files are both fully supported by the same code because
ext4_map_blocks() hides the differences between them.

After applying this patch, xfstest #285 will fail when the file is
block-mapped, because block-mapped files do not support fallocate(2).

I had tried to use ext4_ext_walk_space() to retrieve the offset for an
extent-mapped file.  But in the end I decided to keep using ext4_map_blocks() to
support SEEK_DATA/SEEK_HOLE because ext4_map_blocks() can hide the difference
between block-mapped files and extent-mapped files.  Moreover, as a next step,
the extent status tree will track the status of all extents, and we will be
able to get all mappings from this tree.  So I think that using
ext4_map_blocks() is the better choice.

CC: Hugh Dickins <hughd@google.com>
Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/file.c

index bf3966b..2f5759e 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/mount.h>
 #include <linux/path.h>
 #include <linux/quotaops.h>
+#include <linux/pagevec.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -286,6 +287,324 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 }
 
 /*
+ * Here we use ext4_map_blocks() to get the block mapping for an extent-based
+ * file rather than ext4_ext_walk_space() because this lets us implement
+ * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
+ * function.  Once the extent status tree has been fully implemented, it will
+ * track the status of all extents of a file and we can use it directly to
+ * retrieve the offset for SEEK_DATA/SEEK_HOLE.
+ */
+
+/*
+ * ext4_find_unwritten_pgoff() - scan the page cache over an unwritten extent.
+ *
+ * When we retrieve the offset for SEEK_DATA/SEEK_HOLE and the range contains
+ * an unwritten extent, the page cache decides whether an offset is data or a
+ * hole: a buffer that is uptodate or unwritten counts as data; any other
+ * buffer, a page without buffers, or a missing page counts as a hole.
+ *
+ * @inode:  inode whose page cache (inode->i_mapping) is searched
+ * @origin: SEEK_DATA or SEEK_HOLE
+ * @map:    mapping of the extent; the search covers bytes up to (but not
+ *          including) block m_lblk + m_len
+ * @offset: in: byte offset to start searching from
+ *          out: offset of the data/hole found (written only on success)
+ *
+ * Returns 1 if the requested kind of region was found, 0 otherwise.
+ */
+static int ext4_find_unwritten_pgoff(struct inode *inode,
+                                    int origin,
+                                    struct ext4_map_blocks *map,
+                                    loff_t *offset)
+{
+       struct pagevec pvec;
+       unsigned int blkbits;
+       pgoff_t index;
+       pgoff_t end;
+       loff_t endoff;
+       loff_t startoff;
+       loff_t lastoff;
+       int found = 0;
+
+       blkbits = inode->i_sb->s_blocksize_bits;
+       startoff = *offset;
+       lastoff = startoff;
+       /*
+        * Widen to loff_t before adding and shifting.  The block fields are
+        * presumably 32-bit (ext4_lblk_t), so doing the arithmetic in their
+        * own type would wrap for large files.
+        */
+       endoff = ((loff_t)map->m_lblk + map->m_len) << blkbits;
+
+       index = startoff >> PAGE_CACHE_SHIFT;
+       /*
+        * endoff is exclusive, so the last page of interest is the one that
+        * holds byte endoff - 1.
+        * NOTE(review): assumes callers pass a non-empty range
+        * (endoff > startoff); both callers in this file only call us for a
+        * mapped unwritten extent - confirm.
+        */
+       end = (endoff - 1) >> PAGE_CACHE_SHIFT;
+
+       pagevec_init(&pvec, 0);
+       do {
+               int i, num;
+               unsigned long nr_pages;
+
+               /* [index, end] is inclusive: always ask for at least 1 page */
+               num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
+               nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+                                         (pgoff_t)num);
+               if (nr_pages == 0)
+                       break;
+
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+                       struct buffer_head *bh, *head;
+
+                       /*
+                        * If the current offset is smaller than the page
+                        * offset, nothing is cached at the current offset:
+                        * that is a hole.
+                        */
+                       if (origin == SEEK_HOLE && lastoff < endoff &&
+                           lastoff < page_offset(page)) {
+                               found = 1;
+                               *offset = lastoff;
+                               goto out;
+                       }
+
+                       /* Pages beyond the extent tell us nothing about it */
+                       if (page->index > end)
+                               goto out;
+
+                       lock_page(page);
+
+                       if (unlikely(page->mapping != inode->i_mapping)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (!page_has_buffers(page)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       lastoff = page_offset(page);
+                       bh = head = page_buffers(page);
+                       do {
+                               /*
+                                * Buffers that end before startoff must not
+                                * decide the result for startoff.
+                                */
+                               if (lastoff + bh->b_size <= startoff)
+                                       goto next;
+                               if (buffer_uptodate(bh) ||
+                                   buffer_unwritten(bh)) {
+                                       if (origin == SEEK_DATA)
+                                               found = 1;
+                               } else {
+                                       if (origin == SEEK_HOLE)
+                                               found = 1;
+                               }
+                               if (found) {
+                                       *offset = max_t(loff_t,
+                                               startoff, lastoff);
+                                       unlock_page(page);
+                                       goto out;
+                               }
+next:
+                               lastoff += bh->b_size;
+                               bh = bh->b_this_page;
+                       } while (bh != head);
+
+                       lastoff = page_offset(page) + PAGE_SIZE;
+                       unlock_page(page);
+               }
+
+               /*
+                * The no. of pages is less than our desired: there is
+                * nothing more cached, so we are done scanning.
+                */
+               if (nr_pages < num)
+                       break;
+
+               index = pvec.pages[i - 1]->index + 1;
+               pagevec_release(&pvec);
+       } while (index <= end);
+
+       /*
+        * We ran out of cached pages before reaching the end of the extent,
+        * so the rest of the range, starting at lastoff, is a hole.
+        */
+       if (origin == SEEK_HOLE && lastoff < endoff) {
+               found = 1;
+               *offset = lastoff;
+       }
+out:
+       pagevec_release(&pvec);
+       return found;
+}
+
+/*
+ * ext4_seek_data() retrieves the offset for SEEK_DATA.
+ *
+ * Starting at the block containing @offset, look for the first block that is
+ * mapped and written, covered by an extent returned by
+ * ext4_es_find_extent() (a delayed extent), or unwritten but with data in
+ * the page cache.  The search runs under inode->i_mutex.
+ *
+ * @file:    file being seeked; f_pos/f_version are updated on success
+ * @offset:  byte offset to start searching from
+ * @maxsize: largest offset the caller allows
+ *
+ * Returns the data offset, -ENXIO if @offset is at/after i_size or no data
+ * was found before i_size, or -EINVAL if the result is negative (without
+ * FMODE_UNSIGNED_OFFSET) or larger than @maxsize.
+ */
+static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
+{
+       struct inode *inode = file->f_mapping->host;
+       struct ext4_map_blocks map;
+       struct extent_status es;
+       ext4_lblk_t start, last, end;
+       loff_t dataoff, isize;
+       int blkbits;
+       int ret = 0;
+
+       mutex_lock(&inode->i_mutex);
+
+       isize = i_size_read(inode);
+       if (offset >= isize) {
+               mutex_unlock(&inode->i_mutex);
+               return -ENXIO;
+       }
+
+       blkbits = inode->i_sb->s_blocksize_bits;
+       start = offset >> blkbits;
+       last = start;
+       end = isize >> blkbits;
+       dataoff = offset;
+
+       do {
+               map.m_lblk = last;
+               map.m_len = end - last + 1;
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+                       /*
+                        * Keep the caller's exact offset while still in the
+                        * starting block.  Widen 'last' before shifting so
+                        * the byte offset cannot wrap in the block type.
+                        */
+                       if (last != start)
+                               dataoff = (loff_t)last << blkbits;
+                       break;
+               }
+
+               /*
+                * If there is a delay extent at this offset,
+                * it will be as a data.
+                */
+               es.start = last;
+               (void)ext4_es_find_extent(inode, &es);
+               if (last >= es.start &&
+                   last < es.start + es.len) {
+                       if (last != start)
+                               dataoff = (loff_t)last << blkbits;
+                       break;
+               }
+
+               /*
+                * If there is a unwritten extent at this offset,
+                * it will be as a data or a hole according to page
+                * cache that has data or not.
+                */
+               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+                       int unwritten;
+                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+                                                             &map, &dataoff);
+                       if (unwritten)
+                               break;
+               }
+
+               /* Nothing at this block: try the next one */
+               last++;
+               dataoff = (loff_t)last << blkbits;
+       } while (last <= end);
+
+       mutex_unlock(&inode->i_mutex);
+
+       if (dataoff > isize)
+               return -ENXIO;
+
+       if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+               return -EINVAL;
+       if (dataoff > maxsize)
+               return -EINVAL;
+
+       if (dataoff != file->f_pos) {
+               file->f_pos = dataoff;
+               file->f_version = 0;
+       }
+
+       return dataoff;
+}
+
+/*
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ *
+ * Starting at the block containing @offset, skip over mapped written
+ * extents, extents returned by ext4_es_find_extent() (delayed extents) and
+ * unwritten extents in which the page cache shows no hole, until a hole is
+ * found or the search passes the block containing i_size.  The search runs
+ * under inode->i_mutex.
+ *
+ * @file:    file being seeked; f_pos/f_version are updated on success
+ * @offset:  byte offset to start searching from
+ * @maxsize: largest offset the caller allows
+ *
+ * Returns the hole offset (clamped to i_size), -ENXIO if @offset is at/after
+ * i_size, or -EINVAL if the result is negative (without
+ * FMODE_UNSIGNED_OFFSET) or larger than @maxsize.
+ */
+static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
+{
+       struct inode *inode = file->f_mapping->host;
+       struct ext4_map_blocks map;
+       struct extent_status es;
+       ext4_lblk_t start, last, end;
+       loff_t holeoff, isize;
+       int blkbits;
+       int ret = 0;
+
+       mutex_lock(&inode->i_mutex);
+
+       isize = i_size_read(inode);
+       if (offset >= isize) {
+               mutex_unlock(&inode->i_mutex);
+               return -ENXIO;
+       }
+
+       blkbits = inode->i_sb->s_blocksize_bits;
+       start = offset >> blkbits;
+       last = start;
+       end = isize >> blkbits;
+       holeoff = offset;
+
+       do {
+               map.m_lblk = last;
+               map.m_len = end - last + 1;
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+               if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+                       /*
+                        * Written data: skip the whole mapped run.  Widen
+                        * 'last' before shifting so the byte offset cannot
+                        * wrap in the block type.
+                        */
+                       last += ret;
+                       holeoff = (loff_t)last << blkbits;
+                       continue;
+               }
+
+               /*
+                * If there is a delay extent at this offset,
+                * we will skip this extent.
+                */
+               es.start = last;
+               (void)ext4_es_find_extent(inode, &es);
+               if (last >= es.start &&
+                   last < es.start + es.len) {
+                       last = es.start + es.len;
+                       holeoff = (loff_t)last << blkbits;
+                       continue;
+               }
+
+               /*
+                * If there is a unwritten extent at this offset,
+                * it will be as a data or a hole according to page
+                * cache that has data or not.
+                */
+               if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+                       int unwritten;
+                       unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+                                                             &map, &holeoff);
+                       if (!unwritten) {
+                               /* No hole in this extent: skip all of it */
+                               last += ret;
+                               holeoff = (loff_t)last << blkbits;
+                               continue;
+                       }
+               }
+
+               /* find a hole */
+               break;
+       } while (last <= end);
+
+       mutex_unlock(&inode->i_mutex);
+
+       /* The end of the file always counts as a hole */
+       if (holeoff > isize)
+               holeoff = isize;
+
+       if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+               return -EINVAL;
+       if (holeoff > maxsize)
+               return -EINVAL;
+
+       if (holeoff != file->f_pos) {
+               file->f_pos = holeoff;
+               file->f_version = 0;
+       }
+
+       return holeoff;
+}
+
+/*
  * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
  * by calling generic_file_llseek_size() with the appropriate maxbytes
  * value for each.
@@ -300,8 +619,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
        else
                maxbytes = inode->i_sb->s_maxbytes;
 
-       return generic_file_llseek_size(file, offset, origin,
-                                       maxbytes, i_size_read(inode));
+       switch (origin) {
+       case SEEK_SET:
+       case SEEK_CUR:
+       case SEEK_END:
+               return generic_file_llseek_size(file, offset, origin,
+                                               maxbytes, i_size_read(inode));
+       case SEEK_DATA:
+               return ext4_seek_data(file, offset, maxbytes);
+       case SEEK_HOLE:
+               return ext4_seek_hole(file, offset, maxbytes);
+       }
+
+       return -EINVAL;
 }
 
 const struct file_operations ext4_file_operations = {