Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 30 Nov 2019 18:53:02 +0000 (10:53 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 30 Nov 2019 18:53:02 +0000 (10:53 -0800)
Pull ext4 updates from Ted Ts'o:
 "This merge window saw the the following new featuers added to ext4:

   - Direct I/O via iomap (required the iomap-for-next branch from
     Darrick as a prereq).

   - Support for using dioread_nolock where the block size < page size.

   - Support for encryption for file systems where the block size < page
     size.

   - Rework of journal credits handling so a revoke-heavy workload will
     not cause the journal to run out of space.

   - Replacement of bit-spinlocks with spinlocks in jbd2.

  Also included were some bug fixes and cleanups, mostly addressing
  corner cases found in fuzzed file systems and improving error path
  handling"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (59 commits)
  ext4: work around deleting a file with i_nlink == 0 safely
  ext4: add more paranoia checking in ext4_expand_extra_isize handling
  jbd2: make jbd2_handle_buffer_credits() handle reserved handles
  ext4: fix a bug in ext4_wait_for_tail_page_commit
  ext4: bio_alloc with __GFP_DIRECT_RECLAIM never fails
  ext4: code cleanup for get_next_id
  ext4: fix leak of quota reservations
  ext4: remove unused variable warning in parse_options()
  ext4: Enable encryption for subpage-sized blocks
  fs/buffer.c: support fscrypt in block_read_full_page()
  ext4: Add error handling for io_end_vec struct allocation
  jbd2: Fine tune estimate of necessary descriptor blocks
  jbd2: Provide trace event for handle restarts
  ext4: Reserve revoke credits for freed blocks
  jbd2: Make credit checking more strict
  jbd2: Rename h_buffer_credits to h_total_credits
  jbd2: Reserve space for revoke descriptor blocks
  jbd2: Drop jbd2_space_needed()
  jbd2: Account descriptor blocks into t_outstanding_credits
  jbd2: Factor out common parts of stopping and restarting a handle
  ...

Documentation/filesystems/fscrypt.rst
fs/ext4/ext4.h
fs/ext4/inode.c
fs/ext4/super.c
fs/jbd2/transaction.c
fs/ocfs2/journal.c
include/linux/jbd2.h

@@@ -256,8 -256,13 +256,8 @@@ alternative master keys or to support r
  the master keys may be wrapped in userspace, e.g. as is done by the
  `fscrypt <https://github.com/google/fscrypt>`_ tool.
  
 -Including the inode number in the IVs was considered.  However, it was
 -rejected as it would have prevented ext4 filesystems from being
 -resized, and by itself still wouldn't have been sufficient to prevent
 -the same key from being directly reused for both XTS and CTS-CBC.
 -
 -DIRECT_KEY and per-mode keys
 -----------------------------
 +DIRECT_KEY policies
 +-------------------
  
  The Adiantum encryption mode (see `Encryption modes and usage`_) is
  suitable for both contents and filenames encryption, and it accepts
@@@ -280,21 -285,6 +280,21 @@@ IV.  Moreover
    key derived using the KDF.  Users may use the same master key for
    other v2 encryption policies.
  
 +IV_INO_LBLK_64 policies
 +-----------------------
 +
 +When FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 is set in the fscrypt policy,
 +the encryption keys are derived from the master key, encryption mode
 +number, and filesystem UUID.  This normally results in all files
 +protected by the same master key sharing a single contents encryption
 +key and a single filenames encryption key.  To still encrypt different
 +files' data differently, inode numbers are included in the IVs.
 +Consequently, shrinking the filesystem may not be allowed.
 +
 +This format is optimized for use with inline encryption hardware
 +compliant with the UFS or eMMC standards, which support only 64 IV
 +bits per I/O request and may have only a small number of keyslots.
 +
  Key identifiers
  ---------------
  
@@@ -318,9 -308,8 +318,9 @@@ If unsure, you should use the (AES-256-
  
  AES-128-CBC was added only for low-powered embedded devices with
  crypto accelerators such as CAAM or CESA that do not support XTS.  To
 -use AES-128-CBC, CONFIG_CRYPTO_SHA256 (or another SHA-256
 -implementation) must be enabled so that ESSIV can be used.
 +use AES-128-CBC, CONFIG_CRYPTO_ESSIV and CONFIG_CRYPTO_SHA256 (or
 +another SHA-256 implementation) must be enabled so that ESSIV can be
 +used.
  
  Adiantum is a (primarily) stream cipher-based mode that is fast even
  on CPUs without dedicated crypto instructions.  It's also a true
@@@ -342,8 -331,8 +342,8 @@@ Contents encryptio
  -------------------
  
  For file contents, each filesystem block is encrypted independently.
- Currently, only the case where the filesystem block size is equal to
- the system's page size (usually 4096 bytes) is supported.
+ Starting from Linux kernel 5.5, encryption of filesystems with block
+ size less than system's page size is supported.
  
  Each block's IV is set to the logical block number within the file as
  a little endian number, except that:
    is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
    of the file's data encryption key.
  
 -- In the "direct key" configuration (FSCRYPT_POLICY_FLAG_DIRECT_KEY
 -  set in the fscrypt_policy), the file's nonce is also appended to the
 -  IV.  Currently this is only allowed with the Adiantum encryption
 -  mode.
 +- With `DIRECT_KEY policies`_, the file's nonce is appended to the IV.
 +  Currently this is only allowed with the Adiantum encryption mode.
 +
 +- With `IV_INO_LBLK_64 policies`_, the logical block number is limited
 +  to 32 bits and is placed in bits 0-31 of the IV.  The inode number
 +  (which is also limited to 32 bits) is placed in bits 32-63.
 +
 +Note that because file logical block numbers are included in the IVs,
 +filesystems must enforce that blocks are never shifted around within
 +encrypted files, e.g. via "collapse range" or "insert range".
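As an illustration (not from this patch set), a minimal sketch of the
IV_INO_LBLK_64 IV layout described above; the helper name is
hypothetical:

    #include <stdint.h>

    /*
     * Hypothetical helper showing the IV_INO_LBLK_64 layout: logical
     * block number in bits 0-31, inode number in bits 32-63.  Both
     * values must fit in 32 bits for this policy to be usable.
     */
    static inline uint64_t iv_ino_lblk_64_pack(uint32_t ino, uint32_t lblk)
    {
            return ((uint64_t)ino << 32) | lblk;
    }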
  
  Filenames encryption
  --------------------
@@@ -371,10 -354,10 +371,10 @@@ the requirements to retain support for 
  filenames of up to 255 bytes, the same IV is used for every filename
  in a directory.
  
 -However, each encrypted directory still uses a unique key; or
 -alternatively (for the "direct key" configuration) has the file's
 -nonce included in the IVs.  Thus, IV reuse is limited to within a
 -single directory.
 +However, each encrypted directory still uses a unique key, or
 +alternatively has the file's nonce (for `DIRECT_KEY policies`_) or
 +inode number (for `IV_INO_LBLK_64 policies`_) included in the IVs.
 +Thus, IV reuse is limited to within a single directory.
  
  With CTS-CBC, the IV reuse means that when the plaintext filenames
  share a common prefix at least as long as the cipher block size (16
@@@ -448,15 -431,12 +448,15 @@@ This structure must be initialized as f
    (1) for ``contents_encryption_mode`` and FSCRYPT_MODE_AES_256_CTS
    (4) for ``filenames_encryption_mode``.
  
 -- ``flags`` must contain a value from ``<linux/fscrypt.h>`` which
 -  identifies the amount of NUL-padding to use when encrypting
 -  filenames.  If unsure, use FSCRYPT_POLICY_FLAGS_PAD_32 (0x3).
 -  Additionally, if the encryption modes are both
 -  FSCRYPT_MODE_ADIANTUM, this can contain
 -  FSCRYPT_POLICY_FLAG_DIRECT_KEY; see `DIRECT_KEY and per-mode keys`_.
 +- ``flags`` contains optional flags from ``<linux/fscrypt.h>``:
 +
 +  - FSCRYPT_POLICY_FLAGS_PAD_*: The amount of NUL padding to use when
 +    encrypting filenames.  If unsure, use FSCRYPT_POLICY_FLAGS_PAD_32
 +    (0x3).
 +  - FSCRYPT_POLICY_FLAG_DIRECT_KEY: See `DIRECT_KEY policies`_.
 +  - FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64: See `IV_INO_LBLK_64
 +    policies`_.  This is mutually exclusive with DIRECT_KEY and is not
 +    supported on v1 policies.
  
  - For v2 encryption policies, ``__reserved`` must be zeroed.
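As a usage sketch (not part of this diff), the policy struct and flags
above can be applied from userspace roughly as follows, assuming the
16-byte key identifier was already obtained via
FS_IOC_ADD_ENCRYPTION_KEY:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fscrypt.h>

    /*
     * Sketch: apply a v2 policy with the IV_INO_LBLK_64 flag to an
     * (empty) directory.  All constants come from <linux/fscrypt.h>.
     */
    static int set_iv_ino_lblk_64_policy(int dirfd, const __u8 *key_id)
    {
            struct fscrypt_policy_v2 policy;

            memset(&policy, 0, sizeof(policy));  /* zeroes __reserved too */
            policy.version = FSCRYPT_POLICY_V2;
            policy.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
            policy.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
            policy.flags = FSCRYPT_POLICY_FLAGS_PAD_32 |
                           FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64;
            memcpy(policy.master_key_identifier, key_id,
                   FSCRYPT_KEY_IDENTIFIER_SIZE);

            return ioctl(dirfd, FS_IOC_SET_ENCRYPTION_POLICY, &policy);
    }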
  
@@@ -1109,7 -1089,7 +1109,7 @@@ policy structs (see `Setting an encrypt
  context structs also contain a nonce.  The nonce is randomly generated
  by the kernel and is used as KDF input or as a tweak to cause
  different files to be encrypted differently; see `Per-file keys`_ and
 -`DIRECT_KEY and per-mode keys`_.
 +`DIRECT_KEY policies`_.
  
  Data path changes
  -----------------
diff --combined fs/ext4/ext4.h
@@@ -198,6 -198,12 +198,12 @@@ struct ext4_system_blocks 
   */
  #define       EXT4_IO_END_UNWRITTEN   0x0001
  
+ struct ext4_io_end_vec {
+       struct list_head list;          /* list of io_end_vec */
+       loff_t offset;                  /* offset in the file */
+       ssize_t size;                   /* size of the extent */
+ };
  /*
   * For converting unwritten extents on a work queue. 'handle' is used for
   * buffered writeback.
@@@ -211,8 -217,7 +217,7 @@@ typedef struct ext4_io_end 
                                                 * bios covering the extent */
        unsigned int            flag;           /* unwritten or not */
        atomic_t                count;          /* reference counter */
-       loff_t                  offset;         /* offset in the file */
-       ssize_t                 size;           /* size of the extent */
+       struct list_head        list_vec;       /* list of ext4_io_end_vec */
  } ext4_io_end_t;
  
  struct ext4_io_submit {
@@@ -1579,7 -1584,6 +1584,6 @@@ enum 
        EXT4_STATE_NO_EXPAND,           /* No space for expansion */
        EXT4_STATE_DA_ALLOC_CLOSE,      /* Alloc DA blks on close */
        EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
-       EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
        EXT4_STATE_NEWENTRY,            /* File just added to dir */
        EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
        EXT4_STATE_EXT_PRECACHED,       /* extents have been precached */
@@@ -1678,7 -1682,6 +1682,7 @@@ static inline bool ext4_verity_in_progr
  #define EXT4_FEATURE_COMPAT_RESIZE_INODE      0x0010
  #define EXT4_FEATURE_COMPAT_DIR_INDEX         0x0020
  #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2     0x0200
 +#define EXT4_FEATURE_COMPAT_STABLE_INODES     0x0800
  
  #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER   0x0001
  #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE     0x0002
@@@ -1780,7 -1783,6 +1784,7 @@@ EXT4_FEATURE_COMPAT_FUNCS(xattr,                EXT_A
  EXT4_FEATURE_COMPAT_FUNCS(resize_inode,               RESIZE_INODE)
  EXT4_FEATURE_COMPAT_FUNCS(dir_index,          DIR_INDEX)
  EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,      SPARSE_SUPER2)
 +EXT4_FEATURE_COMPAT_FUNCS(stable_inodes,      STABLE_INODES)
  
  EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,    SPARSE_SUPER)
  EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,      LARGE_FILE)
@@@ -2562,8 -2564,6 +2566,6 @@@ int ext4_get_block_unwritten(struct ino
                             struct buffer_head *bh_result, int create);
  int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
- int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                      struct buffer_head *bh_result, int create);
  int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create);
  int ext4_walk_page_buffers(handle_t *handle,
@@@ -2606,7 -2606,6 +2608,6 @@@ extern int ext4_can_truncate(struct ino
  extern int ext4_truncate(struct inode *);
  extern int ext4_break_layouts(struct inode *);
  extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
- extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
  extern void ext4_set_inode_flags(struct inode *);
  extern int ext4_alloc_da_blocks(struct inode *inode);
  extern void ext4_set_aops(struct inode *inode);
@@@ -3266,6 -3265,8 +3267,8 @@@ extern long ext4_fallocate(struct file 
                          loff_t len);
  extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                          loff_t offset, ssize_t len);
+ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
+                                            ext4_io_end_t *io_end);
  extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
  extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@@ -3298,6 -3299,10 +3301,10 @@@ extern int ext4_swap_extents(handle_t *
                             ext4_lblk_t lblk2,  ext4_lblk_t count,
                             int mark_unwritten,int *err);
  extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
+ extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+                                      int check_cred, int restart_cred,
+                                      int revoke_cred);
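For illustration only (not from this series), a hypothetical caller of
ext4_datasem_ensure_credits() as declared above; the credit value 7 and
the use of EXT4_FREE_BLOCKS_METADATA are arbitrary choices for the
sketch:

    /*
     * Hypothetical sketch: free 'count' metadata blocks one at a time,
     * topping the handle up before each free.  The final argument
     * reserves one revoke record per freed block, matching the
     * revoke-credit reservation scheme added in this series.
     */
    static int example_free_range(handle_t *handle, struct inode *inode,
                                  ext4_fsblk_t first, unsigned int count)
    {
            unsigned int i;
            int err;

            for (i = 0; i < count; i++) {
                    err = ext4_datasem_ensure_credits(handle, inode,
                                                      7, 7, 1);
                    if (err < 0)
                            return err;
                    ext4_free_blocks(handle, inode, NULL, first + i, 1,
                                     EXT4_FREE_BLOCKS_METADATA);
            }
            return 0;
    }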
  
  /* move_extent.c */
  extern void ext4_double_down_write_data_sem(struct inode *first,
@@@ -3324,6 -3329,8 +3331,8 @@@ extern int ext4_bio_write_page(struct e
                               int len,
                               struct writeback_control *wbc,
                               bool keep_towrite);
+ extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+ extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
  
  /* mmp.c */
  extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
@@@ -3381,6 -3388,7 +3390,7 @@@ static inline void ext4_clear_io_unwrit
  }
  
  extern const struct iomap_ops ext4_iomap_ops;
+ extern const struct iomap_ops ext4_iomap_report_ops;
  
  static inline int ext4_buffer_uptodate(struct buffer_head *bh)
  {
diff --combined fs/ext4/inode.c
@@@ -164,39 -164,18 +164,18 @@@ int ext4_inode_is_fast_symlink(struct i
  }
  
  /*
-  * Restart the transaction associated with *handle.  This does a commit,
-  * so before we call here everything must be consistently dirtied against
-  * this transaction.
-  */
- int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
-                                int nblocks)
- {
-       int ret;
-       /*
-        * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
-        * moment, get_block can be called only for blocks inside i_size since
-        * page cache has been already dropped and writes are blocked by
-        * i_mutex. So we can safely drop the i_data_sem here.
-        */
-       BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       jbd_debug(2, "restarting handle %p\n", handle);
-       up_write(&EXT4_I(inode)->i_data_sem);
-       ret = ext4_journal_restart(handle, nblocks);
-       down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
-       return ret;
- }
- /*
   * Called at the last iput() if i_nlink is zero.
   */
  void ext4_evict_inode(struct inode *inode)
  {
        handle_t *handle;
        int err;
-       int extra_credits = 3;
+       /*
+        * Credits for final inode cleanup and freeing:
+        * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
+        * (xattr block freeing), bitmap, group descriptor (inode freeing)
+        */
+       int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
  
        trace_ext4_evict_inode(inode);
        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
  
+       /*
+        * Block bitmap, group descriptor, and inode are accounted in both
+        * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+        */
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
-                                ext4_blocks_for_truncate(inode)+extra_credits);
+                        ext4_blocks_for_truncate(inode) + extra_credits - 3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
@@@ -827,136 -810,6 +810,6 @@@ int ext4_get_block_unwritten(struct ino
  #define DIO_MAX_BLOCKS 4096
  
  /*
-  * Get blocks function for the cases that need to start a transaction -
-  * generally difference cases of direct IO and DAX IO. It also handles retries
-  * in case of ENOSPC.
-  */
- static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
-                               struct buffer_head *bh_result, int flags)
- {
-       int dio_credits;
-       handle_t *handle;
-       int retries = 0;
-       int ret;
-       /* Trim mapping request to maximum we can map at once for DIO */
-       if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
-               bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
-       dio_credits = ext4_chunk_trans_blocks(inode,
-                                     bh_result->b_size >> inode->i_blkbits);
- retry:
-       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
-       ret = _ext4_get_block(inode, iblock, bh_result, flags);
-       ext4_journal_stop(handle);
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-       return ret;
- }
- /* Get block function for DIO reads and writes to inodes without extents */
- int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                      struct buffer_head *bh, int create)
- {
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-       if (!create)
-               return _ext4_get_block(inode, iblock, bh, 0);
-       return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
- }
- /*
-  * Get block function for AIO DIO writes when we create unwritten extent if
-  * blocks are not allocated yet. The extent will be converted to written
-  * after IO is complete.
-  */
- static int ext4_dio_get_block_unwritten_async(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
- {
-       int ret;
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-       /*
-        * When doing DIO using unwritten extents, we need io_end to convert
-        * unwritten extents to written on IO completion. We allocate io_end
-        * once we spot unwritten extent and store it in b_private. Generic
-        * DIO code keeps b_private set and furthermore passes the value to
-        * our completion callback in 'private' argument.
-        */
-       if (!ret && buffer_unwritten(bh_result)) {
-               if (!bh_result->b_private) {
-                       ext4_io_end_t *io_end;
-                       io_end = ext4_init_io_end(inode, GFP_KERNEL);
-                       if (!io_end)
-                               return -ENOMEM;
-                       bh_result->b_private = io_end;
-                       ext4_set_io_unwritten_flag(inode, io_end);
-               }
-               set_buffer_defer_completion(bh_result);
-       }
-       return ret;
- }
- /*
-  * Get block function for non-AIO DIO writes when we create unwritten extent if
-  * blocks are not allocated yet. The extent will be converted to written
-  * after IO is complete by ext4_direct_IO_write().
-  */
- static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
- {
-       int ret;
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-       /*
-        * Mark inode as having pending DIO writes to unwritten extents.
-        * ext4_direct_IO_write() checks this flag and converts extents to
-        * written.
-        */
-       if (!ret && buffer_unwritten(bh_result))
-               ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-       return ret;
- }
- static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
-                  struct buffer_head *bh_result, int create)
- {
-       int ret;
-       ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-       ret = _ext4_get_block(inode, iblock, bh_result, 0);
-       /*
-        * Blocks should have been preallocated! ext4_file_write_iter() checks
-        * that.
-        */
-       WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-       return ret;
- }
- /*
   * `handle' can be NULL if create is zero
   */
  struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@@ -2341,6 -2194,79 +2194,79 @@@ static int mpage_process_page_bufs(stru
  }
  
  /*
+  * mpage_process_page - update page buffers corresponding to changed extent and
+  *                   may submit fully mapped page for IO
+  *
+  * @mpd               - description of extent to map, on return next extent to map
+  * @m_lblk    - logical block mapping.
+  * @m_pblk    - corresponding physical mapping.
+  * @map_bh    - determines on return whether this page requires any further
+  *              mapping or not.
+  * Scan given page buffers corresponding to changed extent and update buffer
+  * state according to new extent state.
+  * We map delalloc buffers to their physical location, clear unwritten bits.
+  * If the given page is not fully mapped, we update @map to the next extent in
+  * the given page that needs mapping & return @map_bh as true.
+  */
+ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+                             ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+                             bool *map_bh)
+ {
+       struct buffer_head *head, *bh;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       ext4_lblk_t lblk = *m_lblk;
+       ext4_fsblk_t pblock = *m_pblk;
+       int err = 0;
+       int blkbits = mpd->inode->i_blkbits;
+       ssize_t io_end_size = 0;
+       struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+       bh = head = page_buffers(page);
+       do {
+               if (lblk < mpd->map.m_lblk)
+                       continue;
+               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+                       /*
+                        * Buffer after end of mapped extent.
+                        * Find next buffer in the page to map.
+                        */
+                       mpd->map.m_len = 0;
+                       mpd->map.m_flags = 0;
+                       io_end_vec->size += io_end_size;
+                       io_end_size = 0;
+                       err = mpage_process_page_bufs(mpd, head, bh, lblk);
+                       if (err > 0)
+                               err = 0;
+                       if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+                               io_end_vec = ext4_alloc_io_end_vec(io_end);
+                               if (IS_ERR(io_end_vec)) {
+                                       err = PTR_ERR(io_end_vec);
+                                       goto out;
+                               }
+                               io_end_vec->offset = mpd->map.m_lblk << blkbits;
+                       }
+                       *map_bh = true;
+                       goto out;
+               }
+               if (buffer_delay(bh)) {
+                       clear_buffer_delay(bh);
+                       bh->b_blocknr = pblock++;
+               }
+               clear_buffer_unwritten(bh);
+               io_end_size += (1 << blkbits);
+       } while (lblk++, (bh = bh->b_this_page) != head);
+       io_end_vec->size += io_end_size;
+       io_end_size = 0;
+       *map_bh = false;
+ out:
+       *m_lblk = lblk;
+       *m_pblk = pblock;
+       return err;
+ }
+ /*
   * mpage_map_buffers - update buffers corresponding to changed extent and
   *                   submit fully mapped pages for IO
   *
@@@ -2359,12 -2285,12 +2285,12 @@@ static int mpage_map_and_submit_buffers
        struct pagevec pvec;
        int nr_pages, i;
        struct inode *inode = mpd->inode;
-       struct buffer_head *head, *bh;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        pgoff_t start, end;
        ext4_lblk_t lblk;
-       sector_t pblock;
+       ext4_fsblk_t pblock;
        int err;
+       bool map_bh = false;
  
        start = mpd->map.m_lblk >> bpp_bits;
        end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
  
-                       bh = head = page_buffers(page);
-                       do {
-                               if (lblk < mpd->map.m_lblk)
-                                       continue;
-                               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
-                                       /*
-                                        * Buffer after end of mapped extent.
-                                        * Find next buffer in the page to map.
-                                        */
-                                       mpd->map.m_len = 0;
-                                       mpd->map.m_flags = 0;
-                                       /*
-                                        * FIXME: If dioread_nolock supports
-                                        * blocksize < pagesize, we need to make
-                                        * sure we add size mapped so far to
-                                        * io_end->size as the following call
-                                        * can submit the page for IO.
-                                        */
-                                       err = mpage_process_page_bufs(mpd, head,
-                                                                     bh, lblk);
-                                       pagevec_release(&pvec);
-                                       if (err > 0)
-                                               err = 0;
-                                       return err;
-                               }
-                               if (buffer_delay(bh)) {
-                                       clear_buffer_delay(bh);
-                                       bh->b_blocknr = pblock++;
-                               }
-                               clear_buffer_unwritten(bh);
-                       } while (lblk++, (bh = bh->b_this_page) != head);
+                       err = mpage_process_page(mpd, page, &lblk, &pblock,
+                                                &map_bh);
                        /*
-                        * FIXME: This is going to break if dioread_nolock
-                        * supports blocksize < pagesize as we will try to
-                        * convert potentially unmapped parts of inode.
+                        * If map_bh is true, means page may require further bh
+                        * mapping, or maybe the page was submitted for IO.
+                        * So we return to call further extent mapping.
                         */
-                       mpd->io_submit.io_end->size += PAGE_SIZE;
+                       if (err < 0 || map_bh == true)
+                               goto out;
                        /* Page fully mapped - let IO run! */
                        err = mpage_submit_page(mpd, page);
-                       if (err < 0) {
-                               pagevec_release(&pvec);
-                               return err;
-                       }
+                       if (err < 0)
+                               goto out;
                }
                pagevec_release(&pvec);
        }
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;
+ out:
+       pagevec_release(&pvec);
+       return err;
  }
  
  static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@@ -2510,9 -2408,13 +2408,13 @@@ static int mpage_map_and_submit_extent(
        int err;
        loff_t disksize;
        int progress = 0;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       struct ext4_io_end_vec *io_end_vec;
  
-       mpd->io_submit.io_end->offset =
-                               ((loff_t)map->m_lblk) << inode->i_blkbits;
+       io_end_vec = ext4_alloc_io_end_vec(io_end);
+       if (IS_ERR(io_end_vec))
+               return PTR_ERR(io_end_vec);
+       io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
@@@ -3406,473 -3308,235 +3308,235 @@@ static bool ext4_inode_datasync_dirty(s
        return inode->i_state & I_DIRTY_DATASYNC;
  }
  
- static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-               unsigned flags, struct iomap *iomap, struct iomap *srcmap)
+ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
+                          struct ext4_map_blocks *map, loff_t offset,
+                          loff_t length)
  {
-       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned int blkbits = inode->i_blkbits;
-       unsigned long first_block, last_block;
-       struct ext4_map_blocks map;
-       bool delalloc = false;
-       int ret;
-       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
-               return -EINVAL;
-       first_block = offset >> blkbits;
-       last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
-                          EXT4_MAX_LOGICAL_BLOCK);
-       if (flags & IOMAP_REPORT) {
-               if (ext4_has_inline_data(inode)) {
-                       ret = ext4_inline_data_iomap(inode, iomap);
-                       if (ret != -EAGAIN) {
-                               if (ret == 0 && offset >= iomap->length)
-                                       ret = -ENOENT;
-                               return ret;
-                       }
-               }
-       } else {
-               if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
-                       return -ERANGE;
-       }
-       map.m_lblk = first_block;
-       map.m_len = last_block - first_block + 1;
-       if (flags & IOMAP_REPORT) {
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret < 0)
-                       return ret;
-               if (ret == 0) {
-                       ext4_lblk_t end = map.m_lblk + map.m_len - 1;
-                       struct extent_status es;
-                       ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
-                                                 map.m_lblk, end, &es);
-                       if (!es.es_len || es.es_lblk > end) {
-                               /* entire range is a hole */
-                       } else if (es.es_lblk > map.m_lblk) {
-                               /* range starts with a hole */
-                               map.m_len = es.es_lblk - map.m_lblk;
-                       } else {
-                               ext4_lblk_t offs = 0;
-                               if (es.es_lblk < map.m_lblk)
-                                       offs = map.m_lblk - es.es_lblk;
-                               map.m_lblk = es.es_lblk + offs;
-                               map.m_len = es.es_len - offs;
-                               delalloc = true;
-                       }
-               }
-       } else if (flags & IOMAP_WRITE) {
-               int dio_credits;
-               handle_t *handle;
-               int retries = 0;
-               /* Trim mapping request to maximum we can map at once for DIO */
-               if (map.m_len > DIO_MAX_BLOCKS)
-                       map.m_len = DIO_MAX_BLOCKS;
-               dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- retry:
-               /*
-                * Either we allocate blocks and then we don't get unwritten
-                * extent so we have reserved enough credits, or the blocks
-                * are already allocated and unwritten and in that case
-                * extent conversion fits in the credits as well.
-                */
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-                                           dio_credits);
-               if (IS_ERR(handle))
-                       return PTR_ERR(handle);
-               ret = ext4_map_blocks(handle, inode, &map,
-                                     EXT4_GET_BLOCKS_CREATE_ZERO);
-               if (ret < 0) {
-                       ext4_journal_stop(handle);
-                       if (ret == -ENOSPC &&
-                           ext4_should_retry_alloc(inode->i_sb, &retries))
-                               goto retry;
-                       return ret;
-               }
-               /*
-                * If we added blocks beyond i_size, we need to make sure they
-                * will get truncated if we crash before updating i_size in
-                * ext4_iomap_end(). For faults we don't need to do that (and
-                * even cannot because for orphan list operations inode_lock is
-                * required) - if we happen to instantiate block beyond i_size,
-                * it is because we race with truncate which has already added
-                * the inode to the orphan list.
-                */
-               if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
-                   (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
-                       int err;
-                       err = ext4_orphan_add(handle, inode);
-                       if (err < 0) {
-                               ext4_journal_stop(handle);
-                               return err;
-                       }
-               }
-               ext4_journal_stop(handle);
-       } else {
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret < 0)
-                       return ret;
-       }
+       u8 blkbits = inode->i_blkbits;
  
+       /*
+        * Writes that span EOF might trigger an I/O size update on completion,
+        * so consider them to be dirty for the purpose of O_DSYNC, even if
+        * there is no other metadata changes being made or are pending.
+        */
        iomap->flags = 0;
-       if (ext4_inode_datasync_dirty(inode))
+       if (ext4_inode_datasync_dirty(inode) ||
+           offset + length > i_size_read(inode))
                iomap->flags |= IOMAP_F_DIRTY;
+       if (map->m_flags & EXT4_MAP_NEW)
+               iomap->flags |= IOMAP_F_NEW;
        iomap->bdev = inode->i_sb->s_bdev;
-       iomap->dax_dev = sbi->s_daxdev;
-       iomap->offset = (u64)first_block << blkbits;
-       iomap->length = (u64)map.m_len << blkbits;
+       iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+       iomap->offset = (u64) map->m_lblk << blkbits;
+       iomap->length = (u64) map->m_len << blkbits;
  
-       if (ret == 0) {
-               iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
-               iomap->addr = IOMAP_NULL_ADDR;
+       /*
+        * Flags passed to ext4_map_blocks() for direct I/O writes can result
+        * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
+        * set. In order for any allocated unwritten extents to be converted
+        * into written extents correctly within the ->end_io() handler, we
+        * need to ensure that the iomap->type is set appropriately. Hence, the
+        * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
+        * been set first.
+        */
+       if (map->m_flags & EXT4_MAP_UNWRITTEN) {
+               iomap->type = IOMAP_UNWRITTEN;
+               iomap->addr = (u64) map->m_pblk << blkbits;
+       } else if (map->m_flags & EXT4_MAP_MAPPED) {
+               iomap->type = IOMAP_MAPPED;
+               iomap->addr = (u64) map->m_pblk << blkbits;
        } else {
-               if (map.m_flags & EXT4_MAP_MAPPED) {
-                       iomap->type = IOMAP_MAPPED;
-               } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-                       iomap->type = IOMAP_UNWRITTEN;
-               } else {
-                       WARN_ON_ONCE(1);
-                       return -EIO;
-               }
-               iomap->addr = (u64)map.m_pblk << blkbits;
+               iomap->type = IOMAP_HOLE;
+               iomap->addr = IOMAP_NULL_ADDR;
        }
-       if (map.m_flags & EXT4_MAP_NEW)
-               iomap->flags |= IOMAP_F_NEW;
-       return 0;
  }
  
- static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
-                         ssize_t written, unsigned flags, struct iomap *iomap)
+ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+                           unsigned int flags)
  {
-       int ret = 0;
        handle_t *handle;
-       int blkbits = inode->i_blkbits;
-       bool truncate = false;
+       u8 blkbits = inode->i_blkbits;
+       int ret, dio_credits, m_flags = 0, retries = 0;
  
-       if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-               return 0;
-       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-       if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto orphan_del;
-       }
-       if (ext4_update_inode_size(inode, offset + written))
-               ext4_mark_inode_dirty(handle, inode);
        /*
-        * We may need to truncate allocated but not written blocks beyond EOF.
+        * Trim the mapping request to the maximum value that we can map at
+        * once for direct I/O.
         */
-       if (iomap->offset + iomap->length > 
-           ALIGN(inode->i_size, 1 << blkbits)) {
-               ext4_lblk_t written_blk, end_blk;
+       if (map->m_len > DIO_MAX_BLOCKS)
+               map->m_len = DIO_MAX_BLOCKS;
+       dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
  
-               written_blk = (offset + written) >> blkbits;
-               end_blk = (offset + length) >> blkbits;
-               if (written_blk < end_blk && ext4_can_truncate(inode))
-                       truncate = true;
-       }
+ retry:
        /*
-        * Remove inode from orphan list if we were extending a inode and
-        * everything went fine.
+        * Either we allocate blocks and then don't get an unwritten extent, so
+        * in that case we have reserved enough credits. Or, the blocks are
+        * already allocated and unwritten. In that case, the extent conversion
+        * fits into the credits as well.
         */
-       if (!truncate && inode->i_nlink &&
-           !list_empty(&EXT4_I(inode)->i_orphan))
-               ext4_orphan_del(handle, inode);
-       ext4_journal_stop(handle);
-       if (truncate) {
-               ext4_truncate_failed_write(inode);
- orphan_del:
-               /*
-                * If truncate failed early the inode might still be on the
-                * orphan list; we need to make sure the inode is removed from
-                * the orphan list in that case.
-                */
-               if (inode->i_nlink)
-                       ext4_orphan_del(NULL, inode);
-       }
-       return ret;
- }
- const struct iomap_ops ext4_iomap_ops = {
-       .iomap_begin            = ext4_iomap_begin,
-       .iomap_end              = ext4_iomap_end,
- };
- static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-                           ssize_t size, void *private)
- {
-         ext4_io_end_t *io_end = private;
+       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
  
-       /* if not async direct IO just return */
-       if (!io_end)
-               return 0;
+       /*
+        * DAX and direct I/O are the only two operations that are currently
+        * supported with IOMAP_WRITE.
+        */
+       WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+       if (IS_DAX(inode))
+               m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+       /*
+        * We use i_size instead of i_disksize here because delalloc writeback
+        * can complete at any point during the I/O and subsequently push the
+        * i_disksize out to i_size. This could be beyond where direct I/O is
+        * happening and thus expose allocated blocks to direct I/O reads.
+        */
+       else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+               m_flags = EXT4_GET_BLOCKS_CREATE;
+       else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
  
-       ext_debug("ext4_end_io_dio(): io_end 0x%p "
-                 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-                 io_end, io_end->inode->i_ino, iocb, offset, size);
+       ret = ext4_map_blocks(handle, inode, map, m_flags);
  
        /*
-        * Error during AIO DIO. We cannot convert unwritten extents as the
-        * data was not written. Just clear the unwritten flag and drop io_end.
+        * We cannot fill holes in indirect tree based inodes as that could
+        * expose stale data in the case of a crash. Use the magic error code
+        * to fallback to buffered I/O.
         */
-       if (size <= 0) {
-               ext4_clear_io_unwritten_flag(io_end);
-               size = 0;
-       }
-       io_end->offset = offset;
-       io_end->size = size;
-       ext4_put_io_end(io_end);
+       if (!m_flags && !ret)
+               ret = -ENOTBLK;
  
-       return 0;
+       ext4_journal_stop(handle);
+       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+               goto retry;
+       return ret;
  }
  
- /*
-  * Handling of direct IO writes.
-  *
-  * For ext4 extent files, ext4 will do direct-io write even to holes,
-  * preallocated extents, and those write extend the file, no need to
-  * fall back to buffered IO.
-  *
-  * For holes, we fallocate those blocks, mark them as unwritten
-  * If those blocks were preallocated, we mark sure they are split, but
-  * still keep the range to write as unwritten.
-  *
-  * The unwritten extents will be converted to written when DIO is completed.
-  * For async direct IO, since the IO may still pending when return, we
-  * set up an end_io call back function, which will do the conversion
-  * when async direct IO completed.
-  *
-  * If the O_DIRECT write will extend the file then add this inode to the
-  * orphan list.  So recovery will truncate it back to the original size
-  * if the machine crashes during the write.
-  *
-  */
- static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
+ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+               unsigned flags, struct iomap *iomap, struct iomap *srcmap)
  {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       ssize_t ret;
-       loff_t offset = iocb->ki_pos;
-       size_t count = iov_iter_count(iter);
-       int overwrite = 0;
-       get_block_t *get_block_func = NULL;
-       int dio_flags = 0;
-       loff_t final_size = offset + count;
-       int orphan = 0;
-       handle_t *handle;
+       int ret;
+       struct ext4_map_blocks map;
+       u8 blkbits = inode->i_blkbits;
  
-       if (final_size > inode->i_size || final_size > ei->i_disksize) {
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
-               ret = ext4_orphan_add(handle, inode);
-               if (ret) {
-                       ext4_journal_stop(handle);
-                       goto out;
-               }
-               orphan = 1;
-               ext4_update_i_disksize(inode, inode->i_size);
-               ext4_journal_stop(handle);
-       }
+       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+               return -EINVAL;
  
-       BUG_ON(iocb->private == NULL);
+       if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+               return -ERANGE;
  
        /*
-        * Make all waiters for direct IO properly wait also for extent
-        * conversion. This also disallows race between truncate() and
-        * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+        * Calculate the first and last logical blocks respectively.
         */
-       inode_dio_begin(inode);
+       map.m_lblk = offset >> blkbits;
+       map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+                         EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+       if (flags & IOMAP_WRITE)
+               ret = ext4_iomap_alloc(inode, &map, flags);
+       else
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+       if (ret < 0)
+               return ret;
  
-       /* If we do a overwrite dio, i_mutex locking can be released */
-       overwrite = *((int *)iocb->private);
+       ext4_set_iomap(inode, iomap, &map, offset, length);
  
-       if (overwrite)
-               inode_unlock(inode);
+       return 0;
+ }
  
+ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+                         ssize_t written, unsigned flags, struct iomap *iomap)
+ {
        /*
-        * For extent mapped files we could direct write to holes and fallocate.
-        *
-        * Allocated blocks to fill the hole are marked as unwritten to prevent
-        * parallel buffered read to expose the stale data before DIO complete
-        * the data IO.
-        *
-        * As to previously fallocated extents, ext4 get_block will just simply
-        * mark the buffer mapped but still keep the extents unwritten.
-        *
-        * For non AIO case, we will convert those unwritten extents to written
-        * after return back from blockdev_direct_IO. That way we save us from
-        * allocating io_end structure and also the overhead of offloading
-        * the extent convertion to a workqueue.
-        *
-        * For async DIO, the conversion needs to be deferred when the
-        * IO is completed. The ext4 end_io callback function will be
-        * called to take care of the conversion work.  Here for async
-        * case, we allocate an io_end structure to hook to the iocb.
+        * Check to see whether an error occurred while writing out the data to
+        * the allocated blocks. If so, return the magic error code so that we
+        * fallback to buffered I/O and attempt to complete the remainder of
+        * the I/O. Any blocks that may have been allocated in preparation for
+        * the direct I/O will be reused during buffered I/O.
         */
-       iocb->private = NULL;
-       if (overwrite)
-               get_block_func = ext4_dio_get_block_overwrite;
-       else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-                  round_down(offset, i_blocksize(inode)) >= inode->i_size) {
-               get_block_func = ext4_dio_get_block;
-               dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
-       } else if (is_sync_kiocb(iocb)) {
-               get_block_func = ext4_dio_get_block_unwritten_sync;
-               dio_flags = DIO_LOCKING;
-       } else {
-               get_block_func = ext4_dio_get_block_unwritten_async;
-               dio_flags = DIO_LOCKING;
-       }
-       ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-                                  get_block_func, ext4_end_io_dio, NULL,
-                                  dio_flags);
+       if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+               return -ENOTBLK;
  
-       if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
-                                               EXT4_STATE_DIO_UNWRITTEN)) {
-               int err;
-               /*
-                * for non AIO case, since the IO is already
-                * completed, we could do the conversion right here
-                */
-               err = ext4_convert_unwritten_extents(NULL, inode,
-                                                    offset, ret);
-               if (err < 0)
-                       ret = err;
-               ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-       }
+       return 0;
+ }
  
-       inode_dio_end(inode);
-       /* take i_mutex locking again if we do a ovewrite dio */
-       if (overwrite)
-               inode_lock(inode);
+ const struct iomap_ops ext4_iomap_ops = {
+       .iomap_begin            = ext4_iomap_begin,
+       .iomap_end              = ext4_iomap_end,
+ };
  
-       if (ret < 0 && final_size > inode->i_size)
-               ext4_truncate_failed_write(inode);
+ static bool ext4_iomap_is_delalloc(struct inode *inode,
+                                  struct ext4_map_blocks *map)
+ {
+       struct extent_status es;
+       ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
  
-       /* Handle extending of i_size after direct IO write */
-       if (orphan) {
-               int err;
+       ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+                                 map->m_lblk, end, &es);
  
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       /*
-                        * We wrote the data but cannot extend
-                        * i_size. Bail out. In async io case, we do
-                        * not return error here because we have
-                        * already submmitted the corresponding
-                        * bio. Returning error here makes the caller
-                        * think that this IO is done and failed
-                        * resulting in race with bio's completion
-                        * handler.
-                        */
-                       if (!ret)
-                               ret = PTR_ERR(handle);
-                       if (inode->i_nlink)
-                               ext4_orphan_del(NULL, inode);
+       if (!es.es_len || es.es_lblk > end)
+               return false;
  
-                       goto out;
-               }
-               if (inode->i_nlink)
-                       ext4_orphan_del(handle, inode);
-               if (ret > 0) {
-                       loff_t end = offset + ret;
-                       if (end > inode->i_size || end > ei->i_disksize) {
-                               ext4_update_i_disksize(inode, end);
-                               if (end > inode->i_size)
-                                       i_size_write(inode, end);
-                               /*
-                                * We're going to return a positive `ret'
-                                * here due to non-zero-length I/O, so there's
-                                * no way of reporting error returns from
-                                * ext4_mark_inode_dirty() to userspace.  So
-                                * ignore it.
-                                */
-                               ext4_mark_inode_dirty(handle, inode);
-                       }
-               }
-               err = ext4_journal_stop(handle);
-               if (ret == 0)
-                       ret = err;
+       if (es.es_lblk > map->m_lblk) {
+               map->m_len = es.es_lblk - map->m_lblk;
+               return false;
        }
- out:
-       return ret;
- }
  
- static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
- {
-       struct address_space *mapping = iocb->ki_filp->f_mapping;
-       struct inode *inode = mapping->host;
-       size_t count = iov_iter_count(iter);
-       ssize_t ret;
+       offset = map->m_lblk - es.es_lblk;
+       map->m_len = es.es_len - offset;
  
-       /*
-        * Shared inode_lock is enough for us - it protects against concurrent
-        * writes & truncates and since we take care of writing back page cache,
-        * we are protected against page writeback as well.
-        */
-       inode_lock_shared(inode);
-       ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-                                          iocb->ki_pos + count - 1);
-       if (ret)
-               goto out_unlock;
-       ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-                                  iter, ext4_dio_get_block, NULL, NULL, 0);
- out_unlock:
-       inode_unlock_shared(inode);
-       return ret;
+       return true;
  }
  
- static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+                                  loff_t length, unsigned int flags,
+                                  struct iomap *iomap, struct iomap *srcmap)
  {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       size_t count = iov_iter_count(iter);
-       loff_t offset = iocb->ki_pos;
-       ssize_t ret;
+       int ret;
+       bool delalloc = false;
+       struct ext4_map_blocks map;
+       u8 blkbits = inode->i_blkbits;
  
- #ifdef CONFIG_FS_ENCRYPTION
-       if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
-               return 0;
- #endif
-       if (fsverity_active(inode))
-               return 0;
+       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+               return -EINVAL;
+       if (ext4_has_inline_data(inode)) {
+               ret = ext4_inline_data_iomap(inode, iomap);
+               if (ret != -EAGAIN) {
+                       if (ret == 0 && offset >= iomap->length)
+                               ret = -ENOENT;
+                       return ret;
+               }
+       }
  
        /*
-        * If we are doing data journalling we don't support O_DIRECT
+        * Calculate the first and last logical block respectively.
         */
-       if (ext4_should_journal_data(inode))
-               return 0;
+       map.m_lblk = offset >> blkbits;
+       map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+                         EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
  
-       /* Let buffer I/O handle the inline data case. */
-       if (ext4_has_inline_data(inode))
-               return 0;
+       ret = ext4_map_blocks(NULL, inode, &map, 0);
+       if (ret < 0)
+               return ret;
+       if (ret == 0)
+               delalloc = ext4_iomap_is_delalloc(inode, &map);
  
-       trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-       if (iov_iter_rw(iter) == READ)
-               ret = ext4_direct_IO_read(iocb, iter);
-       else
-               ret = ext4_direct_IO_write(iocb, iter);
-       trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
-       return ret;
+       ext4_set_iomap(inode, iomap, &map, offset, length);
+       if (delalloc && iomap->type == IOMAP_HOLE)
+               iomap->type = IOMAP_DELALLOC;
+       return 0;
  }
  
+ const struct iomap_ops ext4_iomap_report_ops = {
+       .iomap_begin = ext4_iomap_begin_report,
+ };
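Illustrative userspace view (not part of the diff): IOMAP_REPORT
requests served by ext4_iomap_begin_report() back interfaces such as
SEEK_HOLE/SEEK_DATA, with delalloc ranges reported as data:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            off_t data, hole;
            int fd;

            if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return 1;
            data = lseek(fd, 0, SEEK_DATA); /* start of first data region */
            hole = lseek(fd, 0, SEEK_HOLE); /* start of first hole, or EOF */
            printf("data at %lld, hole at %lld\n",
                   (long long)data, (long long)hole);
            close(fd);
            return 0;
    }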
  /*
   * Pages can be marked dirty completely asynchronously from ext4's journalling
   * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
@@@ -3910,7 -3574,7 +3574,7 @@@ static const struct address_space_opera
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
@@@ -3927,7 -3591,7 +3591,7 @@@ static const struct address_space_opera
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_journalled_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
  };
@@@ -3943,7 -3607,7 +3607,7 @@@ static const struct address_space_opera
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
@@@ -5450,11 -5114,15 +5114,15 @@@ static void ext4_wait_for_tail_page_com
  
        offset = inode->i_size & (PAGE_SIZE - 1);
        /*
-        * All buffers in the last page remain valid? Then there's nothing to
-        * do. We do the check mainly to optimize the common PAGE_SIZE ==
-        * blocksize case
+        * If the page is fully truncated, we don't need to wait for any commit
+        * (and we should not, as __ext4_journalled_invalidatepage() may strip
+        * all buffers from the page but keep the page dirty, which can then
+        * confuse e.g. concurrent ext4_writepage() seeing a dirty page without
+        * buffers). We also don't need to wait for any commit if all buffers in
+        * the page remain valid. This is most beneficial for the common case of
+        * blocksize == PAGE_SIZE.
         */
-       if (offset > PAGE_SIZE - i_blocksize(inode))
+       if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
                return;
        while (1) {
                page = find_lock_page(inode->i_mapping,
@@@ -5717,15 -5385,12 +5385,15 @@@ int ext4_getattr(const struct path *pat
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (flags & EXT4_NODUMP_FL)
                stat->attributes |= STATX_ATTR_NODUMP;
 +      if (flags & EXT4_VERITY_FL)
 +              stat->attributes |= STATX_ATTR_VERITY;
  
        stat->attributes_mask |= (STATX_ATTR_APPEND |
                                  STATX_ATTR_COMPRESSED |
                                  STATX_ATTR_ENCRYPTED |
                                  STATX_ATTR_IMMUTABLE |
 -                                STATX_ATTR_NODUMP);
 +                                STATX_ATTR_NODUMP |
 +                                STATX_ATTR_VERITY);
  
        generic_fillattr(inode, stat);
        return 0;
@@@ -5915,8 -5580,23 +5583,23 @@@ static int __ext4_expand_extra_isize(st
  {
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
+       unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
        int error;
  
+       /* this was checked at iget time, but double-check for good measure */
+       if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+           (ei->i_extra_isize & 3)) {
+               EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+                                ei->i_extra_isize,
+                                EXT4_INODE_SIZE(inode->i_sb));
+               return -EFSCORRUPTED;
+       }
+       if ((new_extra_isize < ei->i_extra_isize) ||
+           (new_extra_isize < 4) ||
+           (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+               return -EINVAL; /* Should never happen */
        raw_inode = ext4_raw_inode(iloc);
  
        header = IHDR(inode, raw_inode);
@@@ -5968,9 -5648,8 +5651,8 @@@ static int ext4_try_to_expand_extra_isi
         * If this is felt to be critical, then e2fsck should be run to
         * force a large enough s_min_extra_isize.
         */
-       if (ext4_handle_valid(handle) &&
-           jbd2_journal_extend(handle,
-                               EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+       if (ext4_journal_extend(handle,
+                               EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
                return -ENOSPC;
  
        if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
diff --combined fs/ext4/super.c
@@@ -1172,9 -1172,9 +1172,9 @@@ void ext4_clear_inode(struct inode *ino
  {
        invalidate_inode_buffers(inode);
        clear_inode(inode);
-       dquot_drop(inode);
        ext4_discard_preallocations(inode);
        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+       dquot_drop(inode);
        if (EXT4_I(inode)->jinode) {
                jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
                                               EXT4_I(inode)->jinode);
@@@ -1345,18 -1345,6 +1345,18 @@@ static bool ext4_dummy_context(struct i
        return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
  }
  
 +static bool ext4_has_stable_inodes(struct super_block *sb)
 +{
 +      return ext4_has_feature_stable_inodes(sb);
 +}
 +
 +static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
 +                                     int *ino_bits_ret, int *lblk_bits_ret)
 +{
 +      *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
 +      *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
 +}
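
fscrypt uses these two widths to decide whether inode numbers and logical block numbers both fit in 32 bits, so that the pair can share a 64-bit IV. A hedged sketch of that check (the caller shown is illustrative, not the fscrypt code itself):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* ext4 reports 8 * sizeof(__le32) == 32 for both quantities */
            int ino_bits  = 8 * (int)sizeof(uint32_t);
            int lblk_bits = 8 * (int)sizeof(uint32_t);

            if (ino_bits <= 32 && lblk_bits <= 32)
                    printf("64-bit (ino, lblk) IVs usable\n");
            return 0;
    }
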
 +
  static const struct fscrypt_operations ext4_cryptops = {
        .key_prefix             = "ext4:",
        .get_context            = ext4_get_context,
        .dummy_context          = ext4_dummy_context,
        .empty_dir              = ext4_empty_dir,
        .max_namelen            = EXT4_NAME_LEN,
 +      .has_stable_inodes      = ext4_has_stable_inodes,
 +      .get_ino_and_lblk_bits  = ext4_get_ino_and_lblk_bits,
  };
  #endif
  
@@@ -1388,7 -1374,6 +1388,6 @@@ static ssize_t ext4_quota_write(struct 
  static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                             unsigned int flags);
  static int ext4_enable_quotas(struct super_block *sb);
- static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
  
  static struct dquot **ext4_get_dquots(struct inode *inode)
  {
@@@ -1406,7 -1391,7 +1405,7 @@@ static const struct dquot_operations ex
        .destroy_dquot          = dquot_destroy,
        .get_projid             = ext4_get_projid,
        .get_inode_usage        = ext4_get_inode_usage,
-       .get_next_id            = ext4_get_next_id,
+       .get_next_id            = dquot_get_next_id,
  };
  
  static const struct quotactl_ops ext4_qctl_operations = {
@@@ -2065,7 -2050,7 +2064,7 @@@ static int parse_options(char *options
                         unsigned int *journal_ioprio,
                         int is_remount)
  {
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
        char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
        substring_t args[MAX_OPT_ARGS];
        int token;
                }
        }
  #endif
-       if (test_opt(sb, DIOREAD_NOLOCK)) {
-               int blocksize =
-                       BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
-               if (blocksize < PAGE_SIZE) {
-                       ext4_msg(sb, KERN_ERR, "can't mount with "
-                                "dioread_nolock if block size != PAGE_SIZE");
-                       return 0;
-               }
-       }
        return 1;
  }
  
@@@ -3569,12 -3544,15 +3558,15 @@@ static void ext4_clamp_want_extra_isize
  {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
+       unsigned def_extra_isize = sizeof(struct ext4_inode) -
+                                               EXT4_GOOD_OLD_INODE_SIZE;
  
-       /* determine the minimum size of new large inodes, if present */
-       if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
-           sbi->s_want_extra_isize == 0) {
-               sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-                                                    EXT4_GOOD_OLD_INODE_SIZE;
+       if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) {
+               sbi->s_want_extra_isize = 0;
+               return;
+       }
+       if (sbi->s_want_extra_isize < 4) {
+               sbi->s_want_extra_isize = def_extra_isize;
                if (ext4_has_feature_extra_isize(sb)) {
                        if (sbi->s_want_extra_isize <
                            le16_to_cpu(es->s_want_extra_isize))
                }
        }
        /* Check if enough inode space is available */
-       if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
-                                                       sbi->s_inode_size) {
-               sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-                                                      EXT4_GOOD_OLD_INODE_SIZE;
+       if ((sbi->s_want_extra_isize > sbi->s_inode_size) ||
+           (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+                                                       sbi->s_inode_size)) {
+               sbi->s_want_extra_isize = def_extra_isize;
                ext4_msg(sb, KERN_INFO,
                         "required extra inode space not available");
        }
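
Taken together, the function now falls back to the default extra_isize whenever the candidate value cannot fit in the on-disk inode. A simplified standalone sketch of that decision, ignoring the superblock s_want_extra_isize hints and assuming 256-byte inodes and a 32-byte default (both assumptions for illustration):

    #include <stdio.h>

    #define GOOD_OLD_INODE_SIZE 128 /* EXT4_GOOD_OLD_INODE_SIZE */

    /* Returns the clamped want_extra_isize for a given inode size. */
    static int clamp_want_extra_isize(int inode_size, int want)
    {
            int def = 32; /* assumed sizeof(struct ext4_inode) - 128 */

            if (inode_size == GOOD_OLD_INODE_SIZE)
                    return 0;          /* no extra space at all */
            if (want < 4)
                    want = def;        /* too small to be valid */
            if (GOOD_OLD_INODE_SIZE + want > inode_size)
                    want = def;        /* does not fit: fall back */
            return want;
    }

    int main(void)
    {
            printf("%d\n", clamp_want_extra_isize(128, 64));  /* 0  */
            printf("%d\n", clamp_want_extra_isize(256, 200)); /* 32 */
            printf("%d\n", clamp_want_extra_isize(256, 32));  /* 32 */
            return 0;
    }
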
@@@ -4453,13 -4431,6 +4445,6 @@@ no_journal
                }
        }
  
-       if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
-           (blocksize != PAGE_SIZE)) {
-               ext4_msg(sb, KERN_ERR,
-                        "Unsupported blocksize for fs encryption");
-               goto failed_mount_wq;
-       }
        if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
                ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
                goto failed_mount_wq;
@@@ -6033,18 -6004,6 +6018,6 @@@ out
        }
        return len;
  }
- static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
- {
-       const struct quota_format_ops   *ops;
-       if (!sb_has_quota_loaded(sb, qid->type))
-               return -ESRCH;
-       ops = sb_dqopt(sb)->ops[qid->type];
-       if (!ops || !ops->get_next_id)
-               return -ENOSYS;
-       return dquot_get_next_id(sb, qid);
- }
  #endif
  
  static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
diff --combined fs/jbd2/transaction.c
@@@ -63,6 -63,28 +63,28 @@@ void jbd2_journal_free_transaction(tran
  }
  
  /*
+  * Base number of descriptor blocks we reserve for each transaction.
+  */
+ static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+ {
+       int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+       int tags_per_block;
+       /* Subtract UUID */
+       tag_space -= 16;
+       if (jbd2_journal_has_csum_v2or3(journal))
+               tag_space -= sizeof(struct jbd2_journal_block_tail);
+       /* Commit code leaves a slack space of 16 bytes at the end of the block */
+       tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+       /*
+        * Revoke descriptors are accounted separately, so we only need to
+        * reserve space for the commit block and the normal transaction
+        * descriptor blocks.
+        */
+       return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
+                               tags_per_block);
+ }
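
For illustration, the reservation above with concrete numbers; the sizes here (4096-byte journal blocks, a 12-byte journal_header_t, 8-byte tags, no csum tail) are assumptions for the sketch, not values asserted by this patch:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            int blocksize  = 4096; /* assumed journal block size */
            int header     = 12;   /* assumed sizeof(journal_header_t) */
            int tag_bytes  = 8;    /* assumed journal_tag_bytes() */
            int max_tx_buf = 8192; /* assumed j_max_transaction_buffers */

            int tag_space = blocksize - header - 16;           /* minus UUID */
            int tags_per_block = (tag_space - 16) / tag_bytes; /* 16B slack */

            /* one commit block plus enough descriptor blocks for the tags */
            printf("reserve %d blocks\n",
                   1 + DIV_ROUND_UP(max_tx_buf, tags_per_block));
            return 0;
    }

With these numbers that is 506 tags per descriptor block and 18 reserved blocks per transaction.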
+ /*
   * jbd2_get_transaction: obtain a new transaction_t object.
   *
   * Simply initialise a new transaction. Initialize it in
@@@ -88,7 -110,9 +110,9 @@@ static void jbd2_get_transaction(journa
        spin_lock_init(&transaction->t_handle_lock);
        atomic_set(&transaction->t_updates, 0);
        atomic_set(&transaction->t_outstanding_credits,
+                  jbd2_descriptor_blocks_per_trans(journal) +
                   atomic_read(&journal->j_reserved_credits));
+       atomic_set(&transaction->t_outstanding_revokes, 0);
        atomic_set(&transaction->t_handle_count, 0);
        INIT_LIST_HEAD(&transaction->t_inode_list);
        INIT_LIST_HEAD(&transaction->t_private_list);
@@@ -258,12 -282,13 +282,13 @@@ static int add_transaction_credits(jour
         * *before* starting to dirty potentially checkpointed buffers
         * in the new transaction.
         */
-       if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+       if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
                atomic_sub(total, &t->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                write_lock(&journal->j_state_lock);
-               if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+               if (jbd2_log_space_left(journal) <
+                                       journal->j_max_transaction_buffers)
                        __jbd2_log_wait_for_space(journal);
                write_unlock(&journal->j_state_lock);
                return 1;
@@@ -299,12 -324,12 +324,12 @@@ static int start_this_handle(journal_t 
                             gfp_t gfp_mask)
  {
        transaction_t   *transaction, *new_transaction = NULL;
-       int             blocks = handle->h_buffer_credits;
+       int             blocks = handle->h_total_credits;
        int             rsv_blocks = 0;
        unsigned long ts = jiffies;
  
        if (handle->h_rsv_handle)
-               rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+               rsv_blocks = handle->h_rsv_handle->h_total_credits;
  
        /*
         * Limit the number of reserved credits to 1/2 of maximum transaction
@@@ -405,6 -430,7 +430,7 @@@ repeat
        update_t_max_wait(transaction, ts);
        handle->h_transaction = transaction;
        handle->h_requested_credits = blocks;
+       handle->h_revoke_credits_requested = handle->h_revoke_credits;
        handle->h_start_jiffies = jiffies;
        atomic_inc(&transaction->t_updates);
        atomic_inc(&transaction->t_handle_count);
@@@ -431,15 -457,15 +457,15 @@@ static handle_t *new_handle(int nblocks
        handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
        if (!handle)
                return NULL;
-       handle->h_buffer_credits = nblocks;
+       handle->h_total_credits = nblocks;
        handle->h_ref = 1;
  
        return handle;
  }
  
  handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
-                             gfp_t gfp_mask, unsigned int type,
-                             unsigned int line_no)
+                             int revoke_records, gfp_t gfp_mask,
+                             unsigned int type, unsigned int line_no)
  {
        handle_t *handle = journal_current_handle();
        int err;
                return handle;
        }
  
+       nblocks += DIV_ROUND_UP(revoke_records,
+                               journal->j_revoke_records_per_block);
        handle = new_handle(nblocks);
        if (!handle)
                return ERR_PTR(-ENOMEM);
                rsv_handle->h_journal = journal;
                handle->h_rsv_handle = rsv_handle;
        }
+       handle->h_revoke_credits = revoke_records;
  
        err = start_this_handle(journal, handle, gfp_mask);
        if (err < 0) {
@@@ -508,16 -537,21 +537,21 @@@ EXPORT_SYMBOL(jbd2__journal_start)
   */
  handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
  {
-       return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
+       return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
  }
  EXPORT_SYMBOL(jbd2_journal_start);
  
- void jbd2_journal_free_reserved(handle_t *handle)
+ static void __jbd2_journal_unreserve_handle(handle_t *handle)
  {
        journal_t *journal = handle->h_journal;
  
        WARN_ON(!handle->h_reserved);
-       sub_reserved_credits(journal, handle->h_buffer_credits);
+       sub_reserved_credits(journal, handle->h_total_credits);
+ }
+ void jbd2_journal_free_reserved(handle_t *handle)
+ {
+       __jbd2_journal_unreserve_handle(handle);
        jbd2_free_handle(handle);
  }
  EXPORT_SYMBOL(jbd2_journal_free_reserved);
@@@ -571,7 -605,7 +605,7 @@@ int jbd2_journal_start_reserved(handle_
        handle->h_line_no = line_no;
        trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                handle->h_transaction->t_tid, type,
-                               line_no, handle->h_buffer_credits);
+                               line_no, handle->h_total_credits);
        return 0;
  }
  EXPORT_SYMBOL(jbd2_journal_start_reserved);
   * int jbd2_journal_extend() - extend buffer credits.
   * @handle:  handle to 'extend'
   * @nblocks: nr blocks to try to extend by.
+  * @revoke_records: number of revoke records to try to extend by.
   *
   * Some transactions, such as large extends and truncates, can be done
   * atomically all at once or in several stages.  The operation requests
   * return code < 0 implies an error
   * return code > 0 implies normal transaction-full status.
   */
- int jbd2_journal_extend(handle_t *handle, int nblocks)
+ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
  {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
                goto error_out;
        }
  
+       nblocks += DIV_ROUND_UP(
+                       handle->h_revoke_credits_requested + revoke_records,
+                       journal->j_revoke_records_per_block) -
+               DIV_ROUND_UP(
+                       handle->h_revoke_credits_requested,
+                       journal->j_revoke_records_per_block);
        spin_lock(&transaction->t_handle_lock);
        wanted = atomic_add_return(nblocks,
                                   &transaction->t_outstanding_credits);
                goto unlock;
        }
  
-       if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
-           jbd2_log_space_left(journal)) {
-               jbd_debug(3, "denied handle %p %d blocks: "
-                         "insufficient log space\n", handle, nblocks);
-               atomic_sub(nblocks, &transaction->t_outstanding_credits);
-               goto unlock;
-       }
        trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
                                 transaction->t_tid,
                                 handle->h_type, handle->h_line_no,
-                                handle->h_buffer_credits,
+                                handle->h_total_credits,
                                 nblocks);
  
-       handle->h_buffer_credits += nblocks;
+       handle->h_total_credits += nblocks;
        handle->h_requested_credits += nblocks;
+       handle->h_revoke_credits += revoke_records;
+       handle->h_revoke_credits_requested += revoke_records;
        result = 0;
  
        jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@@ -655,11 -690,55 +690,55 @@@ error_out
        return result;
  }
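
The DIV_ROUND_UP difference above charges (or refunds) descriptor blocks only when the revoke-record total crosses a block boundary; stop_this_handle() below uses the same pattern in reverse. A standalone sketch of that delta, where the per-block value is an assumption for illustration:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* Extra descriptor blocks needed when growing the revoke reservation. */
    static int revoke_block_delta(int requested, int extra, int per_block)
    {
            return DIV_ROUND_UP(requested + extra, per_block) -
                   DIV_ROUND_UP(requested, per_block);
    }

    int main(void)
    {
            int per_block = 500; /* assumed j_revoke_records_per_block */

            /* 10 -> 20 records: still a single revoke descriptor block */
            printf("%d\n", revoke_block_delta(10, 10, per_block));  /* 0 */
            /* 490 -> 520 records: spills into a second block */
            printf("%d\n", revoke_block_delta(490, 30, per_block)); /* 1 */
            return 0;
    }
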
  
 -      rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
+ static void stop_this_handle(handle_t *handle)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       int revokes;
+       J_ASSERT(journal_current_handle() == handle);
+       J_ASSERT(atomic_read(&transaction->t_updates) > 0);
+       current->journal_info = NULL;
+       /*
+        * Subtract necessary revoke descriptor blocks from handle credits. We
+        * take care to account only for revoke descriptor blocks the
+        * transaction will really need, as large sequences of transactions
+        * with small numbers of revokes are relatively common.
+        */
+       revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
+       if (revokes) {
+               int t_revokes, revoke_descriptors;
+               int rr_per_blk = journal->j_revoke_records_per_block;
+               WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
+                               > handle->h_total_credits);
+               t_revokes = atomic_add_return(revokes,
+                               &transaction->t_outstanding_revokes);
+               revoke_descriptors =
+                       DIV_ROUND_UP(t_revokes, rr_per_blk) -
+                       DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
+               handle->h_total_credits -= revoke_descriptors;
+       }
+       atomic_sub(handle->h_total_credits,
+                  &transaction->t_outstanding_credits);
+       if (handle->h_rsv_handle)
+               __jbd2_journal_unreserve_handle(handle->h_rsv_handle);
+       if (atomic_dec_and_test(&transaction->t_updates))
+               wake_up(&journal->j_wait_updates);
++      rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+       /*
+        * Scope of the GFP_NOFS context is over here and so we can restore the
+        * original alloc context.
+        */
+       memalloc_nofs_restore(handle->saved_alloc_context);
+ }
  
  /**
   * int jbd2_journal_restart() - restart a handle .
   * @handle:  handle to restart
   * @nblocks: nr credits requested
+  * @revoke_records: number of revoke record credits requested
   * @gfp_mask: memory allocation flags (for start_this_handle)
   *
   * Restart a handle for a multi-transaction filesystem
   * credits. We preserve reserved handle if there's any attached to the
   * passed in handle.
   */
- int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
+ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
+                         gfp_t gfp_mask)
  {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        tid_t           tid;
-       int             need_to_start, ret;
+       int             need_to_start;
+       int             ret;
  
        /* If we've had an abort of any type, don't even think about
         * actually doing the restart! */
        if (is_handle_aborted(handle))
                return 0;
        journal = transaction->t_journal;
+       tid = transaction->t_tid;
  
        /*
         * First unlink the handle from its current transaction, and start the
         * commit on that.
         */
-       J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-       J_ASSERT(journal_current_handle() == handle);
-       read_lock(&journal->j_state_lock);
-       spin_lock(&transaction->t_handle_lock);
-       atomic_sub(handle->h_buffer_credits,
-                  &transaction->t_outstanding_credits);
-       if (handle->h_rsv_handle) {
-               sub_reserved_credits(journal,
-                                    handle->h_rsv_handle->h_buffer_credits);
-       }
-       if (atomic_dec_and_test(&transaction->t_updates))
-               wake_up(&journal->j_wait_updates);
-       tid = transaction->t_tid;
-       spin_unlock(&transaction->t_handle_lock);
+       jbd_debug(2, "restarting handle %p\n", handle);
+       stop_this_handle(handle);
        handle->h_transaction = NULL;
-       current->journal_info = NULL;
  
-       jbd_debug(2, "restarting handle %p\n", handle);
+       /*
+        * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
+        * get rid of pointless j_state_lock traffic like this.
+        */
+       read_lock(&journal->j_state_lock);
        need_to_start = !tid_geq(journal->j_commit_request, tid);
        read_unlock(&journal->j_state_lock);
        if (need_to_start)
                jbd2_log_start_commit(journal, tid);
-       rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
-       handle->h_buffer_credits = nblocks;
-       /*
-        * Restore the original nofs context because the journal restart
-        * is basically the same thing as journal stop and start.
-        * start_this_handle will start a new nofs context.
-        */
-       memalloc_nofs_restore(handle->saved_alloc_context);
+       handle->h_total_credits = nblocks +
+               DIV_ROUND_UP(revoke_records,
+                            journal->j_revoke_records_per_block);
+       handle->h_revoke_credits = revoke_records;
        ret = start_this_handle(journal, handle, gfp_mask);
+       trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
+                                ret ? 0 : handle->h_transaction->t_tid,
+                                handle->h_type, handle->h_line_no,
+                                handle->h_total_credits);
        return ret;
  }
  EXPORT_SYMBOL(jbd2__journal_restart);
  
  int jbd2_journal_restart(handle_t *handle, int nblocks)
  {
-       return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+       return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
  }
  EXPORT_SYMBOL(jbd2_journal_restart);
  
@@@ -879,7 -950,7 +950,7 @@@ repeat
  
        start_lock = jiffies;
        lock_buffer(bh);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
  
        /* If it takes too long to lock the buffer, trace it */
        time_lock = jbd2_time_diff(start_lock, jiffies);
  
        error = -EROFS;
        if (is_handle_aborted(handle)) {
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
                goto out;
        }
        error = 0;
         */
        if (buffer_shadow(bh)) {
                JBUFFER_TRACE(jh, "on shadow: sleep");
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
                wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }
                JBUFFER_TRACE(jh, "generate frozen data");
                if (!frozen_buffer) {
                        JBUFFER_TRACE(jh, "allocate memory for buffer");
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                        frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
                                                   GFP_NOFS | __GFP_NOFAIL);
                        goto repeat;
@@@ -1033,7 -1104,7 +1104,7 @@@ attach_next
        jh->b_next_transaction = transaction;
  
  done:
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
  
        /*
         * If we are about to journal a buffer, then any revoke pending on it is
@@@ -1172,7 -1243,7 +1243,7 @@@ int jbd2_journal_get_create_access(hand
         * that case: the transaction must have deleted the buffer for it to be
         * reused here.
         */
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
                jh->b_transaction == NULL ||
                (jh->b_transaction == journal->j_committing_transaction &&
                jh->b_next_transaction = transaction;
                spin_unlock(&journal->j_list_lock);
        }
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
  
        /*
         * akpm: I added this.  ext3_alloc_branch can pick up new indirect
@@@ -1275,13 -1346,13 +1346,13 @@@ repeat
                committed_data = jbd2_alloc(jh2bh(jh)->b_size,
                                            GFP_NOFS|__GFP_NOFAIL);
  
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        if (!jh->b_committed_data) {
                /* Copy out the current buffer contents into the
                 * preserved, committed copy. */
                JBUFFER_TRACE(jh, "generate b_committed data");
                if (!committed_data) {
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                        goto repeat;
                }
  
                committed_data = NULL;
                memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
        }
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
  out:
        jbd2_journal_put_journal_head(jh);
        if (unlikely(committed_data))
@@@ -1390,16 -1461,16 +1461,16 @@@ int jbd2_journal_dirty_metadata(handle_
         */
        if (jh->b_transaction != transaction &&
            jh->b_next_transaction != transaction) {
-               jbd_lock_bh_state(bh);
+               spin_lock(&jh->b_state_lock);
                J_ASSERT_JH(jh, jh->b_transaction == transaction ||
                                jh->b_next_transaction == transaction);
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
        }
        if (jh->b_modified == 1) {
                /* If it's in our transaction it must be in BJ_Metadata list. */
                if (jh->b_transaction == transaction &&
                    jh->b_jlist != BJ_Metadata) {
-                       jbd_lock_bh_state(bh);
+                       spin_lock(&jh->b_state_lock);
                        if (jh->b_transaction == transaction &&
                            jh->b_jlist != BJ_Metadata)
                                pr_err("JBD2: assertion failure: h_type=%u "
                                       jh->b_jlist);
                        J_ASSERT_JH(jh, jh->b_transaction != transaction ||
                                        jh->b_jlist == BJ_Metadata);
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                }
                goto out;
        }
  
        journal = transaction->t_journal;
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
  
        if (jh->b_modified == 0) {
                /*
                 * of the transaction. This needs to be done
                 * once a transaction -bzzz
                 */
-               if (handle->h_buffer_credits <= 0) {
+               if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
                        ret = -ENOSPC;
                        goto out_unlock_bh;
                }
                jh->b_modified = 1;
-               handle->h_buffer_credits--;
+               handle->h_total_credits--;
        }
  
        /*
        __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
        spin_unlock(&journal->j_list_lock);
  out_unlock_bh:
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
  out:
        JBUFFER_TRACE(jh, "exit");
        return ret;
@@@ -1539,18 -1610,20 +1610,20 @@@ int jbd2_journal_forget (handle_t *hand
  
        BUFFER_TRACE(bh, "entry");
  
-       jbd_lock_bh_state(bh);
+       jh = jbd2_journal_grab_journal_head(bh);
+       if (!jh) {
+               __bforget(bh);
+               return 0;
+       }
  
-       if (!buffer_jbd(bh))
-               goto not_jbd;
-       jh = bh2jh(bh);
+       spin_lock(&jh->b_state_lock);
  
        /* Critical error: attempting to delete a bitmap buffer, maybe?
         * Don't do any jbd operations, and return an error. */
        if (!J_EXPECT_JH(jh, !jh->b_committed_data,
                         "inconsistent data on disk")) {
                err = -EIO;
-               goto not_jbd;
+               goto drop;
        }
  
        /* keep track of whether or not this transaction modified us */
                        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
                        __jbd2_journal_unfile_buffer(jh);
-                       if (!buffer_jbd(bh)) {
-                               spin_unlock(&journal->j_list_lock);
-                               goto not_jbd;
-                       }
+                       jbd2_journal_put_journal_head(jh);
                }
                spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction) {
                if (!jh->b_cp_transaction) {
                        JBUFFER_TRACE(jh, "belongs to none transaction");
                        spin_unlock(&journal->j_list_lock);
-                       goto not_jbd;
+                       goto drop;
                }
  
                /*
                if (!buffer_dirty(bh)) {
                        __jbd2_journal_remove_checkpoint(jh);
                        spin_unlock(&journal->j_list_lock);
-                       goto not_jbd;
+                       goto drop;
                }
  
                /*
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                spin_unlock(&journal->j_list_lock);
        }
-       jbd_unlock_bh_state(bh);
-       __brelse(bh);
  drop:
+       __brelse(bh);
+       spin_unlock(&jh->b_state_lock);
+       jbd2_journal_put_journal_head(jh);
        if (drop_reserve) {
                /* no need to reserve log space for this block -bzzz */
-               handle->h_buffer_credits++;
+               handle->h_total_credits++;
        }
        return err;
- not_jbd:
-       jbd_unlock_bh_state(bh);
-       __bforget(bh);
-       goto drop;
  }
  
  /**
@@@ -1706,45 -1771,34 +1771,34 @@@ int jbd2_journal_stop(handle_t *handle
        tid_t tid;
        pid_t pid;
  
+       if (--handle->h_ref > 0) {
+               jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+                                                handle->h_ref);
+               if (is_handle_aborted(handle))
+                       return -EIO;
+               return 0;
+       }
        if (!transaction) {
                /*
-                * Handle is already detached from the transaction so
-                * there is nothing to do other than decrease a refcount,
-                * or free the handle if refcount drops to zero
+                * Handle is already detached from the transaction so there is
+                * nothing to do other than free the handle.
                 */
-               if (--handle->h_ref > 0) {
-                       jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
-                                                        handle->h_ref);
-                       return err;
-               } else {
-                       if (handle->h_rsv_handle)
-                               jbd2_free_handle(handle->h_rsv_handle);
-                       goto free_and_exit;
-               }
+               memalloc_nofs_restore(handle->saved_alloc_context);
+               goto free_and_exit;
        }
        journal = transaction->t_journal;
-       J_ASSERT(journal_current_handle() == handle);
+       tid = transaction->t_tid;
  
        if (is_handle_aborted(handle))
                err = -EIO;
-       else
-               J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-       if (--handle->h_ref > 0) {
-               jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
-                         handle->h_ref);
-               return err;
-       }
  
        jbd_debug(4, "Handle %p going down\n", handle);
        trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
-                               transaction->t_tid,
-                               handle->h_type, handle->h_line_no,
+                               tid, handle->h_type, handle->h_line_no,
                                jiffies - handle->h_start_jiffies,
                                handle->h_sync, handle->h_requested_credits,
                                (handle->h_requested_credits -
-                                handle->h_buffer_credits));
+                                handle->h_total_credits));
  
        /*
         * Implement synchronous transaction batching.  If the handle
  
        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;
-       current->journal_info = NULL;
-       atomic_sub(handle->h_buffer_credits,
-                  &transaction->t_outstanding_credits);
  
        /*
         * If the handle is marked SYNC, we need to set another commit
-        * going!  We also want to force a commit if the current
-        * transaction is occupying too much of the log, or if the
-        * transaction is too old now.
+        * going!  We also want to force a commit if the transaction is too
+        * old now.
         */
        if (handle->h_sync ||
-           (atomic_read(&transaction->t_outstanding_credits) >
-            journal->j_max_transaction_buffers) ||
            time_after_eq(jiffies, transaction->t_expires)) {
                /* Do this even for aborted journals: an abort still
                 * completes the commit thread, it just doesn't write
                jbd_debug(2, "transaction too old, requesting commit for "
                                        "handle %p\n", handle);
                /* This is non-blocking */
-               jbd2_log_start_commit(journal, transaction->t_tid);
+               jbd2_log_start_commit(journal, tid);
  
                /*
                 * Special case: JBD2_SYNC synchronous updates require us
        }
  
        /*
-        * Once we drop t_updates, if it goes to zero the transaction
-        * could start committing on us and eventually disappear.  So
-        * once we do this, we must not dereference transaction
-        * pointer again.
+        * Once stop_this_handle() drops t_updates, the transaction could start
+        * committing on us and eventually disappear.  So we must not
+        * dereference transaction pointer again after calling
+        * stop_this_handle().
         */
-       tid = transaction->t_tid;
-       if (atomic_dec_and_test(&transaction->t_updates)) {
-               wake_up(&journal->j_wait_updates);
-               if (journal->j_barrier_count)
-                       wake_up(&journal->j_wait_transaction_locked);
-       }
-       rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+       stop_this_handle(handle);
  
        if (wait_for_commit)
                err = jbd2_log_wait_commit(journal, tid);
  
-       if (handle->h_rsv_handle)
-               jbd2_journal_free_reserved(handle->h_rsv_handle);
  free_and_exit:
-       /*
-        * Scope of the GFP_NOFS context is over here and so we can restore the
-        * original alloc context.
-        */
-       memalloc_nofs_restore(handle->saved_alloc_context);
+       if (handle->h_rsv_handle)
+               jbd2_free_handle(handle->h_rsv_handle);
        jbd2_free_handle(handle);
        return err;
  }
   *
   * j_list_lock is held.
   *
-  * jbd_lock_bh_state(jh2bh(jh)) is held.
+  * jh->b_state_lock is held.
   */
  
  static inline void
@@@ -1902,7 -1938,7 +1938,7 @@@ __blist_add_buffer(struct journal_head 
   *
   * Called with j_list_lock held, and the journal may not be locked.
   *
-  * jbd_lock_bh_state(jh2bh(jh)) is held.
+  * jh->b_state_lock is held.
   */
  
  static inline void
@@@ -1934,7 -1970,7 +1970,7 @@@ static void __jbd2_journal_temp_unlink_
        transaction_t *transaction;
        struct buffer_head *bh = jh2bh(jh);
  
-       J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+       lockdep_assert_held(&jh->b_state_lock);
        transaction = jh->b_transaction;
        if (transaction)
                assert_spin_locked(&transaction->t_journal->j_list_lock);
  }
  
  /*
-  * Remove buffer from all transactions.
+  * Remove buffer from all transactions. The caller is responsible for dropping
+  * the jh reference that belonged to the transaction.
   *
   * Called with bh_state lock and j_list_lock
-  *
-  * jh and bh may be already freed when this function returns.
   */
  static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
  {
        __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = NULL;
-       jbd2_journal_put_journal_head(jh);
  }
  
  void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
  
        /* Get reference so that buffer cannot be freed before we unlock it */
        get_bh(bh);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_unfile_buffer(jh);
        spin_unlock(&journal->j_list_lock);
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
+       jbd2_journal_put_journal_head(jh);
        __brelse(bh);
  }
  
  /*
   * Called from jbd2_journal_try_to_free_buffers().
   *
-  * Called under jbd_lock_bh_state(bh)
+  * Called under jh->b_state_lock
   */
  static void
  __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
@@@ -2088,10 -2123,10 +2123,10 @@@ int jbd2_journal_try_to_free_buffers(jo
                if (!jh)
                        continue;
  
-               jbd_lock_bh_state(bh);
+               spin_lock(&jh->b_state_lock);
                __journal_try_to_free_buffer(journal, bh);
+               spin_unlock(&jh->b_state_lock);
                jbd2_journal_put_journal_head(jh);
-               jbd_unlock_bh_state(bh);
                if (buffer_jbd(bh))
                        goto busy;
        } while ((bh = bh->b_this_page) != head);
@@@ -2112,7 -2147,7 +2147,7 @@@ busy
   *
   * Called under j_list_lock.
   *
-  * Called under jbd_lock_bh_state(bh).
+  * Called under jh->b_state_lock.
   */
  static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
  {
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
                __jbd2_journal_unfile_buffer(jh);
+               jbd2_journal_put_journal_head(jh);
        }
        return may_free;
  }
@@@ -2199,18 -2235,15 +2235,15 @@@ static int journal_unmap_buffer(journal
         * holding the page lock. --sct
         */
  
-       if (!buffer_jbd(bh))
+       jh = jbd2_journal_grab_journal_head(bh);
+       if (!jh)
                goto zap_buffer_unlocked;
  
        /* OK, we have data buffer in journaled mode */
        write_lock(&journal->j_state_lock);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
  
-       jh = jbd2_journal_grab_journal_head(bh);
-       if (!jh)
-               goto zap_buffer_no_jh;
        /*
         * We cannot remove the buffer from checkpoint lists until the
         * transaction adding inode to orphan list (let's call it T)
                 * for commit and try again.
                 */
                if (partial_page) {
-                       jbd2_journal_put_journal_head(jh);
                        spin_unlock(&journal->j_list_lock);
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                        write_unlock(&journal->j_state_lock);
+                       jbd2_journal_put_journal_head(jh);
                        return -EBUSY;
                }
                /*
                set_buffer_freed(bh);
                if (journal->j_running_transaction && buffer_jbddirty(bh))
                        jh->b_next_transaction = journal->j_running_transaction;
-               jbd2_journal_put_journal_head(jh);
                spin_unlock(&journal->j_list_lock);
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
                write_unlock(&journal->j_state_lock);
+               jbd2_journal_put_journal_head(jh);
                return 0;
        } else {
                /* Good, the buffer belongs to the running transaction.
@@@ -2331,11 -2364,10 +2364,10 @@@ zap_buffer
         * here.
         */
        jh->b_modified = 0;
-       jbd2_journal_put_journal_head(jh);
- zap_buffer_no_jh:
        spin_unlock(&journal->j_list_lock);
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
        write_unlock(&journal->j_state_lock);
+       jbd2_journal_put_journal_head(jh);
  zap_buffer_unlocked:
        clear_buffer_dirty(bh);
        J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@@ -2422,7 -2454,7 +2454,7 @@@ void __jbd2_journal_file_buffer(struct 
        int was_dirty = 0;
        struct buffer_head *bh = jh2bh(jh);
  
-       J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+       lockdep_assert_held(&jh->b_state_lock);
        assert_spin_locked(&transaction->t_journal->j_list_lock);
  
        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
  void jbd2_journal_file_buffer(struct journal_head *jh,
                                transaction_t *transaction, int jlist)
  {
-       jbd_lock_bh_state(jh2bh(jh));
+       spin_lock(&jh->b_state_lock);
        spin_lock(&transaction->t_journal->j_list_lock);
        __jbd2_journal_file_buffer(jh, transaction, jlist);
        spin_unlock(&transaction->t_journal->j_list_lock);
-       jbd_unlock_bh_state(jh2bh(jh));
+       spin_unlock(&jh->b_state_lock);
  }
  
  /*
   * buffer on that transaction's metadata list.
   *
   * Called under j_list_lock
-  * Called under jbd_lock_bh_state(jh2bh(jh))
+  * Called under jh->b_state_lock
   *
-  * jh and bh may be already free when this function returns
+  * When this function returns true, there's no next transaction to refile to
+  * and the caller has to drop the jh reference through
+  * jbd2_journal_put_journal_head().
   */
- void __jbd2_journal_refile_buffer(struct journal_head *jh)
+ bool __jbd2_journal_refile_buffer(struct journal_head *jh)
  {
        int was_dirty, jlist;
        struct buffer_head *bh = jh2bh(jh);
  
-       J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+       lockdep_assert_held(&jh->b_state_lock);
        if (jh->b_transaction)
                assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
  
        /* If the buffer is now unused, just drop it. */
        if (jh->b_next_transaction == NULL) {
                __jbd2_journal_unfile_buffer(jh);
-               return;
+               return true;
        }
  
        /*
  
        if (was_dirty)
                set_buffer_jbddirty(bh);
+       return false;
  }
  
  /*
   */
  void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
  {
-       struct buffer_head *bh = jh2bh(jh);
+       bool drop;
  
-       /* Get reference so that buffer cannot be freed before we unlock it */
-       get_bh(bh);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
-       __jbd2_journal_refile_buffer(jh);
-       jbd_unlock_bh_state(bh);
+       drop = __jbd2_journal_refile_buffer(jh);
+       spin_unlock(&jh->b_state_lock);
        spin_unlock(&journal->j_list_lock);
-       __brelse(bh);
+       if (drop)
+               jbd2_journal_put_journal_head(jh);
  }
  
  /*
diff --combined fs/ocfs2/journal.c
@@@ -217,8 -217,7 +217,8 @@@ void ocfs2_recovery_exit(struct ocfs2_s
        /* At this point, we know that no more recovery threads can be
         * launched, so wait for any recovery completion work to
         * complete. */
 -      flush_workqueue(osb->ocfs2_wq);
 +      if (osb->ocfs2_wq)
 +              flush_workqueue(osb->ocfs2_wq);
  
        /*
         * Now that recovery is shut down, and the osb is about to be
@@@ -420,14 -419,14 +420,14 @@@ int ocfs2_extend_trans(handle_t *handle
        if (!nblocks)
                return 0;
  
-       old_nblocks = handle->h_buffer_credits;
+       old_nblocks = jbd2_handle_buffer_credits(handle);
  
        trace_ocfs2_extend_trans(old_nblocks, nblocks);
  
  #ifdef CONFIG_OCFS2_DEBUG_FS
        status = 1;
  #else
-       status = jbd2_journal_extend(handle, nblocks);
+       status = jbd2_journal_extend(handle, nblocks, 0);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@@ -461,13 -460,13 +461,13 @@@ int ocfs2_allocate_extend_trans(handle_
  
        BUG_ON(!handle);
  
-       old_nblks = handle->h_buffer_credits;
+       old_nblks = jbd2_handle_buffer_credits(handle);
        trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
  
        if (old_nblks < thresh)
                return 0;
  
-       status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
+       status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
diff --combined include/linux/jbd2.h
@@@ -313,7 -313,6 +313,6 @@@ enum jbd_state_bits 
        BH_Revoked,             /* Has been revoked from the log */
        BH_RevokeValid,         /* Revoked flag is valid */
        BH_JBDDirty,            /* Is dirty but journaled */
-       BH_State,               /* Pins most journal_head state */
        BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
        BH_Shadow,              /* IO on shadow buffer is running */
        BH_Verified,            /* Metadata block has been verified ok */
@@@ -342,26 -341,6 +341,6 @@@ static inline struct journal_head *bh2j
        return bh->b_private;
  }
  
- static inline void jbd_lock_bh_state(struct buffer_head *bh)
- {
-       bit_spin_lock(BH_State, &bh->b_state);
- }
- static inline int jbd_trylock_bh_state(struct buffer_head *bh)
- {
-       return bit_spin_trylock(BH_State, &bh->b_state);
- }
- static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
- {
-       return bit_spin_is_locked(BH_State, &bh->b_state);
- }
- static inline void jbd_unlock_bh_state(struct buffer_head *bh)
- {
-       bit_spin_unlock(BH_State, &bh->b_state);
- }
  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
  {
        bit_spin_lock(BH_JournalHead, &bh->b_state);
@@@ -477,7 -456,9 +456,9 @@@ struct jbd2_revoke_table_s
   * @h_transaction: Which compound transaction is this update a part of?
   * @h_journal: Which journal handle belongs to - used iff h_reserved set.
   * @h_rsv_handle: Handle reserved for finishing the logical operation.
-  * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
+  * @h_total_credits: Number of remaining buffers we are allowed to add to
+  *    journal. These are dirty buffers and revoke descriptor blocks.
+  * @h_revoke_credits: Number of remaining revoke records available for handle
   * @h_ref: Reference count on this handle.
   * @h_err: Field for caller's use to track errors through large fs operations.
   * @h_sync: Flag for sync-on-close.
   * @h_type: For handle statistics.
   * @h_line_no: For handle statistics.
   * @h_start_jiffies: Handle Start time.
-  * @h_requested_credits: Holds @h_buffer_credits after handle is started.
+  * @h_requested_credits: Holds @h_total_credits after handle is started.
+  * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started.
   * @saved_alloc_context: Saved context while transaction is open.
   **/
  
@@@ -504,7 -486,9 +486,9 @@@ struct jbd2_journal_handl
        };
  
        handle_t                *h_rsv_handle;
-       int                     h_buffer_credits;
+       int                     h_total_credits;
+       int                     h_revoke_credits;
+       int                     h_revoke_credits_requested;
        int                     h_ref;
        int                     h_err;
  
@@@ -556,9 -540,9 +540,9 @@@ struct transaction_chp_stats_s 
   *      ->jbd_lock_bh_journal_head()  (This is "innermost")
   *
   *    j_state_lock
-  *    ->jbd_lock_bh_state()
+  *    ->b_state_lock
   *
-  *    jbd_lock_bh_state()
+  *    b_state_lock
   *    ->j_list_lock
   *
   *    j_state_lock
@@@ -681,12 -665,25 +665,25 @@@ struct transaction_
        atomic_t                t_updates;
  
        /*
-        * Number of buffers reserved for use by all handles in this transaction
-        * handle but not yet modified. [none]
+        * Number of blocks reserved for this transaction in the journal.
+        * This includes all credits reserved when starting transaction
+        * handles as well as all journal descriptor blocks needed for this
+        * transaction. [none]
         */
        atomic_t                t_outstanding_credits;
  
        /*
+        * Number of revoke records for this transaction added by already
+        * stopped handles. [none]
+        */
+       atomic_t                t_outstanding_revokes;
+       /*
+        * How many handles used this transaction? [none]
+        */
+       atomic_t                t_handle_count;
+       /*
         * Forward and backward links for the circular list of all transactions
         * awaiting checkpoint. [j_list_lock]
         */
        ktime_t                 t_start_time;
  
        /*
-        * How many handles used this transaction? [none]
-        */
-       atomic_t                t_handle_count;
-       /*
         * This transaction is being forced and some process is
         * waiting for it to finish.
         */
@@@ -1025,6 -1017,13 +1017,13 @@@ struct journal_
        int                     j_max_transaction_buffers;
  
        /**
+        * @j_revoke_records_per_block:
+        *
+        * Number of revoke records that fit in one descriptor block.
+        */
+       int                     j_revoke_records_per_block;
+       /**
         * @j_commit_interval:
         *
         * What is the maximum transaction lifetime before we begin a commit?
  #define jbd2_might_wait_for_commit(j) \
        do { \
                rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \
 -              rwsem_release(&j->j_trans_commit_map, 1, _THIS_IP_); \
 +              rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \
        } while (0)
  
  /* journal feature predicate functions */
@@@ -1257,7 -1256,7 +1256,7 @@@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3,              CSU
  
  /* Filing buffers */
  extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
- extern void __jbd2_journal_refile_buffer(struct journal_head *);
+ extern bool __jbd2_journal_refile_buffer(struct journal_head *);
  extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
  extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
  extern void __journal_free_buffer(struct journal_head *bh);
@@@ -1358,14 -1357,16 +1357,16 @@@ static inline handle_t *journal_current
  
  extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
  extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks,
-                                    gfp_t gfp_mask, unsigned int type,
-                                    unsigned int line_no);
+                                    int revoke_records, gfp_t gfp_mask,
+                                    unsigned int type, unsigned int line_no);
  extern int     jbd2_journal_restart(handle_t *, int nblocks);
- extern int     jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask);
+ extern int     jbd2__journal_restart(handle_t *, int nblocks,
+                                      int revoke_records, gfp_t gfp_mask);
  extern int     jbd2_journal_start_reserved(handle_t *handle,
                                unsigned int type, unsigned int line_no);
  extern void    jbd2_journal_free_reserved(handle_t *handle);
- extern int     jbd2_journal_extend (handle_t *, int nblocks);
+ extern int     jbd2_journal_extend(handle_t *handle, int nblocks,
+                                    int revoke_records);
  extern int     jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
  extern int     jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
  extern int     jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
@@@ -1561,37 -1562,18 +1562,18 @@@ static inline int jbd2_journal_has_csum
  }
  
  /*
-  * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for
-  * transaction control blocks.
-  */
- #define JBD2_CONTROL_BLOCKS_SHIFT 5
- /*
-  * Return the minimum number of blocks which must be free in the journal
-  * before a new transaction may be started.  Must be called under j_state_lock.
-  */
- static inline int jbd2_space_needed(journal_t *journal)
- {
-       int nblocks = journal->j_max_transaction_buffers;
-       return nblocks + (nblocks >> JBD2_CONTROL_BLOCKS_SHIFT);
- }
- /*
   * Return number of free blocks in the log. Must be called under j_state_lock.
   */
  static inline unsigned long jbd2_log_space_left(journal_t *journal)
  {
        /* Allow for rounding errors */
-       unsigned long free = journal->j_free - 32;
+       long free = journal->j_free - 32;
  
        if (journal->j_committing_transaction) {
-               unsigned long committing = atomic_read(&journal->
-                       j_committing_transaction->t_outstanding_credits);
-               /* Transaction + control blocks */
-               free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT);
+               free -= atomic_read(&journal->
+                         j_committing_transaction->t_outstanding_credits);
        }
-       return free;
+       return max_t(long, free, 0);
  }
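
Since the committing transaction's full t_outstanding_credits is now subtracted (the old JBD2_CONTROL_BLOCKS_SHIFT fudge is gone), the intermediate value can legitimately go negative, hence the signed type and the final clamp. A minimal sketch with assumed numbers:

    #include <stdio.h>

    static long log_space_left(long j_free, long committing_credits)
    {
            long free = j_free - 32; /* allow for rounding errors */

            free -= committing_credits;
            return free > 0 ? free : 0; /* mirrors max_t(long, free, 0) */
    }

    int main(void)
    {
            /* assumed: 1000 free blocks, 1200 credits still committing */
            printf("%ld\n", log_space_left(1000, 1200)); /* prints 0 */
            return 0;
    }
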
  
  /*
@@@ -1645,6 -1627,20 +1627,20 @@@ static inline tid_t  jbd2_get_latest_tr
        return tid;
  }
  
+ static inline int jbd2_handle_buffer_credits(handle_t *handle)
+ {
+       journal_t *journal;
+       if (!handle->h_reserved)
+               journal = handle->h_transaction->t_journal;
+       else
+               journal = handle->h_journal;
+       return handle->h_total_credits -
+               DIV_ROUND_UP(handle->h_revoke_credits_requested,
+                            journal->j_revoke_records_per_block);
+ }
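
Callers that used to read h_buffer_credits directly now go through this accessor, which reports only the credits usable for dirty buffers. For illustration, the accessor's arithmetic with concrete numbers; all three values are assumptions, not taken from the patch:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            int h_total_credits            = 100; /* assumed */
            int h_revoke_credits_requested = 600; /* assumed */
            int revoke_records_per_block   = 500; /* assumed */

            /* buffer credits = total minus the revoke-descriptor share */
            int buffer_credits = h_total_credits -
                    DIV_ROUND_UP(h_revoke_credits_requested,
                                 revoke_records_per_block);
            printf("buffer credits: %d\n", buffer_credits); /* prints 98 */
            return 0;
    }
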
  #ifdef __KERNEL__
  
  #define buffer_trace_init(bh) do {} while (0)