Skip to content

Commit 402e38e

Browse files
zhangyi089tytso
authored andcommitted
ext4: prevent stale extent cache entries caused by concurrent I/O writeback
Currently, in the I/O writeback path, ext4_map_blocks() may attempt to cache additional unrelated extents in the extent status tree without holding the inode's i_rwsem and the mapping's invalidate_lock. This can lead to stale extent status entries remaining in certain scenarios, potentially causing data corruption. For example, when performing a collapse range in ext4_collapse_range(), it clears the extent cache and dirty pages before removing blocks and shifting extents. It also holds the i_data_sem during these two operations. However, both ext4_ext_remove_space() and ext4_ext_shift_extents() may briefly release the i_data_sem if journal credits are insufficient (ext4_datasem_ensure_credits()). If another writeback process writes dirty pages from other regions during this interval, it may cache extents that are about to be modified. Unless ext4_collapse_range() explicitly clears the extent cache again, these cached entries can become stale and inconsistent with the actual extents. 0 a n b c m | | | | | | [www][wwwwww][wwwwwwww]...[wwwww][wwww]... | | N M Assume that block a is dirty. The collapse range operation is removing data from n to m and drops i_data_sem immediately after removing the extent from b to c. At the same time, a concurrent writeback begins to write back block a; it will reloads the extent from [n, b) into the extent status tree since it does not hold the i_rwsem or the invalidate_lock. After the collapse range operation, it left the stale extent [n, b), which points logical block n to N, but the actual physical block of n should be M. Similarly, both ext4_insert_range() and ext4_truncate() have the same problem. ext4_punch_hole() survived since it re-add a hole extent entry after removing space since commit 9f11182 ("ext4: add a hole extent entry in cache after punch"). In most cases, during dirty page writeback, the block mapping information is likely to be found in the extent cache, making it less necessary to search for physical extents. Consequently, loading unrelated extent caches during writeback appears to be ineffective. Therefore, fix this by adds EXT4_EX_NOCACHE in the writeback path to prevent caching of unrelated extents, eliminating this potential source of corruption. Signed-off-by: Zhang Yi <yi.zhang@huawei.com> Link: https://patch.msgid.link/20250423085257.122685-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o <tytso@mit.edu>
1 parent 86b349c commit 402e38e

File tree

4 files changed

+32
-12
lines changed

4 files changed

+32
-12
lines changed

fs/ext4/ext4.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,7 @@ enum {
741741
#define EXT4_EX_NOCACHE 0x40000000
742742
#define EXT4_EX_FORCE_CACHE 0x20000000
743743
#define EXT4_EX_NOFAIL 0x10000000
744+
#define EXT4_EX_FILTER 0x70000000
744745

745746
/*
746747
* Flags used by ext4_free_blocks

fs/ext4/extents.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4202,7 +4202,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
42024202
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
42034203

42044204
/* find extent for this block */
4205-
path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4205+
path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
42064206
if (IS_ERR(path)) {
42074207
err = PTR_ERR(path);
42084208
goto out;
@@ -4315,7 +4315,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
43154315
goto out;
43164316
ar.lright = map->m_lblk;
43174317
err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
4318-
&ex2, 0);
4318+
&ex2, flags);
43194319
if (err < 0)
43204320
goto out;
43214321

@@ -4820,8 +4820,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
48204820
break;
48214821
}
48224822
}
4823+
/*
4824+
* Do not cache any unrelated extents, as it does not hold the
4825+
* i_rwsem or invalidate_lock, which could corrupt the extent
4826+
* status tree.
4827+
*/
48234828
ret = ext4_map_blocks(handle, inode, &map,
4824-
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4829+
EXT4_GET_BLOCKS_IO_CONVERT_EXT |
4830+
EXT4_EX_NOCACHE);
48254831
if (ret <= 0)
48264832
ext4_warning(inode->i_sb,
48274833
"inode #%lu: block %u: len %u: "

fs/ext4/fast_commit.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -918,7 +918,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
918918
map.m_lblk = cur_lblk_off;
919919
map.m_len = new_blk_size - cur_lblk_off + 1;
920920
ret = ext4_map_blocks(NULL, inode, &map,
921-
EXT4_GET_BLOCKS_IO_SUBMIT);
921+
EXT4_GET_BLOCKS_IO_SUBMIT |
922+
EXT4_EX_NOCACHE);
922923
if (ret < 0)
923924
return -ECANCELED;
924925

fs/ext4/inode.c

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -463,15 +463,16 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
463463
#endif /* ES_AGGRESSIVE_TEST */
464464

465465
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
466-
struct ext4_map_blocks *map)
466+
struct ext4_map_blocks *map, int flags)
467467
{
468468
unsigned int status;
469469
int retval;
470470

471+
flags &= EXT4_EX_FILTER;
471472
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
472-
retval = ext4_ext_map_blocks(handle, inode, map, 0);
473+
retval = ext4_ext_map_blocks(handle, inode, map, flags);
473474
else
474-
retval = ext4_ind_map_blocks(handle, inode, map, 0);
475+
retval = ext4_ind_map_blocks(handle, inode, map, flags);
475476

476477
if (retval <= 0)
477478
return retval;
@@ -622,6 +623,13 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
622623
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
623624
return -EFSCORRUPTED;
624625

626+
/*
627+
* Do not allow caching of unrelated ranges of extents during I/O
628+
* submission.
629+
*/
630+
if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
631+
WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE));
632+
625633
/* Lookup extent status tree firstly */
626634
if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
627635
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
@@ -667,7 +675,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
667675
* file system block.
668676
*/
669677
down_read(&EXT4_I(inode)->i_data_sem);
670-
retval = ext4_map_query_blocks(handle, inode, map);
678+
retval = ext4_map_query_blocks(handle, inode, map, flags);
671679
up_read((&EXT4_I(inode)->i_data_sem));
672680

673681
found:
@@ -1807,7 +1815,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
18071815
if (ext4_has_inline_data(inode))
18081816
retval = 0;
18091817
else
1810-
retval = ext4_map_query_blocks(NULL, inode, map);
1818+
retval = ext4_map_query_blocks(NULL, inode, map, 0);
18111819
up_read(&EXT4_I(inode)->i_data_sem);
18121820
if (retval)
18131821
return retval < 0 ? retval : 0;
@@ -1830,7 +1838,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
18301838
goto found;
18311839
}
18321840
} else if (!ext4_has_inline_data(inode)) {
1833-
retval = ext4_map_query_blocks(NULL, inode, map);
1841+
retval = ext4_map_query_blocks(NULL, inode, map, 0);
18341842
if (retval) {
18351843
up_write(&EXT4_I(inode)->i_data_sem);
18361844
return retval < 0 ? retval : 0;
@@ -2214,11 +2222,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
22142222
* previously reserved. However we must not fail because we're in
22152223
* writeback and there is nothing we can do about it so it might result
22162224
* in data loss. So use reserved blocks to allocate metadata if
2217-
* possible.
2225+
* possible. In addition, do not cache any unrelated extents, as it
2226+
* only holds the folio lock but does not hold the i_rwsem or
2227+
* invalidate_lock, which could corrupt the extent status tree.
22182228
*/
22192229
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
22202230
EXT4_GET_BLOCKS_METADATA_NOFAIL |
2221-
EXT4_GET_BLOCKS_IO_SUBMIT;
2231+
EXT4_GET_BLOCKS_IO_SUBMIT |
2232+
EXT4_EX_NOCACHE;
2233+
22222234
dioread_nolock = ext4_should_dioread_nolock(inode);
22232235
if (dioread_nolock)
22242236
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;

0 commit comments

Comments
 (0)