Skip to content

Commit b0ad381

Browse files
josefbacik authored and kdave committed
btrfs: fix deadlock with fiemap and extent locking
While working on the patchset to remove extent locking I got a lockdep splat with fiemap and pagefaulting with my new extent lock replacement lock. This deadlock exists with our normal code; we just don't have lockdep annotations with the extent locking, so we've never noticed it. Since we're copying the fiemap extent to user space on every iteration we have the chance of pagefaulting. Because we hold the extent lock for the entire range we could mkwrite into a range in the file that we have mmap'ed. This would deadlock with the following stack trace: [<0>] lock_extent+0x28d/0x2f0 [<0>] btrfs_page_mkwrite+0x273/0x8a0 [<0>] do_page_mkwrite+0x50/0xb0 [<0>] do_fault+0xc1/0x7b0 [<0>] __handle_mm_fault+0x2fa/0x460 [<0>] handle_mm_fault+0xa4/0x330 [<0>] do_user_addr_fault+0x1f4/0x800 [<0>] exc_page_fault+0x7c/0x1e0 [<0>] asm_exc_page_fault+0x26/0x30 [<0>] rep_movs_alternative+0x33/0x70 [<0>] _copy_to_user+0x49/0x70 [<0>] fiemap_fill_next_extent+0xc8/0x120 [<0>] emit_fiemap_extent+0x4d/0xa0 [<0>] extent_fiemap+0x7f8/0xad0 [<0>] btrfs_fiemap+0x49/0x80 [<0>] __x64_sys_ioctl+0x3e1/0xb50 [<0>] do_syscall_64+0x94/0x1a0 [<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76 I wrote an fstest to reproduce this deadlock without my replacement lock and verified that the deadlock exists with our existing locking. To fix this, simply don't take the extent lock for the entire duration of the fiemap. This is safe in general because we keep track of where we are when we're searching the tree, so if an ordered extent updates in the middle of our fiemap call we'll still emit the correct extents because we know what offset we were on before. The only place we maintain the lock is searching delalloc. Since the delalloc stuff can change during writeback, we want to lock the extent range so we have a consistent view of delalloc at the time we're checking to see if we need to set the delalloc flag. With this patch applied we no longer deadlock with my testcase.
CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
1 parent e42b9d8 commit b0ad381

File tree

1 file changed

+45
-17
lines changed

1 file changed

+45
-17
lines changed

fs/btrfs/extent_io.c

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2689,16 +2689,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
26892689
* it beyond i_size.
26902690
*/
26912691
while (cur_offset < end && cur_offset < i_size) {
2692+
struct extent_state *cached_state = NULL;
26922693
u64 delalloc_start;
26932694
u64 delalloc_end;
26942695
u64 prealloc_start;
2696+
u64 lockstart;
2697+
u64 lockend;
26952698
u64 prealloc_len = 0;
26962699
bool delalloc;
26972700

2701+
lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize);
2702+
lockend = round_up(end, inode->root->fs_info->sectorsize);
2703+
2704+
/*
2705+
* We are only locking for the delalloc range because that's the
2706+
* only thing that can change here. With fiemap we have a lock
2707+
* on the inode, so no buffered or direct writes can happen.
2708+
*
2709+
* However mmaps and normal page writeback will cause this to
2710+
* change arbitrarily. We have to lock the extent lock here to
2711+
* make sure that nobody messes with the tree while we're doing
2712+
* btrfs_find_delalloc_in_range.
2713+
*/
2714+
lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
26982715
delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
26992716
delalloc_cached_state,
27002717
&delalloc_start,
27012718
&delalloc_end);
2719+
unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
27022720
if (!delalloc)
27032721
break;
27042722

@@ -2866,15 +2884,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
28662884
u64 start, u64 len)
28672885
{
28682886
const u64 ino = btrfs_ino(inode);
2869-
struct extent_state *cached_state = NULL;
28702887
struct extent_state *delalloc_cached_state = NULL;
28712888
struct btrfs_path *path;
28722889
struct fiemap_cache cache = { 0 };
28732890
struct btrfs_backref_share_check_ctx *backref_ctx;
28742891
u64 last_extent_end;
28752892
u64 prev_extent_end;
2876-
u64 lockstart;
2877-
u64 lockend;
2893+
u64 range_start;
2894+
u64 range_end;
2895+
const u64 sectorsize = inode->root->fs_info->sectorsize;
28782896
bool stopped = false;
28792897
int ret;
28802898

@@ -2885,20 +2903,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
28852903
goto out;
28862904
}
28872905

2888-
lockstart = round_down(start, inode->root->fs_info->sectorsize);
2889-
lockend = round_up(start + len, inode->root->fs_info->sectorsize);
2890-
prev_extent_end = lockstart;
2906+
range_start = round_down(start, sectorsize);
2907+
range_end = round_up(start + len, sectorsize);
2908+
prev_extent_end = range_start;
28912909

28922910
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
2893-
lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
28942911

28952912
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
28962913
if (ret < 0)
28972914
goto out_unlock;
28982915
btrfs_release_path(path);
28992916

29002917
path->reada = READA_FORWARD;
2901-
ret = fiemap_search_slot(inode, path, lockstart);
2918+
ret = fiemap_search_slot(inode, path, range_start);
29022919
if (ret < 0) {
29032920
goto out_unlock;
29042921
} else if (ret > 0) {
@@ -2910,7 +2927,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
29102927
goto check_eof_delalloc;
29112928
}
29122929

2913-
while (prev_extent_end < lockend) {
2930+
while (prev_extent_end < range_end) {
29142931
struct extent_buffer *leaf = path->nodes[0];
29152932
struct btrfs_file_extent_item *ei;
29162933
struct btrfs_key key;
@@ -2933,19 +2950,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
29332950
* The first iteration can leave us at an extent item that ends
29342951
* before our range's start. Move to the next item.
29352952
*/
2936-
if (extent_end <= lockstart)
2953+
if (extent_end <= range_start)
29372954
goto next_item;
29382955

29392956
backref_ctx->curr_leaf_bytenr = leaf->start;
29402957

29412958
/* We have in implicit hole (NO_HOLES feature enabled). */
29422959
if (prev_extent_end < key.offset) {
2943-
const u64 range_end = min(key.offset, lockend) - 1;
2960+
const u64 hole_end = min(key.offset, range_end) - 1;
29442961

29452962
ret = fiemap_process_hole(inode, fieinfo, &cache,
29462963
&delalloc_cached_state,
29472964
backref_ctx, 0, 0, 0,
2948-
prev_extent_end, range_end);
2965+
prev_extent_end, hole_end);
29492966
if (ret < 0) {
29502967
goto out_unlock;
29512968
} else if (ret > 0) {
@@ -2955,7 +2972,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
29552972
}
29562973

29572974
/* We've reached the end of the fiemap range, stop. */
2958-
if (key.offset >= lockend) {
2975+
if (key.offset >= range_end) {
29592976
stopped = true;
29602977
break;
29612978
}
@@ -3049,29 +3066,41 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
30493066
btrfs_free_path(path);
30503067
path = NULL;
30513068

3052-
if (!stopped && prev_extent_end < lockend) {
3069+
if (!stopped && prev_extent_end < range_end) {
30533070
ret = fiemap_process_hole(inode, fieinfo, &cache,
30543071
&delalloc_cached_state, backref_ctx,
3055-
0, 0, 0, prev_extent_end, lockend - 1);
3072+
0, 0, 0, prev_extent_end, range_end - 1);
30563073
if (ret < 0)
30573074
goto out_unlock;
3058-
prev_extent_end = lockend;
3075+
prev_extent_end = range_end;
30593076
}
30603077

30613078
if (cache.cached && cache.offset + cache.len >= last_extent_end) {
30623079
const u64 i_size = i_size_read(&inode->vfs_inode);
30633080

30643081
if (prev_extent_end < i_size) {
3082+
struct extent_state *cached_state = NULL;
30653083
u64 delalloc_start;
30663084
u64 delalloc_end;
3085+
u64 lockstart;
3086+
u64 lockend;
30673087
bool delalloc;
30683088

3089+
lockstart = round_down(prev_extent_end, sectorsize);
3090+
lockend = round_up(i_size, sectorsize);
3091+
3092+
/*
3093+
* See the comment in fiemap_process_hole as to why
3094+
* we're doing the locking here.
3095+
*/
3096+
lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
30693097
delalloc = btrfs_find_delalloc_in_range(inode,
30703098
prev_extent_end,
30713099
i_size - 1,
30723100
&delalloc_cached_state,
30733101
&delalloc_start,
30743102
&delalloc_end);
3103+
unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
30753104
if (!delalloc)
30763105
cache.flags |= FIEMAP_EXTENT_LAST;
30773106
} else {
@@ -3082,7 +3111,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
30823111
ret = emit_last_fiemap_cache(fieinfo, &cache);
30833112

30843113
out_unlock:
3085-
unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
30863114
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
30873115
out:
30883116
free_extent_state(delalloc_cached_state);

0 commit comments

Comments (0)