
Commit 8bcbbe9

fuse: Convert to using invalidate_lock
Use invalidate_lock instead of fuse's private i_mmap_sem. The intended
purpose is exactly the same. By this conversion we fix a long standing
race between hole punching and read(2) / readahead(2) paths that can
lead to stale page cache contents.

CC: Miklos Szeredi <miklos@szeredi.hu>
Reviewed-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
1 parent edc6d01
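For context on the primitive this commit adopts: invalidate_lock is an
rw_semaphore embedded in struct address_space, taken exclusively by paths
that remove page cache contents (truncation, hole punching) and shared by
paths that fill the page cache (fault, readahead). Below is a minimal
sketch of that protocol; the two helper functions are hypothetical
illustrations, not fuse or VFS code, while the filemap_invalidate_*()
calls and truncate_pagecache_range() are the real kernel APIs.

#include <linux/fs.h>
#include <linux/mm.h>

/*
 * Hypothetical writer side: anything that throws away page cache
 * contents takes the lock exclusively, so no new pages can be
 * instantiated while the underlying blocks are being freed.
 */
static void punch_hole_sketch(struct inode *inode, loff_t start, loff_t end)
{
	filemap_invalidate_lock(inode->i_mapping);
	truncate_pagecache_range(inode, start, end);
	/* ... release the underlying blocks here ... */
	filemap_invalidate_unlock(inode->i_mapping);
}

/*
 * Hypothetical reader side: paths that instantiate page cache pages
 * (fault, readahead) take the lock shared. They run concurrently with
 * each other but not with a hole punch, which closes the stale page
 * cache race the commit message describes.
 */
static void fill_pagecache_sketch(struct address_space *mapping)
{
	filemap_invalidate_lock_shared(mapping);
	/* ... map blocks and add uptodate pages to the cache ... */
	filemap_invalidate_unlock_shared(mapping);
}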

5 files changed: 35 additions & 44 deletions

fs/fuse/dax.c

Lines changed: 24 additions & 26 deletions
@@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
 	/*
 	 * Can't do inline reclaim in fault path. We call
 	 * dax_layout_busy_page() before we free a range. And
-	 * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
-	 * In fault path we enter with fi->i_mmap_sem held and can't drop
-	 * it. Also in fault path we hold fi->i_mmap_sem shared and not
-	 * exclusive, so that creates further issues with fuse_wait_dax_page().
-	 * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
-	 * range to become free and retry.
+	 * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
+	 * In fault path we enter with mapping->invalidate_lock held and can't
+	 * drop it. Also in fault path we hold mapping->invalidate_lock shared
+	 * and not exclusive, so that creates further issues with
+	 * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault()
+	 * will wait for a memory range to become free and retry.
 	 */
 	if (flags & IOMAP_FAULT) {
 		alloc_dmap = alloc_dax_mapping(fcd);
@@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
 	down_write(&fi->dax->sem);
 	node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
 
-	/* We are holding either inode lock or i_mmap_sem, and that should
+	/* We are holding either inode lock or invalidate_lock, and that should
 	 * ensure that dmap can't be truncated. We are holding a reference
 	 * on dmap and that should make sure it can't be reclaimed. So dmap
 	 * should still be there in tree despite the fact we dropped and
@@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = {
 
 static void fuse_wait_dax_page(struct inode *inode)
 {
-	struct fuse_inode *fi = get_fuse_inode(inode);
-
-	up_write(&fi->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
 	schedule();
-	down_write(&fi->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);
 }
 
-/* Should be called with fi->i_mmap_sem lock held exclusively */
+/* Should be called with mapping->invalidate_lock held exclusively */
 static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
 				    loff_t start, loff_t end)
 {
@@ -813,18 +811,18 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
 	 * we do not want any read/write/mmap to make progress and try
 	 * to populate page cache or access memory we are trying to free.
 	 */
-	down_read(&get_fuse_inode(inode)->i_mmap_sem);
+	filemap_invalidate_lock_shared(inode->i_mapping);
 	ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
 	if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
 		error = 0;
 		retry = true;
-		up_read(&get_fuse_inode(inode)->i_mmap_sem);
+		filemap_invalidate_unlock_shared(inode->i_mapping);
 		goto retry;
 	}
 
 	if (ret & VM_FAULT_NEEDDSYNC)
 		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
-	up_read(&get_fuse_inode(inode)->i_mmap_sem);
+	filemap_invalidate_unlock_shared(inode->i_mapping);
 
 	if (write)
 		sb_end_pagefault(sb);
@@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
 	int ret;
 	struct interval_tree_node *node;
 
-	down_write(&fi->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);
 
 	/* Lookup a dmap and corresponding file offset to reclaim. */
 	down_read(&fi->dax->sem);
@@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
 out_write_dmap_sem:
 	up_write(&fi->dax->sem);
 out_mmap_sem:
-	up_write(&fi->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
 	return dmap;
 }
 
@@ -1050,19 +1048,19 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
 		 * had a reference or some other temporary failure,
 		 * Try again. We want to give up inline reclaim only
 		 * if there is no range assigned to this node. Otherwise
-		 * if a deadlock is possible if we sleep with fi->i_mmap_sem
-		 * held and worker to free memory can't make progress due
-		 * to unavailability of fi->i_mmap_sem lock. So sleep
-		 * only if fi->dax->nr=0
+		 * if a deadlock is possible if we sleep with
+		 * mapping->invalidate_lock held and worker to free memory
+		 * can't make progress due to unavailability of
+		 * mapping->invalidate_lock. So sleep only if fi->dax->nr=0
 		 */
 		if (retry)
 			continue;
 		/*
 		 * There are no mappings which can be reclaimed. Wait for one.
 		 * We are not holding fi->dax->sem. So it is possible
 		 * that range gets added now. But as we are not holding
-		 * fi->i_mmap_sem, worker should still be able to free up
-		 * a range and wake us up.
+		 * mapping->invalidate_lock, worker should still be able to
+		 * free up a range and wake us up.
 		 */
 		if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
 			if (wait_event_killable_exclusive(fcd->range_waitq,
@@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
 /*
  * Free a range of memory.
  * Locking:
- * 1. Take fi->i_mmap_sem to block dax faults.
+ * 1. Take mapping->invalidate_lock to block dax faults.
  * 2. Take fi->dax->sem to protect interval tree and also to make sure
  *    read/write can not reuse a dmap which we might be freeing.
  */
@@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
 	loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
 	loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
 
-	down_write(&fi->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);
 	ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
 	if (ret) {
 		pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
@@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
 	ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
 	up_write(&fi->dax->sem);
 out_mmap_sem:
-	up_write(&fi->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
 	return ret;
 }

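The fuse_wait_dax_page() hunk above keeps the existing wait protocol,
just under the new lock: the caller enters with mapping->invalidate_lock
held exclusively, and the helper drops it around schedule() so the worker
freeing memory ranges can take the lock and make progress. The retry loop
around __fuse_dax_break_layouts() is not itself touched by this commit;
the sketch below (an illustrative reconstruction, assuming the signature
visible in the hunks above) shows only how the pieces fit together.

/*
 * Illustrative sketch of the retry loop around
 * __fuse_dax_break_layouts(); not part of this diff. The caller must
 * hold mapping->invalidate_lock exclusively, per the updated comment.
 */
static int fuse_dax_break_layouts_sketch(struct inode *inode,
					 loff_t start, loff_t end)
{
	bool retry;
	int ret;

	do {
		retry = false;
		/*
		 * May end up in fuse_wait_dax_page(), which drops and
		 * reacquires mapping->invalidate_lock around schedule().
		 */
		ret = __fuse_dax_break_layouts(inode, &retry, start, end);
	} while (ret == 0 && retry);

	return ret;
}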
fs/fuse/dir.c

Lines changed: 6 additions & 5 deletions
@@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	struct fuse_mount *fm = get_fuse_mount(inode);
 	struct fuse_conn *fc = fm->fc;
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct address_space *mapping = inode->i_mapping;
 	FUSE_ARGS(args);
 	struct fuse_setattr_in inarg;
 	struct fuse_attr_out outarg;
@@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	}
 
 	if (FUSE_IS_DAX(inode) && is_truncate) {
-		down_write(&fi->i_mmap_sem);
+		filemap_invalidate_lock(mapping);
 		fault_blocked = true;
 		err = fuse_dax_break_layouts(inode, 0, 0);
 		if (err) {
-			up_write(&fi->i_mmap_sem);
+			filemap_invalidate_unlock(mapping);
 			return err;
 		}
 	}
@@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	if ((is_truncate || !is_wb) &&
 	    S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
 		truncate_pagecache(inode, outarg.attr.size);
-		invalidate_inode_pages2(inode->i_mapping);
+		invalidate_inode_pages2(mapping);
 	}
 
 	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 out:
 	if (fault_blocked)
-		up_write(&fi->i_mmap_sem);
+		filemap_invalidate_unlock(mapping);
 
 	return 0;
@@ -1711,7 +1712,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
 	if (fault_blocked)
-		up_write(&fi->i_mmap_sem);
+		filemap_invalidate_unlock(mapping);
 	return err;
 }

fs/fuse/file.c

Lines changed: 5 additions & 5 deletions
@@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 	}
 
 	if (dax_truncate) {
-		down_write(&get_fuse_inode(inode)->i_mmap_sem);
+		filemap_invalidate_lock(inode->i_mapping);
 		err = fuse_dax_break_layouts(inode, 0, 0);
 		if (err)
 			goto out;
@@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 
 out:
 	if (dax_truncate)
-		up_write(&get_fuse_inode(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(inode->i_mapping);
 
 	if (is_wb_truncate | dax_truncate) {
 		fuse_release_nowrite(inode);
@@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	if (lock_inode) {
 		inode_lock(inode);
 		if (block_faults) {
-			down_write(&fi->i_mmap_sem);
+			filemap_invalidate_lock(inode->i_mapping);
 			err = fuse_dax_break_layouts(inode, 0, 0);
 			if (err)
 				goto out;
@@ -2976,7 +2976,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
 	if (block_faults)
-		up_write(&fi->i_mmap_sem);
+		filemap_invalidate_unlock(inode->i_mapping);
 
 	if (lock_inode)
 		inode_unlock(inode);
@@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * modifications. Yet this does give less guarantees than if the
 	 * copying was performed with write(2).
 	 *
-	 * To fix this a i_mmap_sem style lock could be used to prevent new
+	 * To fix this a mapping->invalidate_lock could be used to prevent new
 	 * faults while the copy is ongoing.
 	 */
 	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);

fs/fuse/fuse_i.h

Lines changed: 0 additions & 7 deletions
@@ -149,13 +149,6 @@ struct fuse_inode {
 	/** Lock to protect write related fields */
 	spinlock_t lock;
 
-	/**
-	 * Can't take inode lock in fault path (leads to circular dependency).
-	 * Introduce another semaphore which can be taken in fault path and
-	 * then other filesystem paths can take this to block faults.
-	 */
-	struct rw_semaphore i_mmap_sem;
-
 #ifdef CONFIG_FUSE_DAX
 	/*
 	 * Dax specific inode data

fs/fuse/inode.c

Lines changed: 0 additions & 1 deletion
@@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->orig_ino = 0;
 	fi->state = 0;
 	mutex_init(&fi->mutex);
-	init_rwsem(&fi->i_mmap_sem);
 	spin_lock_init(&fi->lock);
 	fi->forget = fuse_alloc_forget();
 	if (!fi->forget)
