Skip to content

Commit 48a066e

Browse files
author
Al Viro
committed
RCU'd vfsmounts
* RCU-delayed freeing of vfsmounts
* vfsmount_lock replaced with a seqlock (mount_lock)
* sequence number from mount_lock is stored in nameidata->m_seq and used when we exit RCU mode
* new vfsmount flag - MNT_SYNC_UMOUNT. Set by umount_tree() when its caller knows that vfsmount will have no surviving references.
* synchronize_rcu() done between unlocking namespace_sem in namespace_unlock() and doing pending mntput().
* new helper: legitimize_mnt(mnt, seq). Checks the mount_lock sequence number against seq, then grabs reference to mnt. Then it rechecks mount_lock again to close the race and either returns success or drops the reference it has acquired. The subtle point is that in case of MNT_SYNC_UMOUNT we can simply decrement the refcount and sod off - aforementioned synchronize_rcu() makes sure that final mntput() won't come until we leave RCU mode. We need that, since we don't want to end up with some lazy pathwalk racing with umount() and stealing the final mntput() from it - caller of umount() may expect it to return only once the fs is shut down and we don't want to break that. In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do full-blown mntput() in case of mount_lock sequence number mismatch happening just as we'd grabbed the reference, but in those cases we won't be stealing the final mntput() from anything that would care.
* mntput_no_expire() doesn't lock anything on the fast path now. Incidentally, SMP and UP cases are handled the same way - no ifdefs there.
* normal pathname resolution does *not* do any writes to mount_lock. It does, of course, bump the refcounts of vfsmount and dentry in the very end, but that's it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
1 parent 42c3260 commit 48a066e

File tree

6 files changed

+136
-83
lines changed

6 files changed

+136
-83
lines changed

fs/dcache.c

+14-6
Original file line numberDiff line numberDiff line change
@@ -2887,24 +2887,28 @@ static int prepend_path(const struct path *path,
28872887
struct vfsmount *vfsmnt = path->mnt;
28882888
struct mount *mnt = real_mount(vfsmnt);
28892889
int error = 0;
2890-
unsigned seq = 0;
2890+
unsigned seq, m_seq = 0;
28912891
char *bptr;
28922892
int blen;
28932893

2894-
br_read_lock(&vfsmount_lock);
28952894
rcu_read_lock();
2895+
restart_mnt:
2896+
read_seqbegin_or_lock(&mount_lock, &m_seq);
2897+
seq = 0;
28962898
restart:
28972899
bptr = *buffer;
28982900
blen = *buflen;
2901+
error = 0;
28992902
read_seqbegin_or_lock(&rename_lock, &seq);
29002903
while (dentry != root->dentry || vfsmnt != root->mnt) {
29012904
struct dentry * parent;
29022905

29032906
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
2907+
struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
29042908
/* Global root? */
2905-
if (mnt_has_parent(mnt)) {
2906-
dentry = mnt->mnt_mountpoint;
2907-
mnt = mnt->mnt_parent;
2909+
if (mnt != parent) {
2910+
dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
2911+
mnt = parent;
29082912
vfsmnt = &mnt->mnt;
29092913
continue;
29102914
}
@@ -2938,7 +2942,11 @@ static int prepend_path(const struct path *path,
29382942
goto restart;
29392943
}
29402944
done_seqretry(&rename_lock, seq);
2941-
br_read_unlock(&vfsmount_lock);
2945+
if (need_seqretry(&mount_lock, m_seq)) {
2946+
m_seq = 1;
2947+
goto restart_mnt;
2948+
}
2949+
done_seqretry(&mount_lock, m_seq);
29422950

29432951
if (error >= 0 && bptr == *buffer) {
29442952
if (--blen < 0)

fs/mount.h

+6-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#include <linux/mount.h>
22
#include <linux/seq_file.h>
33
#include <linux/poll.h>
4-
#include <linux/lglock.h>
54

65
struct mnt_namespace {
76
atomic_t count;
@@ -30,6 +29,7 @@ struct mount {
3029
struct mount *mnt_parent;
3130
struct dentry *mnt_mountpoint;
3231
struct vfsmount mnt;
32+
struct rcu_head mnt_rcu;
3333
#ifdef CONFIG_SMP
3434
struct mnt_pcp __percpu *mnt_pcp;
3535
#else
@@ -80,21 +80,23 @@ static inline int is_mounted(struct vfsmount *mnt)
8080
extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
8181
extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
8282

83+
extern bool legitimize_mnt(struct vfsmount *, unsigned);
84+
8385
static inline void get_mnt_ns(struct mnt_namespace *ns)
8486
{
8587
atomic_inc(&ns->count);
8688
}
8789

88-
extern struct lglock vfsmount_lock;
90+
extern seqlock_t mount_lock;
8991

9092
static inline void lock_mount_hash(void)
9193
{
92-
br_write_lock(&vfsmount_lock);
94+
write_seqlock(&mount_lock);
9395
}
9496

9597
static inline void unlock_mount_hash(void)
9698
{
97-
br_write_unlock(&vfsmount_lock);
99+
write_sequnlock(&mount_lock);
98100
}
99101

100102
struct proc_mounts {

fs/namei.c

+26-24
Original file line numberDiff line numberDiff line change
@@ -484,14 +484,12 @@ EXPORT_SYMBOL(path_put);
484484

485485
static inline void lock_rcu_walk(void)
486486
{
487-
br_read_lock(&vfsmount_lock);
488487
rcu_read_lock();
489488
}
490489

491490
static inline void unlock_rcu_walk(void)
492491
{
493492
rcu_read_unlock();
494-
br_read_unlock(&vfsmount_lock);
495493
}
496494

497495
/**
@@ -512,26 +510,23 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
512510
BUG_ON(!(nd->flags & LOOKUP_RCU));
513511

514512
/*
515-
* Get a reference to the parent first: we're
516-
* going to make "path_put(nd->path)" valid in
517-
* non-RCU context for "terminate_walk()".
518-
*
519-
* If this doesn't work, return immediately with
520-
* RCU walking still active (and then we will do
521-
* the RCU walk cleanup in terminate_walk()).
513+
* After legitimizing the bastards, terminate_walk()
514+
* will do the right thing for non-RCU mode, and all our
515+
* subsequent exit cases should rcu_read_unlock()
516+
* before returning. Do vfsmount first; if dentry
517+
* can't be legitimized, just set nd->path.dentry to NULL
518+
* and rely on dput(NULL) being a no-op.
522519
*/
523-
if (!lockref_get_not_dead(&parent->d_lockref))
520+
if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
524521
return -ECHILD;
525-
526-
/*
527-
* After the mntget(), we terminate_walk() will do
528-
* the right thing for non-RCU mode, and all our
529-
* subsequent exit cases should unlock_rcu_walk()
530-
* before returning.
531-
*/
532-
mntget(nd->path.mnt);
533522
nd->flags &= ~LOOKUP_RCU;
534523

524+
if (!lockref_get_not_dead(&parent->d_lockref)) {
525+
nd->path.dentry = NULL;
526+
unlock_rcu_walk();
527+
return -ECHILD;
528+
}
529+
535530
/*
536531
* For a negative lookup, the lookup sequence point is the parents
537532
* sequence point, and it only needs to revalidate the parent dentry.
@@ -608,16 +603,21 @@ static int complete_walk(struct nameidata *nd)
608603
if (!(nd->flags & LOOKUP_ROOT))
609604
nd->root.mnt = NULL;
610605

606+
if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
607+
unlock_rcu_walk();
608+
return -ECHILD;
609+
}
611610
if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
612611
unlock_rcu_walk();
612+
mntput(nd->path.mnt);
613613
return -ECHILD;
614614
}
615615
if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
616616
unlock_rcu_walk();
617617
dput(dentry);
618+
mntput(nd->path.mnt);
618619
return -ECHILD;
619620
}
620-
mntget(nd->path.mnt);
621621
unlock_rcu_walk();
622622
}
623623

@@ -909,15 +909,15 @@ int follow_up(struct path *path)
909909
struct mount *parent;
910910
struct dentry *mountpoint;
911911

912-
br_read_lock(&vfsmount_lock);
912+
read_seqlock_excl(&mount_lock);
913913
parent = mnt->mnt_parent;
914914
if (parent == mnt) {
915-
br_read_unlock(&vfsmount_lock);
915+
read_sequnlock_excl(&mount_lock);
916916
return 0;
917917
}
918918
mntget(&parent->mnt);
919919
mountpoint = dget(mnt->mnt_mountpoint);
920-
br_read_unlock(&vfsmount_lock);
920+
read_sequnlock_excl(&mount_lock);
921921
dput(path->dentry);
922922
path->dentry = mountpoint;
923923
mntput(path->mnt);
@@ -1048,8 +1048,8 @@ static int follow_managed(struct path *path, unsigned flags)
10481048

10491049
/* Something is mounted on this dentry in another
10501050
* namespace and/or whatever was mounted there in this
1051-
* namespace got unmounted before we managed to get the
1052-
* vfsmount_lock */
1051+
* namespace got unmounted before lookup_mnt() could
1052+
* get it */
10531053
}
10541054

10551055
/* Handle an automount point */
@@ -1864,6 +1864,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
18641864
if (flags & LOOKUP_RCU) {
18651865
lock_rcu_walk();
18661866
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1867+
nd->m_seq = read_seqbegin(&mount_lock);
18671868
} else {
18681869
path_get(&nd->path);
18691870
}
@@ -1872,6 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
18721873

18731874
nd->root.mnt = NULL;
18741875

1876+
nd->m_seq = read_seqbegin(&mount_lock);
18751877
if (*name=='/') {
18761878
if (flags & LOOKUP_RCU) {
18771879
lock_rcu_walk();

0 commit comments

Comments
 (0)