Skip to content

Commit e74540b

Browse files
Shuning Zhang authored and torvalds committed
ocfs2: protect extent tree in ocfs2_prepare_inode_for_write()
When the extent tree is modified, it should be protected by inode cluster lock and ip_alloc_sem.

The extent tree is accessed and modified in the ocfs2_prepare_inode_for_write, but isn't protected by ip_alloc_sem.

The following is a case. The function ocfs2_fiemap is accessing the extent tree, which is modified at the same time.

  kernel BUG at fs/ocfs2/extent_map.c:475!
  invalid opcode: 0000 [#1] SMP
  Modules linked in: tun ocfs2 ocfs2_nodemanager configfs ocfs2_stackglue [...]
  CPU: 16 PID: 14047 Comm: o2info Not tainted 4.1.12-124.23.1.el6uek.x86_64 #2
  Hardware name: Oracle Corporation ORACLE SERVER X7-2L/ASM, MB MECH, X7-2L, BIOS 42040600 10/19/2018
  task: ffff88019487e200 ti: ffff88003daa4000 task.ti: ffff88003daa4000
  RIP: ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
  Call Trace:
    ocfs2_fiemap+0x1e3/0x430 [ocfs2]
    do_vfs_ioctl+0x155/0x510
    SyS_ioctl+0x81/0xa0
    system_call_fastpath+0x18/0xd8
  Code: 18 48 c7 c6 60 7f 65 a0 31 c0 bb e2 ff ff ff 48 8b 4a 40 48 8b 7a 28 48 c7 c2 78 2d 66 a0 e8 38 4f 05 00 e9 28 fe ff ff 0f 1f 00 <0f> 0b 66 0f 1f 44 00 00 bb 86 ff ff ff e9 13 fe ff ff 66 0f 1f
  RIP ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
  ---[ end trace c8aa0c8180e869dc ]---
  Kernel panic - not syncing: Fatal exception
  Kernel Offset: disabled

This issue can be reproduced every week in a production environment.

This issue is related to the usage mode. If others use ocfs2 in this mode, the kernel will panic frequently.
[akpm@linux-foundation.org: coding style fixes]
[Fix new warning due to unused function by removing said function - Linus ]

Link: http://lkml.kernel.org/r/1568772175-2906-2-git-send-email-sunny.s.zhang@oracle.com
Signed-off-by: Shuning Zhang <sunny.s.zhang@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Gang He <ghe@suse.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 169226f commit e74540b

File tree

1 file changed

+90
-44
lines changed

1 file changed

+90
-44
lines changed

fs/ocfs2/file.c

Lines changed: 90 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2098,53 +2098,89 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
20982098
return 0;
20992099
}
21002100

2101-
static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2102-
struct file *file,
2103-
loff_t pos, size_t count,
2104-
int *meta_level)
2101+
static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
2102+
struct buffer_head **di_bh,
2103+
int meta_level,
2104+
int overwrite_io,
2105+
int write_sem,
2106+
int wait)
21052107
{
2106-
int ret;
2107-
struct buffer_head *di_bh = NULL;
2108-
u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2109-
u32 clusters =
2110-
ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2108+
int ret = 0;
21112109

2112-
ret = ocfs2_inode_lock(inode, &di_bh, 1);
2113-
if (ret) {
2114-
mlog_errno(ret);
2110+
if (wait)
2111+
ret = ocfs2_inode_lock(inode, NULL, meta_level);
2112+
else
2113+
ret = ocfs2_try_inode_lock(inode,
2114+
overwrite_io ? NULL : di_bh, meta_level);
2115+
if (ret < 0)
21152116
goto out;
2117+
2118+
if (wait) {
2119+
if (write_sem)
2120+
down_write(&OCFS2_I(inode)->ip_alloc_sem);
2121+
else
2122+
down_read(&OCFS2_I(inode)->ip_alloc_sem);
2123+
} else {
2124+
if (write_sem)
2125+
ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2126+
else
2127+
ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2128+
2129+
if (!ret) {
2130+
ret = -EAGAIN;
2131+
goto out_unlock;
2132+
}
21162133
}
21172134

2118-
*meta_level = 1;
2135+
return ret;
21192136

2120-
ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
2121-
if (ret)
2122-
mlog_errno(ret);
2137+
out_unlock:
2138+
brelse(*di_bh);
2139+
ocfs2_inode_unlock(inode, meta_level);
21232140
out:
2124-
brelse(di_bh);
21252141
return ret;
21262142
}
21272143

2144+
static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
2145+
struct buffer_head **di_bh,
2146+
int meta_level,
2147+
int write_sem)
2148+
{
2149+
if (write_sem)
2150+
up_write(&OCFS2_I(inode)->ip_alloc_sem);
2151+
else
2152+
up_read(&OCFS2_I(inode)->ip_alloc_sem);
2153+
2154+
brelse(*di_bh);
2155+
*di_bh = NULL;
2156+
2157+
if (meta_level >= 0)
2158+
ocfs2_inode_unlock(inode, meta_level);
2159+
}
2160+
21282161
static int ocfs2_prepare_inode_for_write(struct file *file,
21292162
loff_t pos, size_t count, int wait)
21302163
{
21312164
int ret = 0, meta_level = 0, overwrite_io = 0;
2165+
int write_sem = 0;
21322166
struct dentry *dentry = file->f_path.dentry;
21332167
struct inode *inode = d_inode(dentry);
21342168
struct buffer_head *di_bh = NULL;
2169+
u32 cpos;
2170+
u32 clusters;
21352171

21362172
/*
21372173
* We start with a read level meta lock and only jump to an ex
21382174
* if we need to make modifications here.
21392175
*/
21402176
for(;;) {
2141-
if (wait)
2142-
ret = ocfs2_inode_lock(inode, NULL, meta_level);
2143-
else
2144-
ret = ocfs2_try_inode_lock(inode,
2145-
overwrite_io ? NULL : &di_bh, meta_level);
2177+
ret = ocfs2_inode_lock_for_extent_tree(inode,
2178+
&di_bh,
2179+
meta_level,
2180+
overwrite_io,
2181+
write_sem,
2182+
wait);
21462183
if (ret < 0) {
2147-
meta_level = -1;
21482184
if (ret != -EAGAIN)
21492185
mlog_errno(ret);
21502186
goto out;
@@ -2156,15 +2192,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
21562192
*/
21572193
if (!wait && !overwrite_io) {
21582194
overwrite_io = 1;
2159-
if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
2160-
ret = -EAGAIN;
2161-
goto out_unlock;
2162-
}
21632195

21642196
ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
2165-
brelse(di_bh);
2166-
di_bh = NULL;
2167-
up_read(&OCFS2_I(inode)->ip_alloc_sem);
21682197
if (ret < 0) {
21692198
if (ret != -EAGAIN)
21702199
mlog_errno(ret);
@@ -2183,7 +2212,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
21832212
* set inode->i_size at the end of a write. */
21842213
if (should_remove_suid(dentry)) {
21852214
if (meta_level == 0) {
2186-
ocfs2_inode_unlock(inode, meta_level);
2215+
ocfs2_inode_unlock_for_extent_tree(inode,
2216+
&di_bh,
2217+
meta_level,
2218+
write_sem);
21872219
meta_level = 1;
21882220
continue;
21892221
}
@@ -2197,18 +2229,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
21972229

21982230
ret = ocfs2_check_range_for_refcount(inode, pos, count);
21992231
if (ret == 1) {
2200-
ocfs2_inode_unlock(inode, meta_level);
2201-
meta_level = -1;
2202-
2203-
ret = ocfs2_prepare_inode_for_refcount(inode,
2204-
file,
2205-
pos,
2206-
count,
2207-
&meta_level);
2232+
ocfs2_inode_unlock_for_extent_tree(inode,
2233+
&di_bh,
2234+
meta_level,
2235+
write_sem);
2236+
ret = ocfs2_inode_lock_for_extent_tree(inode,
2237+
&di_bh,
2238+
meta_level,
2239+
overwrite_io,
2240+
1,
2241+
wait);
2242+
write_sem = 1;
2243+
if (ret < 0) {
2244+
if (ret != -EAGAIN)
2245+
mlog_errno(ret);
2246+
goto out;
2247+
}
2248+
2249+
cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2250+
clusters =
2251+
ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2252+
ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
22082253
}
22092254

22102255
if (ret < 0) {
2211-
mlog_errno(ret);
2256+
if (ret != -EAGAIN)
2257+
mlog_errno(ret);
22122258
goto out_unlock;
22132259
}
22142260

@@ -2219,10 +2265,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
22192265
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
22202266
pos, count, wait);
22212267

2222-
brelse(di_bh);
2223-
2224-
if (meta_level >= 0)
2225-
ocfs2_inode_unlock(inode, meta_level);
2268+
ocfs2_inode_unlock_for_extent_tree(inode,
2269+
&di_bh,
2270+
meta_level,
2271+
write_sem);
22262272

22272273
out:
22282274
return ret;

0 commit comments

Comments
 (0)