Skip to content

Commit 05a5a76

Browse files
fdmanana authored and kdave committed
Btrfs: implement full reflink support for inline extents
There are a few cases where we don't allow cloning an inline extent into the destination inode, returning -EOPNOTSUPP to user space. This was done to prevent several types of file corruption and because it's not very straightforward to deal with these cases, as they can't rely on simply copying the inline extent between leaves. Such cases require copying the inline extent's data into the respective page of the destination inode. Not supporting these cases makes it harder and more cumbersome to write applications/libraries that work on any filesystem with reflink support, since all these cases for which btrfs fails with -EOPNOTSUPP work just fine on xfs, for example. These unsupported cases are also not documented anywhere, and explaining exactly which cases fail requires a fairly technical understanding of btrfs's internals (inline extents and when and where they can exist in a file), so it's not really user friendly. Also, some test cases from fstests that use fsx, such as generic/522, can sporadically fail because they trigger one of these cases, and fsx expects all operations to succeed. This change adds support for cloning all these cases by copying the inline extent's data into the respective page of the destination inode. With this change, test case btrfs/112 from fstests fails because it expects some clone operations to fail, so it will be updated. Also, a new test case that exercises all these previously unsupported cases will be added to fstests. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
1 parent a61e1e0 commit 05a5a76

File tree

1 file changed

+187
-87
lines changed

1 file changed

+187
-87
lines changed

fs/btrfs/reflink.c

+187-87
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
// SPDX-License-Identifier: GPL-2.0
22

3+
#include <linux/blkdev.h>
34
#include <linux/iversion.h>
5+
#include "compression.h"
46
#include "ctree.h"
7+
#include "delalloc-space.h"
58
#include "reflink.h"
69
#include "transaction.h"
710

@@ -42,49 +45,131 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
4245
return ret;
4346
}
4447

48+
/*
* Copy the data of an inline extent into the page cache page of @inode that
* covers @file_offset, and mark that page dirty so that it gets written back
* later, instead of copying the inline extent item between leaves.
*
* @inode:       destination inode.
* @file_offset: destination file offset; asserted to be aligned to the
*               inode's sector size.
* @inline_data: buffer holding the source inline extent item. The payload is
*               read starting btrfs_file_extent_calc_inline_size(0) bytes into
*               this buffer.
* @size:        size of the item in @inline_data; the payload length is
*               derived from it by subtracting
*               btrfs_file_extent_calc_inline_size(0).
* @datal:       number of bytes to place in the page: the memcpy() length when
*               the extent is not compressed, and the destination length
*               passed to btrfs_decompress() otherwise.
* @comp_type:   compression type of the inline extent; BTRFS_COMPRESS_NONE
*               selects a plain memory copy.
*
* Returns 0 on success, -ENOMEM if the page could not be obtained, or the
* error returned by btrfs_delalloc_reserve_space(),
* btrfs_set_extent_delalloc() or btrfs_decompress().
*/
static int copy_inline_to_page(struct inode *inode,
49+
const u64 file_offset,
50+
char *inline_data,
51+
const u64 size,
52+
const u64 datal,
53+
const u8 comp_type)
54+
{
55+
const u64 block_size = btrfs_inode_sectorsize(inode);
56+
/* Inclusive end offset of the single block that starts at file_offset. */
const u64 range_end = file_offset + block_size - 1;
57+
/*
* NOTE(review): btrfs_file_extent_calc_inline_size(0) is presumably the size
* of the file extent item header that precedes the inline payload, making
* inline_size the payload length and data_start the payload - confirm.
*/
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
58+
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
59+
struct extent_changeset *data_reserved = NULL;
60+
struct page *page = NULL;
61+
int ret;
62+
63+
ASSERT(IS_ALIGNED(file_offset, block_size));
64+
65+
/*
66+
* We have flushed and locked the ranges of the source and destination
67+
* inodes, we also have locked the inodes, so we are safe to do a
68+
* reservation here. Also we must not do the reservation while holding
69+
* a transaction open, otherwise we would deadlock.
70+
*/
71+
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
72+
block_size);
73+
if (ret)
74+
goto out;
75+
76+
/* The page is unlocked and released again under the out_unlock label. */
page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
77+
btrfs_alloc_write_mask(inode->i_mapping));
78+
if (!page) {
79+
ret = -ENOMEM;
80+
goto out_unlock;
81+
}
82+
83+
set_page_extent_mapped(page);
84+
/*
* Clear the delalloc, accounting and defrag bits on the block's range
* before marking the range as delalloc again below.
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
85+
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
86+
0, 0, NULL);
87+
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
88+
if (ret)
89+
goto out_unlock;
90+
91+
/* Fill the start of the page: plain copy, or decompress into the page. */
if (comp_type == BTRFS_COMPRESS_NONE) {
92+
char *map;
93+
94+
map = kmap(page);
95+
memcpy(map, data_start, datal);
96+
flush_dcache_page(page);
97+
kunmap(page);
98+
} else {
99+
ret = btrfs_decompress(comp_type, data_start, page, 0,
100+
inline_size, datal);
101+
if (ret)
102+
goto out_unlock;
103+
flush_dcache_page(page);
104+
}
105+
106+
/*
107+
* If our inline data is smaller than the block/page size, then the
108+
* remainder of the block/page is equivalent to zeroes. We had something
109+
* like the following done:
110+
*
111+
* $ xfs_io -f -c "pwrite -S 0xab 0 500" file
112+
* $ sync # (or fsync)
113+
* $ xfs_io -c "falloc 0 4K" file
114+
* $ xfs_io -c "pwrite -S 0xcd 4K 4K"
115+
*
116+
* So what's in the range [500, 4095] corresponds to zeroes.
117+
*/
118+
if (datal < block_size) {
119+
char *map;
120+
121+
map = kmap(page);
122+
memset(map + datal, 0, block_size - datal);
123+
flush_dcache_page(page);
124+
kunmap(page);
125+
}
126+
127+
/* The page now holds the complete block; flag it up to date and dirty. */
SetPageUptodate(page);
128+
ClearPageChecked(page);
129+
set_page_dirty(page);
130+
out_unlock:
131+
if (page) {
132+
unlock_page(page);
133+
put_page(page);
134+
}
135+
/*
* The reserved space is given back only on failure, while
* btrfs_delalloc_release_extents() is called on success and failure alike.
* A failed reservation jumps straight to "out" and skips both calls.
*/
if (ret)
136+
btrfs_delalloc_release_space(inode, data_reserved, file_offset,
137+
block_size, true);
138+
btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
139+
out:
140+
extent_changeset_free(data_reserved);
141+
142+
return ret;
143+
}
144+
45145
/*
46-
* Make sure we do not end up inserting an inline extent into a file that has
47-
* already other (non-inline) extents. If a file has an inline extent it can
48-
* not have any other extents and the (single) inline extent must start at the
49-
* file offset 0. Failing to respect these rules will lead to file corruption,
50-
* resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
51-
*
52-
* We can have extents that have been already written to disk or we can have
53-
* dirty ranges still in delalloc, in which case the extent maps and items are
54-
* created only when we run delalloc, and the delalloc ranges might fall outside
55-
* the range we are currently locking in the inode's io tree. So we check the
56-
* inode's i_size because of that (i_size updates are done while holding the
57-
* i_mutex, which we are holding here).
58-
* We also check to see if the inode has a size not greater than "datal" but has
59-
* extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
60-
* protected against such concurrent fallocate calls by the i_mutex).
61-
*
62-
* If the file has no extents but a size greater than datal, do not allow the
63-
* copy because we would need turn the inline extent into a non-inline one (even
64-
* with NO_HOLES enabled). If we find our destination inode only has one inline
65-
* extent, just overwrite it with the source inline extent if its size is less
66-
* than the source extent's size, or we could copy the source inline extent's
67-
* data into the destination inode's inline extent if the later is greater then
68-
* the former.
146+
* Deal with cloning of inline extents. We try to copy the inline extent from
147+
* the source inode to destination inode when possible. When not possible we
148+
* copy the inline extent's data into the respective page of the inode.
69149
*/
70150
static int clone_copy_inline_extent(struct inode *dst,
71-
struct btrfs_trans_handle *trans,
72151
struct btrfs_path *path,
73152
struct btrfs_key *new_key,
74153
const u64 drop_start,
75154
const u64 datal,
76155
const u64 size,
77-
const char *inline_data)
156+
const u8 comp_type,
157+
char *inline_data,
158+
struct btrfs_trans_handle **trans_out)
78159
{
79160
struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
80161
struct btrfs_root *root = BTRFS_I(dst)->root;
81162
const u64 aligned_end = ALIGN(new_key->offset + datal,
82163
fs_info->sectorsize);
164+
struct btrfs_trans_handle *trans = NULL;
83165
int ret;
84166
struct btrfs_key key;
85167

86-
if (new_key->offset > 0)
87-
return -EOPNOTSUPP;
168+
if (new_key->offset > 0) {
169+
ret = copy_inline_to_page(dst, new_key->offset, inline_data,
170+
size, datal, comp_type);
171+
goto out;
172+
}
88173

89174
key.objectid = btrfs_ino(BTRFS_I(dst));
90175
key.type = BTRFS_EXTENT_DATA_KEY;
@@ -103,81 +188,104 @@ static int clone_copy_inline_extent(struct inode *dst,
103188
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
104189
if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
105190
key.type == BTRFS_EXTENT_DATA_KEY) {
191+
/*
192+
* There's an implicit hole at file offset 0, copy the
193+
* inline extent's data to the page.
194+
*/
106195
ASSERT(key.offset > 0);
107-
return -EOPNOTSUPP;
196+
ret = copy_inline_to_page(dst, new_key->offset,
197+
inline_data, size, datal,
198+
comp_type);
199+
goto out;
108200
}
109201
} else if (i_size_read(dst) <= datal) {
110202
struct btrfs_file_extent_item *ei;
111-
u64 ext_len;
112203

113-
/*
114-
* If the file size is <= datal, make sure there are no other
115-
* extents following (can happen do to an fallocate call with
116-
* the flag FALLOC_FL_KEEP_SIZE).
117-
*/
118204
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
119205
struct btrfs_file_extent_item);
120206
/*
121-
* If it's an inline extent, it can not have other extents
122-
* following it.
207+
* If it's an inline extent replace it with the source inline
208+
* extent, otherwise copy the source inline extent data into
209+
* the respective page at the destination inode.
123210
*/
124211
if (btrfs_file_extent_type(path->nodes[0], ei) ==
125212
BTRFS_FILE_EXTENT_INLINE)
126213
goto copy_inline_extent;
127214

128-
ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
129-
if (ext_len > aligned_end)
130-
return -EOPNOTSUPP;
131-
132-
ret = btrfs_next_item(root, path);
133-
if (ret < 0) {
134-
return ret;
135-
} else if (ret == 0) {
136-
btrfs_item_key_to_cpu(path->nodes[0], &key,
137-
path->slots[0]);
138-
if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
139-
key.type == BTRFS_EXTENT_DATA_KEY)
140-
return -EOPNOTSUPP;
141-
}
215+
ret = copy_inline_to_page(dst, new_key->offset, inline_data,
216+
size, datal, comp_type);
217+
goto out;
142218
}
143219

144220
copy_inline_extent:
221+
ret = 0;
145222
/*
146223
* We have no extent items, or we have an extent at offset 0 which may
147224
* or may not be inlined. All these cases are dealt the same way.
148225
*/
149226
if (i_size_read(dst) > datal) {
150227
/*
151-
* If the destination inode has an inline extent.
152-
* This would require copying the data from the source inline
153-
* extent into the beginning of the destination's inline extent.
154-
* But this is really complex, both extents can be compressed
155-
* or just one of them, which would require decompressing and
156-
* re-compressing data (which could increase the new compressed
157-
* size, not allowing the compressed data to fit anymore in an
158-
* inline extent).
159-
* So just don't support this case for now (it should be rare,
160-
* we are not really saving space when cloning inline extents).
228+
* At the destination offset 0 we have either a hole, a regular
229+
* extent or an inline extent larger than the one we want to
230+
* clone. Deal with all these cases by copying the inline extent
231+
* data into the respective page at the destination inode.
161232
*/
162-
return -EOPNOTSUPP;
233+
ret = copy_inline_to_page(dst, new_key->offset, inline_data,
234+
size, datal, comp_type);
235+
goto out;
163236
}
164237

165238
btrfs_release_path(path);
239+
/*
240+
* If we end up here it means we copy the inline extent into a leaf
241+
* of the destination inode. We know we will drop or adjust at most one
242+
* extent item in the destination root.
243+
*
244+
* 1 unit - adjusting old extent (we may have to split it)
245+
* 1 unit - add new extent
246+
* 1 unit - inode update
247+
*/
248+
trans = btrfs_start_transaction(root, 3);
249+
if (IS_ERR(trans)) {
250+
ret = PTR_ERR(trans);
251+
trans = NULL;
252+
goto out;
253+
}
166254
ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
167255
if (ret)
168-
return ret;
256+
goto out;
169257
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
170258
if (ret)
171-
return ret;
259+
goto out;
172260

173261
write_extent_buffer(path->nodes[0], inline_data,
174262
btrfs_item_ptr_offset(path->nodes[0],
175263
path->slots[0]),
176264
size);
177265
inode_add_bytes(dst, datal);
178266
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
267+
out:
268+
if (!ret && !trans) {
269+
/*
270+
* No transaction here means we copied the inline extent into a
271+
* page of the destination inode.
272+
*
273+
* 1 unit to update inode item
274+
*/
275+
trans = btrfs_start_transaction(root, 1);
276+
if (IS_ERR(trans)) {
277+
ret = PTR_ERR(trans);
278+
trans = NULL;
279+
}
280+
}
281+
if (ret && trans) {
282+
btrfs_abort_transaction(trans, ret);
283+
btrfs_end_transaction(trans);
284+
}
285+
if (!ret)
286+
*trans_out = trans;
179287

180-
return 0;
288+
return ret;
181289
}
182290

183291
/**
@@ -196,7 +304,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
196304
const u64 destoff, int no_time_update)
197305
{
198306
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
199-
struct btrfs_root *root = BTRFS_I(inode)->root;
200307
struct btrfs_path *path = NULL;
201308
struct extent_buffer *leaf;
202309
struct btrfs_trans_handle *trans;
@@ -233,6 +340,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
233340
struct btrfs_key new_key;
234341
u64 disko = 0, diskl = 0;
235342
u64 datao = 0, datal = 0;
343+
u8 comp;
236344
u64 drop_start;
237345

238346
/* Note the key will change type as we walk through the tree */
@@ -275,6 +383,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
275383

276384
extent = btrfs_item_ptr(leaf, slot,
277385
struct btrfs_file_extent_item);
386+
comp = btrfs_file_extent_compression(leaf, extent);
278387
type = btrfs_file_extent_type(leaf, extent);
279388
if (type == BTRFS_FILE_EXTENT_REG ||
280389
type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -369,29 +478,11 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
369478
if (key.offset != 0 || datal > fs_info->sectorsize)
370479
return -EUCLEAN;
371480

372-
/*
373-
* If our extent is inline, we know we will drop or
374-
* adjust at most 1 extent item in the destination root.
375-
*
376-
* 1 - adjusting old extent (we may have to split it)
377-
* 1 - add new extent
378-
* 1 - inode update
379-
*/
380-
trans = btrfs_start_transaction(root, 3);
381-
if (IS_ERR(trans)) {
382-
ret = PTR_ERR(trans);
383-
goto out;
384-
}
385-
386-
ret = clone_copy_inline_extent(inode, trans, path,
387-
&new_key, drop_start,
388-
datal, size, buf);
389-
if (ret) {
390-
if (ret != -EOPNOTSUPP)
391-
btrfs_abort_transaction(trans, ret);
392-
btrfs_end_transaction(trans);
481+
ret = clone_copy_inline_extent(inode, path, &new_key,
482+
drop_start, datal, size,
483+
comp, buf, &trans);
484+
if (ret)
393485
goto out;
394-
}
395486
}
396487

397488
btrfs_release_path(path);
@@ -526,6 +617,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
526617
struct inode *src = file_inode(file_src);
527618
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
528619
int ret;
620+
int wb_ret;
529621
u64 len = olen;
530622
u64 bs = fs_info->sb->s_blocksize;
531623

@@ -566,6 +658,14 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
566658
btrfs_double_extent_lock(src, off, inode, destoff, len);
567659
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
568660
btrfs_double_extent_unlock(src, off, inode, destoff, len);
661+
662+
/*
663+
* We may have copied an inline extent into a page of the destination
664+
* range, so wait for writeback to complete before truncating pages
665+
* from the page cache. This is a rare case.
666+
*/
667+
wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
668+
ret = ret ? ret : wb_ret;
569669
/*
570670
* Truncate page cache pages so that future reads will see the cloned
571671
* data immediately and not the previous data.

0 commit comments

Comments
 (0)