 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/blkdev.h>
 #include <linux/iversion.h>
+#include "compression.h"
 #include "ctree.h"
+#include "delalloc-space.h"
 #include "reflink.h"
 #include "transaction.h"
 
@@ -42,49 +45,131 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int copy_inline_to_page(struct inode *inode,
+			       const u64 file_offset,
+			       char *inline_data,
+			       const u64 size,
+			       const u64 datal,
+			       const u8 comp_type)
+{
+	const u64 block_size = btrfs_inode_sectorsize(inode);
+	const u64 range_end = file_offset + block_size - 1;
+	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
+	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
+	struct extent_changeset *data_reserved = NULL;
+	struct page *page = NULL;
+	int ret;
+
+	ASSERT(IS_ALIGNED(file_offset, block_size));
+
+	/*
+	 * We have flushed and locked the ranges of the source and destination
+	 * inodes, we also have locked the inodes, so we are safe to do a
+	 * reservation here. Also we must not do the reservation while holding
+	 * a transaction open, otherwise we would deadlock.
+	 */
+	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+					   block_size);
+	if (ret)
+		goto out;
+
+	page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
+				   btrfs_alloc_write_mask(inode->i_mapping));
+	if (!page) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	set_page_extent_mapped(page);
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+			 0, 0, NULL);
+	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+	if (ret)
+		goto out_unlock;
+
+	if (comp_type == BTRFS_COMPRESS_NONE) {
+		char *map;
+
+		map = kmap(page);
+		memcpy(map, data_start, datal);
+		flush_dcache_page(page);
+		kunmap(page);
+	} else {
+		ret = btrfs_decompress(comp_type, data_start, page, 0,
+				       inline_size, datal);
+		if (ret)
+			goto out_unlock;
+		flush_dcache_page(page);
+	}
+
+	/*
+	 * If our inline data is smaller than the block/page size, then the
+	 * remainder of the block/page is equivalent to zeroes. We had something
+	 * like the following done:
+	 *
+	 * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
+	 * $ sync # (or fsync)
+	 * $ xfs_io -c "falloc 0 4K" file
+	 * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
+	 *
+	 * So what's in the range [500, 4095] corresponds to zeroes.
+	 */
+	if (datal < block_size) {
+		char *map;
+
+		map = kmap(page);
+		memset(map + datal, 0, block_size - datal);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+
+	SetPageUptodate(page);
+	ClearPageChecked(page);
+	set_page_dirty(page);
+out_unlock:
+	if (page) {
+		unlock_page(page);
+		put_page(page);
+	}
+	if (ret)
+		btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+					     block_size, true);
+	btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+out:
+	extent_changeset_free(data_reserved);
+
+	return ret;
+}
+
 /*
- * Make sure we do not end up inserting an inline extent into a file that has
- * already other (non-inline) extents. If a file has an inline extent it can
- * not have any other extents and the (single) inline extent must start at the
- * file offset 0. Failing to respect these rules will lead to file corruption,
- * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
- *
- * We can have extents that have been already written to disk or we can have
- * dirty ranges still in delalloc, in which case the extent maps and items are
- * created only when we run delalloc, and the delalloc ranges might fall outside
- * the range we are currently locking in the inode's io tree. So we check the
- * inode's i_size because of that (i_size updates are done while holding the
- * i_mutex, which we are holding here).
- * We also check to see if the inode has a size not greater than "datal" but has
- * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
- * protected against such concurrent fallocate calls by the i_mutex).
- *
- * If the file has no extents but a size greater than datal, do not allow the
- * copy because we would need turn the inline extent into a non-inline one (even
- * with NO_HOLES enabled). If we find our destination inode only has one inline
- * extent, just overwrite it with the source inline extent if its size is less
- * than the source extent's size, or we could copy the source inline extent's
- * data into the destination inode's inline extent if the later is greater then
- * the former.
+ * Deal with cloning of inline extents. We try to copy the inline extent from
+ * the source inode to destination inode when possible. When not possible we
+ * copy the inline extent's data into the respective page of the inode.
 */
 static int clone_copy_inline_extent(struct inode *dst,
-				    struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
 				    struct btrfs_key *new_key,
 				    const u64 drop_start,
 				    const u64 datal,
 				    const u64 size,
-				    const char *inline_data)
+				    const u8 comp_type,
+				    char *inline_data,
+				    struct btrfs_trans_handle **trans_out)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
 	struct btrfs_root *root = BTRFS_I(dst)->root;
 	const u64 aligned_end = ALIGN(new_key->offset + datal,
 				      fs_info->sectorsize);
+	struct btrfs_trans_handle *trans = NULL;
 	int ret;
 	struct btrfs_key key;
 
-	if (new_key->offset > 0)
-		return -EOPNOTSUPP;
+	if (new_key->offset > 0) {
+		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+					  size, datal, comp_type);
+		goto out;
+	}
 
 	key.objectid = btrfs_ino(BTRFS_I(dst));
 	key.type = BTRFS_EXTENT_DATA_KEY;
@@ -103,81 +188,104 @@ static int clone_copy_inline_extent(struct inode *dst,
 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
 		    key.type == BTRFS_EXTENT_DATA_KEY) {
+			/*
+			 * There's an implicit hole at file offset 0, copy the
+			 * inline extent's data to the page.
+			 */
 			ASSERT(key.offset > 0);
-			return -EOPNOTSUPP;
+			ret = copy_inline_to_page(dst, new_key->offset,
+						  inline_data, size, datal,
+						  comp_type);
+			goto out;
 		}
 	} else if (i_size_read(dst) <= datal) {
 		struct btrfs_file_extent_item *ei;
-		u64 ext_len;
 
-		/*
-		 * If the file size is <= datal, make sure there are no other
-		 * extents following (can happen do to an fallocate call with
-		 * the flag FALLOC_FL_KEEP_SIZE).
-		 */
 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				    struct btrfs_file_extent_item);
 		/*
-		 * If it's an inline extent, it can not have other extents
-		 * following it.
+		 * If it's an inline extent replace it with the source inline
+		 * extent, otherwise copy the source inline extent data into
+		 * the respective page at the destination inode.
 		 */
 		if (btrfs_file_extent_type(path->nodes[0], ei) ==
 		    BTRFS_FILE_EXTENT_INLINE)
 			goto copy_inline_extent;
 
-		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
-		if (ext_len > aligned_end)
-			return -EOPNOTSUPP;
-
-		ret = btrfs_next_item(root, path);
-		if (ret < 0) {
-			return ret;
-		} else if (ret == 0) {
-			btrfs_item_key_to_cpu(path->nodes[0], &key,
-					      path->slots[0]);
-			if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
-			    key.type == BTRFS_EXTENT_DATA_KEY)
-				return -EOPNOTSUPP;
-		}
+		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+					  size, datal, comp_type);
+		goto out;
 	}
 
 copy_inline_extent:
+	ret = 0;
 	/*
 	 * We have no extent items, or we have an extent at offset 0 which may
 	 * or may not be inlined. All these cases are dealt the same way.
 	 */
 	if (i_size_read(dst) > datal) {
 		/*
-		 * If the destination inode has an inline extent.
-		 * This would require copying the data from the source inline
-		 * extent into the beginning of the destination's inline extent.
-		 * But this is really complex, both extents can be compressed
-		 * or just one of them, which would require decompressing and
-		 * re-compressing data (which could increase the new compressed
-		 * size, not allowing the compressed data to fit anymore in an
-		 * inline extent).
-		 * So just don't support this case for now (it should be rare,
-		 * we are not really saving space when cloning inline extents).
+		 * At the destination offset 0 we have either a hole, a regular
+		 * extent or an inline extent larger than the one we want to
+		 * clone. Deal with all these cases by copying the inline extent
+		 * data into the respective page at the destination inode.
 		 */
-		return -EOPNOTSUPP;
+		ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+					  size, datal, comp_type);
+		goto out;
 	}
 
 	btrfs_release_path(path);
+	/*
+	 * If we end up here it means we are copying the inline extent into a
+	 * leaf of the destination inode. We know we will drop or adjust at
+	 * most one extent item in the destination root.
+	 *
+	 * 1 unit - adjusting old extent (we may have to split it)
+	 * 1 unit - add new extent
+	 * 1 unit - inode update
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out;
+	}
 	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
 	if (ret)
-		return ret;
+		goto out;
 	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
 	if (ret)
-		return ret;
+		goto out;
 
 	write_extent_buffer(path->nodes[0], inline_data,
 			    btrfs_item_ptr_offset(path->nodes[0],
 						  path->slots[0]),
 			    size);
 	inode_add_bytes(dst, datal);
 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+out:
+	if (!ret && !trans) {
+		/*
+		 * No transaction here means we copied the inline extent into a
+		 * page of the destination inode.
+		 *
+		 * 1 unit to update inode item
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+		}
+	}
+	if (ret && trans) {
+		btrfs_abort_transaction(trans, ret);
+		btrfs_end_transaction(trans);
+	}
+	if (!ret)
+		*trans_out = trans;
 
-	return 0;
+	return ret;
 }
 
 /**
@@ -196,7 +304,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		       const u64 destoff, int no_time_update)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path = NULL;
 	struct extent_buffer *leaf;
 	struct btrfs_trans_handle *trans;
@@ -233,6 +340,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		struct btrfs_key new_key;
 		u64 disko = 0, diskl = 0;
 		u64 datao = 0, datal = 0;
+		u8 comp;
 		u64 drop_start;
 
 		/* Note the key will change type as we walk through the tree */
@@ -275,6 +383,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 
 		extent = btrfs_item_ptr(leaf, slot,
 					struct btrfs_file_extent_item);
+		comp = btrfs_file_extent_compression(leaf, extent);
 		type = btrfs_file_extent_type(leaf, extent);
 		if (type == BTRFS_FILE_EXTENT_REG ||
 		    type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -369,29 +478,11 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 			if (key.offset != 0 || datal > fs_info->sectorsize)
 				return -EUCLEAN;
 
-			/*
-			 * If our extent is inline, we know we will drop or
-			 * adjust at most 1 extent item in the destination root.
-			 *
-			 * 1 - adjusting old extent (we may have to split it)
-			 * 1 - add new extent
-			 * 1 - inode update
-			 */
-			trans = btrfs_start_transaction(root, 3);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-				goto out;
-			}
-
-			ret = clone_copy_inline_extent(inode, trans, path,
-						       &new_key, drop_start,
-						       datal, size, buf);
-			if (ret) {
-				if (ret != -EOPNOTSUPP)
-					btrfs_abort_transaction(trans, ret);
-				btrfs_end_transaction(trans);
+			ret = clone_copy_inline_extent(inode, path, &new_key,
+						       drop_start, datal, size,
+						       comp, buf, &trans);
+			if (ret)
 				goto out;
-			}
 		}
 
 		btrfs_release_path(path);
@@ -526,6 +617,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	struct inode *src = file_inode(file_src);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int ret;
+	int wb_ret;
 	u64 len = olen;
 	u64 bs = fs_info->sb->s_blocksize;
 
@@ -566,6 +658,14 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	btrfs_double_extent_lock(src, off, inode, destoff, len);
 	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
 	btrfs_double_extent_unlock(src, off, inode, destoff, len);
+
+	/*
+	 * We may have copied an inline extent into a page of the destination
+	 * range, so wait for writeback to complete before truncating pages
+	 * from the page cache. This is a rare case.
+	 */
+	wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+	ret = ret ? ret : wb_ret;
 	/*
 	 * Truncate page cache pages so that future reads will see the cloned
 	 * data immediately and not the previous data.
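For illustration, here is a minimal user-space sketch (not part of the patch) of the case the new copy_inline_to_page() path handles: a file whose only extent is inline gets cloned to a non-zero, block-aligned offset of another file via FICLONERANGE. The paths under /mnt/btrfs, the 500-byte write and the 4096-byte destination offset (assuming a 4 KiB block size) are assumptions for the example; the removed code above rejected this combination with -EOPNOTSUPP, while the patched code copies the inline data into a page of the destination inode.

/*
 * Illustrative sketch only: clone an inline extent into a non-zero
 * offset of another file on the same btrfs mount. Paths and sizes are
 * assumptions, not values taken from the patch.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FICLONERANGE, struct file_clone_range */

int main(void)
{
	struct file_clone_range fcr;
	char buf[500];
	int src, dst;

	memset(buf, 0xab, sizeof(buf));

	src = open("/mnt/btrfs/src", O_RDWR | O_CREAT, 0644);
	dst = open("/mnt/btrfs/dst", O_RDWR | O_CREAT, 0644);
	if (src < 0 || dst < 0)
		return 1;

	/* A write this small is normally stored as an inline extent. */
	if (pwrite(src, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf))
		return 1;
	fsync(src);	/* flush the buffered write to disk */

	/* Clone the whole source file to a block-aligned destination offset. */
	memset(&fcr, 0, sizeof(fcr));
	fcr.src_fd = src;
	fcr.src_offset = 0;
	fcr.src_length = 0;	/* 0 means "up to source EOF" */
	fcr.dest_offset = 4096;
	if (ioctl(dst, FICLONERANGE, &fcr) == -1)
		perror("FICLONERANGE");

	close(src);
	close(dst);
	return 0;
}

With the patch applied, reading back the destination should return the 0xab bytes in the range [4096, 4596) and zeroes for the rest of that block, matching the zero-filling copy_inline_to_page() does when the inline data is smaller than the block size.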