Skip to content

Commit 2f3d878

Browse files
ahrens authored and
Christopher Siden committed
3834 incremental replication of 'holey' file systems is slow
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
1 parent 34f2f8c commit 2f3d878

File tree

5 files changed

+110
-22
lines changed

5 files changed

+110
-22
lines changed

usr/src/uts/common/fs/zfs/dbuf.c

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727

2828
#include <sys/zfs_context.h>
2929
#include <sys/dmu.h>
30+
#include <sys/dmu_send.h>
3031
#include <sys/dmu_impl.h>
3132
#include <sys/dbuf.h>
3233
#include <sys/dmu_objset.h>
@@ -796,9 +797,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
796797
/*
797798
* Evict (if it's unreferenced) or clear (if it's referenced) any level-0
798799
* data blocks in the free range, so that any future readers will find
799-
* empty blocks. Also, if we happen accross any level-1 dbufs in the
800+
* empty blocks. Also, if we happen across any level-1 dbufs in the
800801
* range that have not already been marked dirty, mark them dirty so
801802
* they stay in memory.
803+
*
804+
* This is a no-op if the dataset is in the middle of an incremental
805+
* receive; see comment below for details.
802806
*/
803807
void
804808
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
@@ -814,6 +818,20 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
814818
last_l1 = end >> epbs;
815819
}
816820
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
821+
822+
if (dmu_objset_is_receiving(dn->dn_objset)) {
823+
/*
824+
* When processing a free record from a zfs receive,
825+
* there should have been no previous modifications to the
826+
* data in this range. Therefore there should be no dbufs
827+
* in the range. Searching dn_dbufs for these non-existent
828+
* dbufs can be very expensive, so simply ignore this.
829+
*/
830+
VERIFY3P(dbuf_find(dn, 0, start), ==, NULL);
831+
VERIFY3P(dbuf_find(dn, 0, end), ==, NULL);
832+
return;
833+
}
834+
817835
mutex_enter(&dn->dn_dbufs_mtx);
818836
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
819837
db_next = list_next(&dn->dn_dbufs, db);

usr/src/uts/common/fs/zfs/dmu_send.c

Lines changed: 52 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,32 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
8080
{
8181
struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
8282

83+
/*
84+
* When we receive a free record, dbuf_free_range() assumes
85+
* that the receiving system doesn't have any dbufs in the range
86+
* being freed. This is always true because there is a one-record
87+
* constraint: we only send one WRITE record for any given
88+
* object+offset. We know that the one-record constraint is
89+
* true because we always send data in increasing order by
90+
* object,offset.
91+
*
92+
* If the increasing-order constraint ever changes, we should find
93+
* another way to assert that the one-record constraint is still
94+
* satisfied.
95+
*/
96+
ASSERT(object > dsp->dsa_last_data_object ||
97+
(object == dsp->dsa_last_data_object &&
98+
offset > dsp->dsa_last_data_offset));
99+
100+
/*
101+
* If we are doing a non-incremental send, then there can't
102+
* be any data in the dataset we're receiving into. Therefore
103+
* a free record would simply be a no-op. Save space by not
104+
* sending it to begin with.
105+
*/
106+
if (!dsp->dsa_incremental)
107+
return (0);
108+
83109
if (length != -1ULL && offset + length < offset)
84110
length = -1ULL;
85111

@@ -146,6 +172,15 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
146172
{
147173
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
148174

175+
/*
176+
* We send data in increasing object, offset order.
177+
* See comment in dump_free() for details.
178+
*/
179+
ASSERT(object > dsp->dsa_last_data_object ||
180+
(object == dsp->dsa_last_data_object &&
181+
offset > dsp->dsa_last_data_offset));
182+
dsp->dsa_last_data_object = object;
183+
dsp->dsa_last_data_offset = offset + blksz - 1;
149184

150185
/*
151186
* If there is any kind of pending aggregation (currently either
@@ -213,6 +248,10 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
213248
{
214249
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
215250

251+
/* See comment in dump_free(). */
252+
if (!dsp->dsa_incremental)
253+
return (0);
254+
216255
/*
217256
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
218257
* push it out, since free block aggregation can only be done for
@@ -289,9 +328,9 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
289328
if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
290329
return (SET_ERROR(EINTR));
291330

292-
/* free anything past the end of the file */
331+
/* Free anything past the end of the file. */
293332
if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
294-
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
333+
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
295334
return (SET_ERROR(EINTR));
296335
if (dsp->dsa_err != 0)
297336
return (SET_ERROR(EINTR));
@@ -474,6 +513,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
474513
dsp->dsa_toguid = ds->ds_phys->ds_guid;
475514
ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
476515
dsp->dsa_pending_op = PENDING_NONE;
516+
dsp->dsa_incremental = (fromtxg != 0);
477517

478518
mutex_enter(&ds->ds_sendstream_lock);
479519
list_insert_head(&ds->ds_sendstreams, dsp);
@@ -1765,3 +1805,13 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
17651805
else
17661806
return (dmu_recv_existing_end(drc));
17671807
}
1808+
1809+
/*
1810+
* Return TRUE if this objset is currently being received into.
*
* The objset counts as "receiving" only when it has a backing dataset
* (os->os_dsl_dataset is non-NULL) and that dataset's ds_owner is
* dmu_recv_tag. An objset with no dataset, or one owned under any other
* tag, yields a false result.
*
* NOTE(review): dmu_recv_tag is presumably the tag the receive code uses
* when it takes ownership of the target dataset; confirm at its definition.
1811+
*/
1812+
boolean_t
1813+
dmu_objset_is_receiving(objset_t *os)
1814+
{
1815+
return (os->os_dsl_dataset != NULL &&
1816+
os->os_dsl_dataset->ds_owner == dmu_recv_tag);
1817+
}

usr/src/uts/common/fs/zfs/dmu_tx.c

Lines changed: 31 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -587,8 +587,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
587587
{
588588
dmu_tx_hold_t *txh;
589589
dnode_t *dn;
590-
uint64_t start, end, i;
591-
int err, shift;
590+
int err;
592591
zio_t *zio;
593592

594593
ASSERT(tx->tx_txg == 0);
@@ -599,34 +598,48 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
599598
return;
600599
dn = txh->txh_dnode;
601600

602-
/* first block */
603-
if (off != 0)
604-
dmu_tx_count_write(txh, off, 1);
605-
/* last block */
606-
if (len != DMU_OBJECT_END)
607-
dmu_tx_count_write(txh, off+len, 1);
608-
609-
dmu_tx_count_dnode(txh);
610-
611601
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
612602
return;
613603
if (len == DMU_OBJECT_END)
614604
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
615605

606+
dmu_tx_count_dnode(txh);
607+
616608
/*
617-
* For i/o error checking, read the first and last level-0
618-
* blocks, and all the level-1 blocks. The above count_write's
619-
* have already taken care of the level-0 blocks.
609+
* For i/o error checking, we read the first and last level-0
610+
* blocks if they are not aligned, and all the level-1 blocks.
611+
*
612+
* Note: dbuf_free_range() assumes that we have not instantiated
613+
* any level-0 dbufs that will be completely freed. Therefore we must
614+
* exercise care to not read or count the first and last blocks
615+
* if they are blocksize-aligned.
616+
*/
617+
if (dn->dn_datablkshift == 0) {
618+
dmu_tx_count_write(txh, off, len);
619+
} else {
620+
/* first block will be modified if it is not aligned */
621+
if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
622+
dmu_tx_count_write(txh, off, 1);
623+
/* last block will be modified if it is not aligned */
624+
if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
625+
dmu_tx_count_write(txh, off+len, 1);
626+
}
627+
628+
/*
629+
* Check level-1 blocks.
620630
*/
621631
if (dn->dn_nlevels > 1) {
622-
shift = dn->dn_datablkshift + dn->dn_indblkshift -
632+
int shift = dn->dn_datablkshift + dn->dn_indblkshift -
623633
SPA_BLKPTRSHIFT;
624-
start = off >> shift;
625-
end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
634+
uint64_t start = off >> shift;
635+
uint64_t end = (off + len) >> shift;
636+
637+
ASSERT(dn->dn_datablkshift != 0);
638+
ASSERT(dn->dn_indblkshift != 0);
626639

627640
zio = zio_root(tx->tx_pool->dp_spa,
628641
NULL, NULL, ZIO_FLAG_CANFAIL);
629-
for (i = start; i <= end; i++) {
642+
for (uint64_t i = start; i <= end; i++) {
630643
uint64_t ibyte = i << shift;
631644
err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
632645
i = ibyte >> shift;

usr/src/uts/common/fs/zfs/sys/dmu_impl.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,10 @@
2121
/*
2222
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
2323
* Use is subject to license terms.
24+
*/
25+
/*
2426
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
27+
* Copyright (c) 2013 by Delphix. All rights reserved.
2528
*/
2629

2730
#ifndef _SYS_DMU_IMPL_H
@@ -290,6 +293,9 @@ typedef struct dmu_sendarg {
290293
uint64_t dsa_toguid;
291294
int dsa_err;
292295
dmu_pendop_t dsa_pending_op;
296+
boolean_t dsa_incremental;
297+
uint64_t dsa_last_data_object;
298+
uint64_t dsa_last_data_offset;
293299
} dmu_sendarg_t;
294300

295301

usr/src/uts/common/fs/zfs/sys/dmu_send.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@
2121

2222
/*
2323
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24-
* Copyright (c) 2012 by Delphix. All rights reserved.
24+
* Copyright (c) 2013 by Delphix. All rights reserved.
2525
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
2626
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
2727
*/
@@ -63,5 +63,6 @@ int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
6363
int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
6464
int cleanup_fd, uint64_t *action_handlep);
6565
int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
66+
boolean_t dmu_objset_is_receiving(objset_t *os);
6667

6768
#endif /* _DMU_SEND_H */

0 commit comments

Comments
 (0)