From 755c7bf5ddcaf88d7f39bbd702c6c082cef5b8a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Nov 2016 11:55:48 +1100 Subject: [PATCH 01/63] libxfs: convert ushort to unsigned short Since xfsprogs dropped ushort in favor of unsigned short, do that here too. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_ialloc.c | 5 +++-- fs/xfs/libxfs/xfs_inode_buf.h | 4 ++-- fs/xfs/libxfs/xfs_log_format.h | 4 ++-- fs/xfs/libxfs/xfs_log_recover.h | 2 +- fs/xfs/xfs_log_recover.c | 4 ++-- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 51b4e0de1fdc42..c507c1b17ca186 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2344,7 +2344,8 @@ xfs_imap( imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << + mp->m_sb.sb_inodelog); return 0; } @@ -2372,7 +2373,7 @@ xfs_imap( imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* * If the inode number maps to a block outside the bounds diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 3cfe12a4f58ac8..6848a0afbce7a4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -58,8 +58,8 @@ struct xfs_icdinode { */ struct xfs_imap { xfs_daddr_t im_blkno; /* starting BB of inode chunk */ - ushort im_len; /* length in BBs of inode chunk */ - ushort im_boffset; /* inode offset in block in bytes */ + unsigned short im_len; /* length in BBs of inode chunk */ + unsigned short im_boffset; /* inode offset in block in bytes */ }; int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 083cdd6d6c28ce..7ae571f8e34ac7 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -481,8 +481,8 @@ static inline uint xfs_log_dinode_size(int version) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ + unsigned short blf_flags; /* misc state */ + unsigned short blf_len; /* number of blocks in this buf */ __int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 8e385f91d66023..d9f65e2d5cc818 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -52,7 +52,7 @@ typedef struct xlog_recover { struct list_head r_itemq; /* q for items */ } xlog_recover_t; -#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) +#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) /* * This is the number of entries in the l_buf_cancel_table used during diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d9c..cf754bcbcb1caa 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2025,7 +2025,7 @@ xlog_peek_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct list_head *bucket; struct xfs_buf_cancel *bcp; @@ -2065,7 +2065,7 @@ xlog_check_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct xfs_buf_cancel *bcp; From 420fbeb4bff483d89dd8de9242b8fd83e2eb3527 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Nov 2016 11:56:06 +1100 Subject: [PATCH 02/63] libxfs: synchronize dinode_verify with userspace The userspace version of _dinode_verify takes a raw inode number instead of an inode itself. Since neither version actually needs the inode, port the changes to the kernel. This will also reduce the libxfs diff noise. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 134424fac434fd..54817f82212cfe 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -383,7 +383,7 @@ xfs_log_dinode_to_disk( static bool xfs_dinode_verify( struct xfs_mount *mp, - struct xfs_inode *ip, + xfs_ino_t ino, struct xfs_dinode *dip) { uint16_t flags; @@ -401,7 +401,7 @@ xfs_dinode_verify( if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) return false; - if (be64_to_cpu(dip->di_ino) != ip->i_ino) + if (be64_to_cpu(dip->di_ino) != ino) return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; @@ -493,7 +493,7 @@ xfs_iread( return error; /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip, dip)) { + if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { xfs_alert(mp, "%s: validation failed for inode %lld failed", __func__, ip->i_ino); From 68c098582b20ddae04ed04d173d915f5ca64be5e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Nov 2016 11:56:13 +1100 Subject: [PATCH 03/63] libxfs: fix whitespace problems Fix some whitespace problems that trip up my libxfs-diff script. Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_rtbitmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e2e1106c9fadc9..ea45584a9913bb 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1016,4 +1016,3 @@ xfs_rtfree_extent( } return 0; } - From ae90b994b40f7c724701877c27b9148ee262ba62 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Nov 2016 11:56:20 +1100 Subject: [PATCH 04/63] libxfs: fix xfs_attr_shortform_bytesfit declaration Change the xfs_attr_shortform_bytesfit declaration to have struct xfs_inode to avoid tripping up the libxfs-diff scanner. Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 4f2aed04f8273b..8ef420a16f0819 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -51,7 +51,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); -int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); +int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* From 523b2e76e3ecb54e0ec8651e32291bdaefc5f866 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Nov 2016 11:56:51 +1100 Subject: [PATCH 05/63] libxfs: clean up _dir2_data_freescan Refactor the implementations of xfs_dir2_data_freescan into a routine that takes the raw directory block parameters and a second function that figures out the raw parameters from the directory inode. This enables us to use the exact same code for both userspace and the kernel, since repair knows exactly which directory block geometry parameters it needs. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2.h | 3 +++ fs/xfs/libxfs/xfs_dir2_data.c | 24 +++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index becc926c3e3d90..b55db61bd913ca 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -157,6 +157,9 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); +extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, + struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 725fc7841fdeb3..cd75ab9f3bf85c 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -505,8 +505,9 @@ xfs_dir2_data_freeremove( * Given a data block, reconstruct its bestfree map. */ void -xfs_dir2_data_freescan( - struct xfs_inode *dp, +xfs_dir2_data_freescan_int( + struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, struct xfs_dir2_data_hdr *hdr, int *loghead) { @@ -516,7 +517,6 @@ xfs_dir2_data_freescan( struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ - struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -526,13 +526,13 @@ xfs_dir2_data_freescan( /* * Start by clearing the table. */ - bf = dp->d_ops->data_bestfree_p(hdr); + bf = ops->data_bestfree_p(hdr); memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)dp->d_ops->data_entry_p(hdr); + p = (char *)ops->data_entry_p(hdr); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { btp = xfs_dir2_block_tail_p(geo, hdr); @@ -559,12 +559,22 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); - p += dp->d_ops->data_entsize(dep->namelen); + be16_to_cpu(*ops->data_entry_tag_p(dep))); + p += ops->data_entsize(dep->namelen); } } } +void +xfs_dir2_data_freescan( + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) +{ + return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops, + hdr, loghead); +} + /* * Initialize a data block at the given block number in the directory. * Give back the buffer for the created block. From e6fc6fcf4447c9266038c55c25e4c7c14bee110c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 8 Nov 2016 11:58:55 +1100 Subject: [PATCH 06/63] xfs: don't call xfs_sb_quota_from_disk twice Source xfsprogs commit: ee3754254e8c186c99b6cdd4d59f741759d04acb Kernel commit 5ef828c4 ("xfs: avoid false quotacheck after unclean shutdown") made xfs_sb_from_disk() also call xfs_sb_quota_from_disk by default. However, when this was merged to libxfs, existing separate calls to libxfs_sb_quota_from_disk remained, and calling it twice in a row on a V4 superblock leads to issues, because: if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { ... sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; and after the second call, we have set both pquotino and gquotino to NULLFSINO. Fix this by making it safe to call twice, and also remove the extra calls to libxfs_sb_quota_from_disk. This is only spotted when running xfstests with "-m crc=0" because the sb_from_disk change came about after V5 became default, and the above behavior only exists on a V4 superblock. Reported-by: Eryu Guan Signed-off-by: Eric Sandeen Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a70aec9106263f..7a392402f2f815 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -338,13 +338,16 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); - if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + if (sbp->sb_qflags & XFS_PQUOTA_ACCT && + sbp->sb_gquotino != NULLFSINO) { /* * In older version of superblock, on-disk superblock only * has sb_gquotino, and in-core superblock has both sb_gquotino * and sb_pquotino. But, only one of them is supported at any * point of time. So, if PQUOTA is set in disk superblock, - * copy over sb_gquotino to sb_pquotino. + * copy over sb_gquotino to sb_pquotino. The NULLFSINO test + * above is to make sure we don't do this twice and wipe them + * both out! */ sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; From 5e52365ac86394ce4e5e2305ef4e66bd05cbf120 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Nov 2016 11:59:12 +1100 Subject: [PATCH 07/63] xfs: move dir_ino_validate declaration per xfsprogs Move the declaration of _dir_ino_validate out of the private dir2 header file into the public one, since xfsprogs did that for the benefit of xfs_repair. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2.h | 2 ++ fs/xfs/libxfs/xfs_dir2_priv.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index b55db61bd913ca..0197590fa7d7c0 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -180,6 +180,8 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup); +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index ef9f6ead96a469..d04547fcf274af 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -21,7 +21,6 @@ struct dir_context; /* xfs_dir2.c */ -extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, From 4fd29ec47212c8cbf98916af519019ccc5e58e49 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Nov 2016 11:59:26 +1100 Subject: [PATCH 08/63] xfs: check return value of _trans_reserve_quota_nblks Check the return value of xfs_trans_reserve_quota_nblks for errors. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c6eb21940783e4..71dd6d78371001 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4902,8 +4902,11 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0, + error = xfs_trans_reserve_quota_nblks(NULL, ip, + -((long)del->br_blockcount), 0, isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; ip->i_delayed_blks -= del->br_blockcount; if (whichfork == XFS_COW_FORK) From 399372349a7f9b2d7e56e4fa4467c69822d07024 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 8 Nov 2016 12:53:33 +1100 Subject: [PATCH 09/63] xfs: don't skip cow forks w/ delalloc blocks in cowblocks scan The cowblocks background scanner currently clears the cowblocks tag for inodes without any real allocations in the cow fork. This excludes inodes with only delalloc blocks in the cow fork. While we might never expect to clear delalloc blocks from the cow fork in the background scanner, it is not necessarily correct to clear the cowblocks tag from such inodes. For example, if the background scanner happens to process an inode between a buffered write and writeback, the scanner catches the inode in a state after delalloc blocks have been allocated to the cow fork but before the delalloc blocks have been converted to real blocks by writeback. The background scanner then incorrectly clears the cowblocks tag, even if part of the aforementioned delalloc reservation will not be remapped to the data fork (i.e., extra blocks due to the cowextsize hint). This means that any such additional blocks in the cow fork might never be reclaimed by the background scanner and could persist until the inode itself is reclaimed. To address this problem, only skip and clear inodes without any cow fork allocations whatsoever from the background scanner. While we generally do not want to cancel delalloc reservations from the background scanner, the pagecache dirty check following the cowblocks check should prevent that situation. If we do end up with delalloc cow fork blocks without a dirty address space mapping, this is probably an indication that something has gone wrong and the blocks should be reclaimed, as they may never be converted to a real allocation. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 7 ++++++- fs/xfs/xfs_reflink.c | 34 ---------------------------------- fs/xfs/xfs_reflink.h | 2 -- 3 files changed, 6 insertions(+), 37 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f295049db68159..1b4861f5d3d8ed 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1580,10 +1580,15 @@ xfs_inode_free_cowblocks( struct xfs_eofblocks *eofb = args; bool need_iolock = true; int match; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_reflink_has_real_cow_blocks(ip)) { + /* + * Just clear the tag if we have an empty cow fork or none at all. It's + * possible the inode was fully unshared since it was originally tagged. + */ + if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) { trace_xfs_inode_free_cowblocks_invalid(ip); xfs_inode_clear_cowblocks_tag(ip); return 0; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index a279b4e7f5feaa..c069048932022f 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1697,37 +1697,3 @@ xfs_reflink_unshare( trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } - -/* - * Does this inode have any real CoW reservations? - */ -bool -xfs_reflink_has_real_cow_blocks( - struct xfs_inode *ip) -{ - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_extnum_t idx; - - if (!xfs_is_reflink_inode(ip)) - return false; - - /* Go find the old extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); - while (gotp) { - xfs_bmbt_get_all(gotp, &irec); - - if (!isnullstartblock(irec.br_startblock)) - return true; - - /* Roll on... */ - idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - break; - gotp = xfs_iext_get_ext(ifp, idx); - } - - return false; -} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index fad11607c9adf3..97ea9b487884e3 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -50,6 +50,4 @@ extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); -extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip); - #endif /* __XFS_REFLINK_H */ From 04197b341f23b908193308b8d63d17ff23232598 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 8 Nov 2016 12:54:14 +1100 Subject: [PATCH 10/63] xfs: don't BUG() on mixed direct and mapped I/O We've had reports of generic/095 causing XFS to BUG() in __xfs_get_blocks() due to the existence of delalloc blocks on a direct I/O read. generic/095 issues a mix of various types of I/O, including direct and memory mapped I/O to a single file. This is clearly not supported behavior and is known to lead to such problems. E.g., the lack of exclusion between the direct I/O and write fault paths means that a write fault can allocate delalloc blocks in a region of a file that was previously a hole after the direct read has attempted to flush/inval the file range, but before it actually reads the block mapping. In turn, the direct read discovers a delalloc extent and cannot proceed. While the appropriate solution here is to not mix direct and memory mapped I/O to the same regions of the same file, the current BUG_ON() behavior is probably overkill as it can crash the entire system. Instead, localize the failure to the I/O in question by returning an error for a direct I/O that cannot be handled safely due to delalloc blocks. Be careful to allow the case of a direct write to post-eof delalloc blocks. This can occur due to speculative preallocation and is safe as post-eof blocks are not accompanied by dirty pages in pagecache (conversely, preallocation within eof must have been zeroed, and thus dirtied, before the inode size could have been increased beyond said blocks). Finally, provide an additional warning if a direct I/O write occurs while the file is memory mapped. This may not catch all problematic scenarios, but provides a hint that some known-to-be-problematic I/O methods are in use. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3e57a56cf82947..2693ba84ec2541 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1361,6 +1361,26 @@ __xfs_get_blocks( if (error) goto out_unlock; + /* + * The only time we can ever safely find delalloc blocks on direct I/O + * is a dio write to post-eof speculative preallocation. All other + * scenarios are indicative of a problem or misuse (such as mixing + * direct and mapped I/O). + * + * The file may be unmapped by the time we get here so we cannot + * reliably fail the I/O based on mapping. Instead, fail the I/O if this + * is a read or a write within eof. Otherwise, carry on but warn as a + * precuation if the file happens to be mapped. + */ + if (direct && imap.br_startblock == DELAYSTARTBLOCK) { + if (!create || offset < i_size_read(VFS_I(ip))) { + WARN_ON_ONCE(1); + error = -EIO; + goto out_unlock; + } + WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping)); + } + /* for DAX, we convert unwritten extents directly */ if (create && (!nimaps || @@ -1450,8 +1470,6 @@ __xfs_get_blocks( (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); - return 0; out_unlock: From 4dfce57db6354603641132fac3c887614e3ebe81 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 8 Nov 2016 12:55:18 +1100 Subject: [PATCH 11/63] xfs: fix up xfs_swap_extent_forks inline extent handling There have been several reports over the years of NULL pointer dereferences in xfs_trans_log_inode during xfs_fsr processes, when the process is doing an fput and tearing down extents on the temporary inode, something like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 PID: 29439 TASK: ffff880550584fa0 CPU: 6 COMMAND: "xfs_fsr" [exception RIP: xfs_trans_log_inode+0x10] #9 [ffff8800a57bbbe0] xfs_bunmapi at ffffffffa037398e [xfs] #10 [ffff8800a57bbce8] xfs_itruncate_extents at ffffffffa0391b29 [xfs] #11 [ffff8800a57bbd88] xfs_inactive_truncate at ffffffffa0391d0c [xfs] #12 [ffff8800a57bbdb8] xfs_inactive at ffffffffa0392508 [xfs] #13 [ffff8800a57bbdd8] xfs_fs_evict_inode at ffffffffa035907e [xfs] #14 [ffff8800a57bbe00] evict at ffffffff811e1b67 #15 [ffff8800a57bbe28] iput at ffffffff811e23a5 #16 [ffff8800a57bbe58] dentry_kill at ffffffff811dcfc8 #17 [ffff8800a57bbe88] dput at ffffffff811dd06c #18 [ffff8800a57bbea8] __fput at ffffffff811c823b #19 [ffff8800a57bbef0] ____fput at ffffffff811c846e #20 [ffff8800a57bbf00] task_work_run at ffffffff81093b27 #21 [ffff8800a57bbf30] do_notify_resume at ffffffff81013b0c #22 [ffff8800a57bbf50] int_signal at ffffffff8161405d As it turns out, this is because the i_itemp pointer, along with the d_ops pointer, has been overwritten with zeros when we tear down the extents during truncate. When the in-core inode fork on the temporary inode used by xfs_fsr was originally set up during the extent swap, we mistakenly looked at di_nextents to determine whether all extents fit inline, but this misses extents generated by speculative preallocation; we should be using if_bytes instead. This mistake corrupts the in-memory inode, and code in xfs_iext_remove_inline eventually gets bad inputs, causing it to memmove and memset incorrect ranges; this became apparent because the two values in ifp->if_u2.if_inline_ext[1] contained what should have been in d_ops and i_itemp; they were memmoved due to incorrect array indexing and then the original locations were zeroed with memset, again due to an array overrun. Fix this by properly using i_df.if_bytes to determine the number of extents, not di_nextents. Thanks to dchinner for looking at this with me and spotting the root cause. Cc: stable@vger.kernel.org Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 552465e011ecb0..47074e0c33f3c8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1792,6 +1792,7 @@ xfs_swap_extent_forks( struct xfs_ifork tempifp, *ifp, *tifp; int aforkblks = 0; int taforkblks = 0; + xfs_extnum_t nextents; __uint64_t tmp; int error; @@ -1881,7 +1882,8 @@ xfs_swap_extent_forks( * pointer. Otherwise it's already NULL or * pointing to the extent. */ - if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + nextents = ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents <= XFS_INLINE_EXTS) { ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; } @@ -1900,7 +1902,8 @@ xfs_swap_extent_forks( * pointer. Otherwise it's already NULL or * pointing to the extent. */ - if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + nextents = tip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents <= XFS_INLINE_EXTS) { tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; } From 5d829300bee000980a09ac2ccb761cb25867b67c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 8 Nov 2016 12:59:42 +1100 Subject: [PATCH 12/63] xfs: provide helper for counting extents from if_bytes The open-coded pattern: ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) is all over the xfs code; provide a new helper xfs_iext_count(ifp) to count the number of inline extents in an inode fork. [dchinner: pick up several missed conversions] Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 49 ++++++++++++++++------------------ fs/xfs/libxfs/xfs_inode_fork.c | 31 ++++++++++++--------- fs/xfs/libxfs/xfs_inode_fork.h | 1 + fs/xfs/xfs_bmap_util.c | 34 ++++++++++------------- fs/xfs/xfs_inode_item.c | 4 +-- fs/xfs/xfs_ioctl.c | 6 ++--- fs/xfs/xfs_qm.c | 2 +- fs/xfs/xfs_reflink.c | 4 +-- 8 files changed, 64 insertions(+), 67 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c6eb21940783e4..42f4e7a84e2e80 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -515,7 +515,7 @@ xfs_bmap_trace_exlist( state |= BMAP_ATTRFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(cnt == xfs_iext_count(ifp)); for (idx = 0; idx < cnt; idx++) trace_xfs_extlist(ip, idx, whichfork, caller_ip); } @@ -811,7 +811,7 @@ xfs_bmap_extents_to_btree( XFS_BTREE_LONG_PTRS); arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (cnt = i = 0; i < nextents; i++) { ep = xfs_iext_get_ext(ifp, i); if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { @@ -1296,7 +1296,7 @@ xfs_bmap_read_extents( /* * Here with bp and block set to the leftmost leaf node in the tree. */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + room = xfs_iext_count(ifp); i = 0; /* * Loop over all leaf nodes. Copy information to the extent records. @@ -1361,7 +1361,7 @@ xfs_bmap_read_extents( return error; block = XFS_BUF_TO_BLOCK(bp); } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(i == xfs_iext_count(ifp)); ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; @@ -1404,7 +1404,7 @@ xfs_bmap_search_multi_extents( if (lastx > 0) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + if (lastx < xfs_iext_count(ifp)) { xfs_bmbt_get_all(ep, gotp); *eofp = 0; } else { @@ -1497,7 +1497,7 @@ xfs_bmap_first_unused( (error = xfs_iread_extents(tp, ip, whichfork))) return error; lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); off = xfs_bmbt_get_startoff(ep); @@ -1582,7 +1582,7 @@ xfs_bmap_last_extent( return error; } - nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *is_empty = 1; return 0; @@ -1735,7 +1735,7 @@ xfs_bmap_add_extent_delay_real( &bma->ip->i_d.di_nextents); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -1794,7 +1794,7 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); @@ -2300,7 +2300,7 @@ xfs_bmap_add_extent_unwritten_real( ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ASSERT(*idx >= 0); - ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2356,7 +2356,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (*idx < xfs_iext_count(&ip->i_df) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2836,7 +2836,7 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2966,7 +2966,7 @@ xfs_bmap_add_extent_hole_real( ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -2992,7 +2992,7 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) @@ -4221,7 +4221,7 @@ xfs_bmapi_read( break; /* Else go on to the next record. */ - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + if (++lastx < xfs_iext_count(ifp)) xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); else eof = 1; @@ -4733,7 +4733,7 @@ xfs_bmapi_write( /* Else go on to the next record. */ bma.prev = bma.got; - if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { + if (++bma.idx < xfs_iext_count(ifp)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), &bma.got); } else @@ -4885,7 +4885,7 @@ xfs_bmap_del_extent_delay( da_new = 0; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -5013,7 +5013,7 @@ xfs_bmap_del_extent_cow( got_endoff = got->br_startoff + got->br_blockcount; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -5119,8 +5119,7 @@ xfs_bmap_del_extent( state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp))); ASSERT(del->br_blockcount > 0); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &got); @@ -5445,7 +5444,6 @@ __xfs_bunmapi( int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ xfs_bmbt_irec_t prev; /* previous extent record */ xfs_fileoff_t start; /* first file offset deleted */ int tmp_logflags; /* partial logging flags */ @@ -5477,8 +5475,7 @@ __xfs_bunmapi( if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { + if (xfs_iext_count(ifp) == 0) { *rlen = 0; return 0; } @@ -5963,7 +5960,7 @@ xfs_bmse_shift_one( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); xfs_bmbt_get_all(gotp, &got); @@ -6140,7 +6137,7 @@ xfs_bmap_shift_extents( * are collapsing out, so we cannot use the count of real extents here. * Instead we have to calculate it from the incore fork. */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); if (total_extents == 0) { *done = 1; goto del_cursor; @@ -6200,7 +6197,7 @@ xfs_bmap_shift_extents( * count can change. Update the total and grade the next record. */ if (direction == SHIFT_LEFT) { - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); stop_extent = total_extents; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5dd56d3dbb3aeb..5fbe24c3167926 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -775,6 +775,13 @@ xfs_idestroy_fork( } } +/* Count number of incore extents based on if_bytes */ +xfs_extnum_t +xfs_iext_count(struct xfs_ifork *ifp) +{ + return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); +} + /* * Convert in-core extents to on-disk form * @@ -803,7 +810,7 @@ xfs_iextents_copy( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); - nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nrecs = xfs_iext_count(ifp); XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); ASSERT(nrecs > 0); @@ -941,7 +948,7 @@ xfs_iext_get_ext( xfs_extnum_t idx) /* index of target extent */ { ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(idx < xfs_iext_count(ifp)); if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { return ifp->if_u1.if_ext_irec->er_extbuf; @@ -1017,7 +1024,7 @@ xfs_iext_add( int new_size; /* size of extents after adding */ xfs_extnum_t nextents; /* number of extents in file */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT((idx >= 0) && (idx <= nextents)); byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); new_size = ifp->if_bytes + byte_diff; @@ -1241,7 +1248,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, idx, state, _RET_IP_); ASSERT(ext_diff > 0); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); if (new_size == 0) { @@ -1270,7 +1277,7 @@ xfs_iext_remove_inline( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); ASSERT(idx < XFS_INLINE_EXTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(((nextents - ext_diff) > 0) && (nextents - ext_diff) < XFS_INLINE_EXTS); @@ -1309,7 +1316,7 @@ xfs_iext_remove_direct( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); new_size = ifp->if_bytes - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (new_size == 0) { xfs_iext_destroy(ifp); @@ -1546,7 +1553,7 @@ xfs_iext_indirect_to_direct( int size; /* size of file extents */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); @@ -1620,7 +1627,7 @@ xfs_iext_bno_to_ext( xfs_extnum_t nextents; /* number of file extents */ xfs_fileoff_t startoff = 0; /* start offset of extent */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *idxp = 0; return NULL; @@ -1733,8 +1740,8 @@ xfs_iext_idx_to_irec( ASSERT(ifp->if_flags & XFS_IFEXTIREC); ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + ASSERT(page_idx <= xfs_iext_count(ifp)); + ASSERT(page_idx < xfs_iext_count(ifp) || realloc); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; erp_idx = 0; @@ -1782,7 +1789,7 @@ xfs_iext_irec_init( xfs_extnum_t nextents; /* number of extents in file */ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); @@ -1906,7 +1913,7 @@ xfs_iext_irec_compact( ASSERT(ifp->if_flags & XFS_IFEXTIREC); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { xfs_iext_destroy(ifp); diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index c9476f50e32d11..8bf112e29aa1c8 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -152,6 +152,7 @@ void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); struct xfs_bmbt_rec_host * xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +xfs_extnum_t xfs_iext_count(struct xfs_ifork *); void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, struct xfs_bmbt_irec *, int); void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 47074e0c33f3c8..0670a8bd5818aa 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -359,9 +359,7 @@ xfs_bmap_count_blocks( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); + xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count); return 0; } @@ -426,7 +424,7 @@ xfs_getbmapx_fix_eof_hole( ifp = XFS_IFORK_PTR(ip, whichfork); if (!moretocome && xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && - (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + (lastx == xfs_iext_count(ifp) - 1)) out->bmv_oflags |= BMV_OF_LAST; } @@ -1878,15 +1876,13 @@ xfs_swap_extent_forks( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. + /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - nextents = ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents <= XFS_INLINE_EXTS) { - ifp->if_u1.if_extents = - ifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&ip->i_df); + if (nextents <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1898,15 +1894,13 @@ xfs_swap_extent_forks( switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. + /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - nextents = tip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents <= XFS_INLINE_EXTS) { - tifp->if_u1.if_extents = - tifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&tip->i_df); + if (nextents <= XFS_INLINE_EXTS) + tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9610e9c0095277..d90e7811ccdd9c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -164,7 +164,7 @@ xfs_inode_item_format_data_fork( struct xfs_bmbt_rec *p; ASSERT(ip->i_df.if_u1.if_extents != NULL); - ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); + ASSERT(xfs_iext_count(&ip->i_df) > 0); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); @@ -261,7 +261,7 @@ xfs_inode_item_format_attr_fork( ip->i_afp->if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ASSERT(xfs_iext_count(ip->i_afp) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c245bed3249bf1..a39197501a7cd8 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -910,16 +910,14 @@ xfs_ioc_fsgetxattr( if (attr) { if (ip->i_afp) { if (ip->i_afp->if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_afp->if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(ip->i_afp); else fa.fsx_nextents = ip->i_d.di_anextents; } else fa.fsx_nextents = 0; } else { if (ip->i_df.if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_df.if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(&ip->i_df); else fa.fsx_nextents = ip->i_d.di_nextents; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a60d9e2739d14a..45e50ea90769f1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1135,7 +1135,7 @@ xfs_qm_get_rtblks( return error; } rtblks = 0; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0; idx < nextents; idx++) rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); *O_rtblks = (xfs_qcnt_t)rtblks; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c069048932022f..0edf835af32dff 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -486,7 +486,7 @@ xfs_reflink_trim_irec_to_next_cow( /* This is the extent before; try sliding up one. */ if (irec.br_startoff < offset_fsb) { idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + if (idx >= xfs_iext_count(ifp)) return 0; gotp = xfs_iext_get_ext(ifp, idx); xfs_bmbt_get_all(gotp, &irec); @@ -566,7 +566,7 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &idx, &got, &del); } - if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) + if (++idx >= xfs_iext_count(ifp)) break; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } From bec9d48d7a303a5bb95c05961ff07ec7eeb59058 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Nov 2016 12:11:12 +1100 Subject: [PATCH 13/63] xfs: check minimum block size for CRC filesystems Check the minimum block size on v5 filesystems. [dchinner: cleaned up XFS_MIN_CRC_BLOCKSIZE check] Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_sb.c | 6 ++++++ fs/xfs/libxfs/xfs_types.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 7a392402f2f815..2580262e4ea00c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -262,6 +262,12 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + if (xfs_sb_version_hascrc(&mp->m_sb) && + sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { + xfs_notice(mp, "v5 SB sanity check failed"); + return -EFSCORRUPTED; + } + /* * Until this is fixed only page-sized or smaller data blocks work. */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8d74870468c24b..cf044c0f4d4178 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -75,11 +75,14 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). + * CRC enable filesystems use 512 byte inodes, meaning 512 byte block sizes + * cannot be used. */ #define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ #define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) #define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1)) #define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ #define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) From 98efe8af1c9ffac47e842b7a75ded903e2f028da Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 10 Nov 2016 08:23:22 +1100 Subject: [PATCH 14/63] xfs: fix unbalanced inode reclaim flush locking Filesystem shutdown testing on an older distro kernel has uncovered an imbalanced locking pattern for the inode flush lock in xfs_reclaim_inode(). Specifically, there is a double unlock sequence between the call to xfs_iflush_abort() and xfs_reclaim_inode() at the "reclaim:" label. This actually does not cause obvious problems on current kernels due to the current flush lock implementation. Older kernels use a counting based flush lock mechanism, however, which effectively breaks the lock indefinitely when an already unlocked flush lock is repeatedly unlocked. Though this only currently occurs on filesystem shutdown, it has reproduced the effect of elevating an fs shutdown to a system-wide crash or hang. As it turns out, the flush lock is not actually required for the reclaim logic in xfs_reclaim_inode() because by that time we have already cycled the flush lock once while holding ILOCK_EXCL. Therefore, remove the additional flush lock/unlock cycle around the 'reclaim:' label and update branches into this label to release the flush lock where appropriate. Add an assert to xfs_ifunlock() to help prevent future occurences of the same problem. Reported-by: Zorro Lang Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 27 ++++++++++++++------------- fs/xfs/xfs_inode.h | 11 ++++++----- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 1b4861f5d3d8ed..9c3e5c6ddf20d1 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -123,7 +123,6 @@ __xfs_inode_free( { /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); @@ -133,6 +132,8 @@ void xfs_inode_free( struct xfs_inode *ip) { + ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -981,6 +982,7 @@ xfs_reclaim_inode( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip, false); goto reclaim; } @@ -989,10 +991,10 @@ xfs_reclaim_inode( goto out_ifunlock; xfs_iunpin_wait(ip); } - if (xfs_iflags_test(ip, XFS_ISTALE)) - goto reclaim; - if (xfs_inode_clean(ip)) + if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { + xfs_ifunlock(ip); goto reclaim; + } /* * Never flush out dirty data during non-blocking reclaim, as it would @@ -1030,25 +1032,24 @@ xfs_reclaim_inode( xfs_buf_relse(bp); } - xfs_iflock(ip); reclaim: + ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. - * We do this as early as possible under the ILOCK and flush lock so - * that xfs_iflush_cluster() can be guaranteed to detect races with us - * here. By doing this, we guarantee that once xfs_iflush_cluster has - * locked both the XFS_ILOCK and the flush lock that it will see either - * a valid, flushable inode that will serialise correctly against the - * locks below, or it will see a clean (and invalid) inode that it can - * skip. + * We do this as early as possible under the ILOCK so that + * xfs_iflush_cluster() can be guaranteed to detect races with us here. + * By doing this, we guarantee that once xfs_iflush_cluster has locked + * XFS_ILOCK that it will see either a valid, flushable inode that will + * serialise correctly, or it will see a clean (and invalid) inode that + * it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f14c1de2549db7..71e8a81c91a371 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -246,6 +246,11 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * Synchronize processes attempting to flush the in-core inode back to disk. */ +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + extern void __xfs_iflock(struct xfs_inode *ip); static inline int xfs_iflock_nowait(struct xfs_inode *ip) @@ -261,16 +266,12 @@ static inline void xfs_iflock(struct xfs_inode *ip) static inline void xfs_ifunlock(struct xfs_inode *ip) { + ASSERT(xfs_isiflocked(ip)); xfs_iflags_clear(ip, XFS_IFLOCK); smp_mb(); wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); } -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) From 93533c7855c3c78c8a900cac65c8d669bb14935d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:32 +1100 Subject: [PATCH 15/63] xfs: new inode extent list lookup helpers xfs_iext_lookup_extent looks up a single extent at the passed in offset, and returns the extent covering the area, or the one behind it in case of a hole, as well as the index of the returned extent in arguments, as well as a simple bool as return value that is set to false if no extent could be found because the offset is behind EOF. It is a simpler replacement for xfs_bmap_search_extent that leaves looking up the rarely needed previous extent to the caller and has a nicer calling convention. xfs_iext_get_extent is a helper for iterating over the extent list, it takes an extent index as input, and returns the extent at that index in it's expanded form in an argument if it exists. The actual return value is a bool whether the index is valid or not. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_fork.c | 46 ++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_fork.h | 6 +++++ 2 files changed, 52 insertions(+) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5fbe24c3167926..222e103356c6d8 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -2003,3 +2003,49 @@ xfs_ifork_init_cow( ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; } + +/* + * Lookup the extent covering bno. + * + * If there is an extent covering bno return the extent index, and store the + * expanded extent structure in *gotp, and the extent index in *idx. + * If there is no extent covering bno, but there is an extent after it (e.g. + * it lies in a hole) return that extent in *gotp and its index in *idx + * instead. + * If bno is beyond the last extent return false, and return the index after + * the last valid index in *idxp. + */ +bool +xfs_iext_lookup_extent( + struct xfs_inode *ip, + struct xfs_ifork *ifp, + xfs_fileoff_t bno, + xfs_extnum_t *idxp, + struct xfs_bmbt_irec *gotp) +{ + struct xfs_bmbt_rec_host *ep; + + XFS_STATS_INC(ip->i_mount, xs_look_exlist); + + ep = xfs_iext_bno_to_ext(ifp, bno, idxp); + if (!ep) + return false; + xfs_bmbt_get_all(ep, gotp); + return true; +} + +/* + * Return true if there is an extent at index idx, and return the expanded + * extent structure at idx in that case. Else return false. + */ +bool +xfs_iext_get_extent( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp) +{ + if (idx < 0 || idx >= xfs_iext_count(ifp)) + return false; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); + return true; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 8bf112e29aa1c8..7fb8365326d1a7 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -182,6 +182,12 @@ void xfs_iext_irec_compact_pages(struct xfs_ifork *); void xfs_iext_irec_compact_full(struct xfs_ifork *); void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); +bool xfs_iext_lookup_extent(struct xfs_inode *ip, + struct xfs_ifork *ifp, xfs_fileoff_t bno, + xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); +bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp); + extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); From 86685f7ba5c0fdbcc159ecdbcc530b9b3509a675 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:38 +1100 Subject: [PATCH 16/63] xfs: cleanup xfs_bmap_last_before Rewrite the function using xfs_iext_lookup_extent and xfs_iext_get_extent, and massage the flow into something easily understandable. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 64 ++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 42f4e7a84e2e80..13f62da67bbcca 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1523,44 +1523,44 @@ xfs_bmap_first_unused( */ int /* error */ xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ { - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + xfs_extnum_t idx; + int error; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return -EIO; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: *last_block = 0; return 0; + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_EXTENTS: + break; + default: + return -EIO; } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; } - /* - * Otherwise *last_block is already the right answer. - */ + + if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) { + if (got.br_startoff <= *last_block - 1) + return 0; + } + + if (xfs_iext_get_extent(ifp, idx - 1, &got)) { + *last_block = got.br_startoff + got.br_blockcount; + return 0; + } + + *last_block = 0; return 0; } From 334f3423d6a6c39483aa0744ea044e105ca0e5ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:43 +1100 Subject: [PATCH 17/63] xfs: use new extent lookup helpers in xfs_bmapi_read Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 13f62da67bbcca..aac5cc3cd9ef99 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4145,12 +4145,11 @@ xfs_bmapi_read( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_fileoff_t obno; xfs_fileoff_t end; - xfs_extnum_t lastx; + xfs_extnum_t idx; int error; - int eof; + bool eof = false; int n = 0; int whichfork = xfs_bmapi_whichfork(flags); @@ -4190,7 +4189,8 @@ xfs_bmapi_read( return error; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) + eof = true; end = bno + len; obno = bno; @@ -4221,10 +4221,8 @@ xfs_bmapi_read( break; /* Else go on to the next record. */ - if (++lastx < xfs_iext_count(ifp)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); - else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++idx, &got)) + eof = true; } *nmap = n; return 0; From 2d58f6ef79db10257d77ae8cd8e9e8432816c9ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:43 +1100 Subject: [PATCH 18/63] xfs: use new extent lookup helpers in xfs_bmapi_write Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index aac5cc3cd9ef99..1c2e0eaf5aaf5b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4561,7 +4561,7 @@ xfs_bmapi_write( struct xfs_ifork *ifp; struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* after the end of extents */ + bool eof = false; /* after the end of extents */ int error; /* error return */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ @@ -4639,12 +4639,14 @@ xfs_bmapi_write( goto error0; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, - &bma.prev); n = 0; end = bno + len; obno = bno; + if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got)) + eof = true; + if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; bma.tp = tp; bma.ip = ip; bma.total = total; @@ -4731,11 +4733,8 @@ xfs_bmapi_write( /* Else go on to the next record. */ bma.prev = bma.got; - if (++bma.idx < xfs_iext_count(ifp)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), - &bma.got); - } else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got)) + eof = true; } *nmap = n; From 7efc794561f6bfe34d26c3724289108f6cda3a4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:44 +1100 Subject: [PATCH 19/63] xfs: use new extent lookup helpers in __xfs_bunmapi Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1c2e0eaf5aaf5b..05608faa28ce25 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5430,8 +5430,6 @@ __xfs_bunmapi( { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ - int eof; /* is deleting at eof */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ xfs_extnum_t extno; /* extent number in list */ xfs_bmbt_irec_t got; /* current extent record */ @@ -5441,7 +5439,6 @@ __xfs_bunmapi( int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ xfs_mount_t *mp; /* mount structure */ - xfs_bmbt_irec_t prev; /* previous extent record */ xfs_fileoff_t start; /* first file offset deleted */ int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ @@ -5480,18 +5477,17 @@ __xfs_bunmapi( isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); start = bno; bno = start + len - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); /* * Check to see if the given block number is past the end of the * file, back up to the last block if so... */ - if (eof) { - ep = xfs_iext_get_ext(ifp, --lastx); - xfs_bmbt_get_all(ep, &got); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) { + ASSERT(lastx > 0); + xfs_iext_get_extent(ifp, --lastx, &got); bno = got.br_startoff + got.br_blockcount - 1; } + logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); @@ -5522,8 +5518,7 @@ __xfs_bunmapi( if (got.br_startoff > bno) { if (--lastx < 0) break; - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); } /* * Is the last block of this extent before the range @@ -5537,7 +5532,6 @@ __xfs_bunmapi( * Then deal with the (possibly delayed) allocated space * we found. */ - ASSERT(ep != NULL); del = got; wasdel = isnullstartblock(del.br_startblock); if (got.br_startoff < start) { @@ -5618,15 +5612,12 @@ __xfs_bunmapi( */ ASSERT(bno >= del.br_blockcount); bno -= del.br_blockcount; - if (got.br_startoff > bno) { - if (--lastx >= 0) { - ep = xfs_iext_get_ext(ifp, - lastx); - xfs_bmbt_get_all(ep, &got); - } - } + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec prev; + /* * This one is already unwritten. * It must have a written left neighbor. @@ -5634,8 +5625,7 @@ __xfs_bunmapi( * try again. */ ASSERT(lastx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, - lastx - 1), &prev); + xfs_iext_get_extent(ifp, lastx - 1, &prev); ASSERT(prev.br_state == XFS_EXT_NORM); ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == @@ -5733,13 +5723,9 @@ __xfs_bunmapi( */ if (bno != (xfs_fileoff_t)-1 && bno >= start) { if (lastx >= 0) { - ep = xfs_iext_get_ext(ifp, lastx); - if (xfs_bmbt_get_startoff(ep) > bno) { - if (--lastx >= 0) - ep = xfs_iext_get_ext(ifp, - lastx); - } - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); } extno++; } From 65c5f419788d623a0410eca1866134f5e4628594 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:44 +1100 Subject: [PATCH 20/63] xfs: remove prev argument to xfs_bmapi_reserve_delalloc We can easily lookup the previous extent for the cases where we need it, which saves the callers from looking it up for us later in the series. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 8 ++++++-- fs/xfs/libxfs/xfs_bmap.h | 3 +-- fs/xfs/xfs_iomap.c | 3 +-- fs/xfs/xfs_reflink.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 05608faa28ce25..6af34b734ce874 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4235,7 +4235,6 @@ xfs_bmapi_reserve_delalloc( xfs_fileoff_t aoff, xfs_filblks_t len, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof) { @@ -4257,7 +4256,12 @@ xfs_bmapi_reserve_delalloc( else extsz = xfs_get_extsz_hint(ip); if (extsz) { - error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + struct xfs_bmbt_irec prev; + + if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof, 1, 0, &aoff, &alen); ASSERT(!error); } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 7cae6ec27fa6b2..e3c2b5adf50530 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -243,8 +243,7 @@ struct xfs_bmbt_rec_host * struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t aoff, xfs_filblks_t len, - struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, - xfs_extnum_t *lastx, int eof); + struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 436e109bb01e59..59ffcac8a47df8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -622,8 +622,7 @@ xfs_file_iomap_begin_delay( retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, &got, - &prev, &idx, eof); + end_fsb - offset_fsb, &got, &idx, eof); switch (error) { case 0: break; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0edf835af32dff..52cdfbaca30d14 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -293,7 +293,7 @@ xfs_reflink_reserve_cow( retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - end_fsb - imap->br_startoff, &got, &prev, &idx, eof); + end_fsb - imap->br_startoff, &got, &idx, eof); switch (error) { case 0: break; From 656152e552e5cbe0c11ad261b524376217c2fb13 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:44 +1100 Subject: [PATCH 21/63] xfs: use new extent lookup helpers xfs_file_iomap_begin_delay And only lookup the previous extent inside xfs_iomap_prealloc_size if we actually need it. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_iomap.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 59ffcac8a47df8..2272190b70aeec 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -395,11 +395,12 @@ xfs_iomap_prealloc_size( struct xfs_inode *ip, loff_t offset, loff_t count, - xfs_extnum_t idx, - struct xfs_bmbt_irec *prev) + xfs_extnum_t idx) { struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + struct xfs_bmbt_irec prev; int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; @@ -419,8 +420,8 @@ xfs_iomap_prealloc_size( */ if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - idx == 0 || - prev->br_startoff + prev->br_blockcount < offset_fsb) + !xfs_iext_get_extent(ifp, idx - 1, &prev) || + prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_writeio_blocks; /* @@ -439,8 +440,8 @@ xfs_iomap_prealloc_size( * always extends to MAXEXTLEN rather than falling short due to things * like stripe unit/width alignment of real extents. */ - if (prev->br_blockcount <= (MAXEXTLEN >> 1)) - alloc_blocks = prev->br_blockcount << 1; + if (prev.br_blockcount <= (MAXEXTLEN >> 1)) + alloc_blocks = prev.br_blockcount << 1; else alloc_blocks = XFS_B_TO_FSB(mp, offset); if (!alloc_blocks) @@ -538,7 +539,6 @@ xfs_file_iomap_begin_delay( xfs_fileoff_t end_fsb, orig_end_fsb; int error = 0, eof = 0; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_extnum_t idx; ASSERT(!XFS_IS_REALTIME_INODE(ip)); @@ -563,8 +563,7 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, - &got, &prev); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); if (!eof && got.br_startoff <= offset_fsb) { if (xfs_is_reflink_inode(ip)) { bool shared; @@ -601,8 +600,7 @@ xfs_file_iomap_begin_delay( if (eof) { xfs_fsblock_t prealloc_blocks; - prealloc_blocks = - xfs_iomap_prealloc_size(ip, offset, count, idx, &prev); + prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; From 2755fc4438501c8c28e7783df890e889f6772bee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:49 +1100 Subject: [PATCH 22/63] xfs: use new extent lookup helpers in __xfs_reflink_reserve_cow Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 52cdfbaca30d14..35e02cecc4443e 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -243,10 +243,11 @@ xfs_reflink_reserve_cow( struct xfs_bmbt_irec *imap, bool *shared) { - struct xfs_bmbt_irec got, prev; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; xfs_fileoff_t end_fsb, orig_end_fsb; - int eof = 0, error = 0; - bool trimmed; + int error = 0; + bool eof = false, trimmed; xfs_extnum_t idx; xfs_extlen_t align; @@ -258,8 +259,9 @@ xfs_reflink_reserve_cow( * extent list is generally faster than going out to the shared extent * tree. */ - xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx, - &got, &prev); + + if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got)) + eof = true; if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); From 092d5d9d5812a80e4496cdbb24fe58e6f77da56b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:49 +1100 Subject: [PATCH 23/63] xfs: cleanup xfs_reflink_find_cow_mapping Use xfs_iext_lookup_extent to look up the extent, drop a useless check, drop a unneeded return value and clean up the general style a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 16 ++++++++-------- fs/xfs/xfs_reflink.c | 33 ++++++++++----------------------- fs/xfs/xfs_reflink.h | 2 +- 3 files changed, 19 insertions(+), 32 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2693ba84ec2541..4923b54da55605 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -777,7 +777,7 @@ xfs_map_cow( { struct xfs_inode *ip = XFS_I(inode); struct xfs_bmbt_irec imap; - bool is_cow = false, need_alloc = false; + bool is_cow = false; int error; /* @@ -795,7 +795,7 @@ xfs_map_cow( * Else we need to check if there is a COW mapping at this offset. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!is_cow) @@ -805,7 +805,7 @@ xfs_map_cow( * And if the COW mapping has a delayed extent here we need to * allocate real space for it now. */ - if (need_alloc) { + if (isnullstartblock(imap.br_startblock)) { error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, &imap); if (error) @@ -1312,7 +1312,6 @@ __xfs_get_blocks( ssize_t size; int new = 0; bool is_cow = false; - bool need_alloc = false; BUG_ON(create && !direct); @@ -1338,9 +1337,11 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - if (create && direct && xfs_is_reflink_inode(ip)) - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, - &need_alloc); + if (create && direct && xfs_is_reflink_inode(ip)) { + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); + ASSERT(!is_cow || !isnullstartblock(imap.br_startblock)); + } + if (!is_cow) { error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ENTIRE); @@ -1357,7 +1358,6 @@ __xfs_get_blocks( xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, &imap); } - ASSERT(!need_alloc); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 35e02cecc4443e..e92355a39304c9 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -420,44 +420,31 @@ xfs_reflink_allocate_cow_range( } /* - * Find the CoW reservation (and whether or not it needs block allocation) - * for a given byte offset of a file. + * Find the CoW reservation for a given byte offset of a file. */ bool xfs_reflink_find_cow_mapping( struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, - bool *need_alloc) + struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_fileoff_t bno; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_fileoff_t offset_fsb; + struct xfs_bmbt_irec got; xfs_extnum_t idx; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(xfs_is_reflink_inode(ip)); - /* Find the extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - bno = XFS_B_TO_FSBT(ip->i_mount, offset); - gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); - if (!gotp) + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return false; - - xfs_bmbt_get_all(gotp, &irec); - if (bno >= irec.br_startoff + irec.br_blockcount || - bno < irec.br_startoff) + if (got.br_startoff > offset_fsb) return false; trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, - &irec); - - /* If it's still delalloc, we must allocate later. */ - *imap = irec; - *need_alloc = !!(isnullstartblock(irec.br_startblock)); - + &got); + *imap = got; return true; } diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 97ea9b487884e3..cff5fc329359e2 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -31,7 +31,7 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, bool *need_alloc); + struct xfs_bmbt_irec *imap); extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); From 86f12ab05ffc45fa88a09110f582875fa9f07c8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:50 +1100 Subject: [PATCH 24/63] xfs: use new extent lookup helpers in xfs_reflink_trim_irec_to_next_cow And remove the unused return value. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 33 ++++++++++++--------------------- fs/xfs/xfs_reflink.h | 2 +- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e92355a39304c9..d3cfae842237a0 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -451,43 +451,34 @@ xfs_reflink_find_cow_mapping( /* * Trim an extent to end at the next CoW reservation past offset_fsb. */ -int +void xfs_reflink_trim_irec_to_next_cow( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; xfs_extnum_t idx; if (!xfs_is_reflink_inode(ip)) - return 0; + return; /* Find the extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); - if (!gotp) - return 0; - xfs_bmbt_get_all(gotp, &irec); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + return; /* This is the extent before; try sliding up one. */ - if (irec.br_startoff < offset_fsb) { - idx++; - if (idx >= xfs_iext_count(ifp)) - return 0; - gotp = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_get_all(gotp, &irec); + if (got.br_startoff < offset_fsb) { + if (!xfs_iext_get_extent(ifp, idx + 1, &got)) + return; } - if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) - return 0; + if (got.br_startoff >= imap->br_startoff + imap->br_blockcount) + return; - imap->br_blockcount = irec.br_startoff - imap->br_startoff; + imap->br_blockcount = got.br_startoff - imap->br_startoff; trace_xfs_reflink_trim_irec(ip, imap); - - return 0; } /* diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index cff5fc329359e2..aa6a4d64bd35d9 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -32,7 +32,7 @@ extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, struct xfs_bmbt_irec *imap); -extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, +extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, From df5ab1b5a841aecd4854b75b4c2102a9d2d1996b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:50 +1100 Subject: [PATCH 25/63] xfs: use new extent lookup helpers in xfs_reflink_cancel_cow_blocks Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d3cfae842237a0..4e07da3d5af554 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -492,18 +492,15 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t end_fsb) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; xfs_extnum_t idx; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error = 0, eof = 0; + int error = 0; if (!xfs_is_reflink_inode(ip)) return 0; - - xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx, - &got, &prev); - if (eof) + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return 0; while (got.br_startoff < end_fsb) { @@ -546,9 +543,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &idx, &got, &del); } - if (++idx >= xfs_iext_count(ifp)) + if (!xfs_iext_get_extent(ifp, ++idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } /* clear tag if cow fork is emptied */ From 4ab8671c19620f335d97673f941a631e15aa0d6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:39:50 +1100 Subject: [PATCH 26/63] xfs: use new extent lookup helpers in xfs_reflink_end_cow Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4e07da3d5af554..56372bee08c534 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -614,13 +614,13 @@ xfs_reflink_end_cow( xfs_off_t count) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; struct xfs_trans *tp; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error, eof = 0; + int error; unsigned int resblks; xfs_filblks_t rlen; xfs_extnum_t idx; @@ -644,13 +644,11 @@ xfs_reflink_end_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx, - &got, &prev); - /* If there is a hole at end_fsb - 1 go to the previous extent */ - if (eof || got.br_startoff > end_fsb) { + if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || + got.br_startoff > end_fsb) { ASSERT(idx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); + xfs_iext_get_extent(ifp, --idx, &got); } /* Walk backwards until we're out of the I/O range... */ @@ -698,11 +696,9 @@ xfs_reflink_end_cow( error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto out_defer; - next_extent: - if (idx < 0) + if (!xfs_iext_get_extent(ifp, idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } error = xfs_trans_commit(tp); From 6edc977f775e5ac10655b03607ef091d2b06f2f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:40:14 +1100 Subject: [PATCH 27/63] xfs: remove xfs_bmap_search_extents Now that all users are gone. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 91 ---------------------------------------- fs/xfs/libxfs/xfs_bmap.h | 4 -- 2 files changed, 95 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6af34b734ce874..dcc847ceaebee6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1370,97 +1370,6 @@ xfs_bmap_read_extents( return -EFSCORRUPTED; } - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. - */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < xfs_iext_count(ifp)) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int fork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - - XFS_STATS_INC(ip->i_mount, xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, fork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && - !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state, *lastxp); - *lastxp = NULLEXTNUM; - *eofp = 1; - return NULL; - } - return ep; -} - /* * Returns the file-relative block number of the first unused block(s) * in the file with at least "len" logically contiguous blocks free. diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e3c2b5adf50530..ffed1f9208ced9 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -237,10 +237,6 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); -struct xfs_bmbt_rec_host * - xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, - int fork, int *eofp, xfs_extnum_t *lastxp, - struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t aoff, xfs_filblks_t len, struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); From 0e8d630ba039d9976d250eedb82c3a423ad15447 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Nov 2016 11:40:32 +1100 Subject: [PATCH 28/63] xfs: remove NULLEXTNUM We only ever set a field to this constant for an impossible to reach error case in xfs_bmap_search_extents. That functions has been removed, so we can remove the constant as well. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_types.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index dcc847ceaebee6..c2e13db1456318 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4260,7 +4260,7 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - if (bma->idx != NULLEXTNUM && bma->idx) { + if (bma->idx) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &bma->prev); } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8d74870468c24b..f9a1076de911e4 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -57,7 +57,6 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) -#define NULLEXTNUM ((xfs_extnum_t)-1) #define NULLCOMMITLSN ((xfs_lsn_t)-1) From 1247ec4c5f6e64b3cf01af58173820cdc61f11ed Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 28 Nov 2016 14:57:42 +1100 Subject: [PATCH 29/63] xfs: add XBF_XBF_NO_IOACCT to buf trace output When XBF_NO_IOACCT got added, it missed the translation in XFS_BUF_FLAGS, so we see "0x8" in trace output rather than the flag name. Fix it. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1c2e52b2d92614..38c9a95bda7d7c 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -71,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_NO_IOACCT, "NO_IOACCT" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ From fd26a88093bab6529ea2de819114ca92dbd1d71d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2016 14:57:42 +1100 Subject: [PATCH 30/63] xfs: factor rmap btree size into the indlen calculations When we're estimating the amount of space it's going to take to satisfy a delalloc reservation, we need to include the space that we might need to grow the rmapbt. This helps us to avoid running out of space later when _iomap_write_allocate needs more space than we reserved. Eryu Guan observed this happening on generic/224 when sunit/swidth were set. Reported-by: Eryu Guan Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c2e13db1456318..164790c1b41984 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -49,6 +49,7 @@ #include "xfs_rmap.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" +#include "xfs_rmap_btree.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -190,8 +191,12 @@ xfs_bmap_worst_indlen( int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ + xfs_filblks_t orig_len; mp = ip->i_mount; + + /* Calculate the worst-case size of the bmbt. */ + orig_len = len; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); @@ -199,12 +204,20 @@ xfs_bmap_worst_indlen( len += maxrecs - 1; do_div(len, maxrecs); rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + if (len == 1) { + rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; + break; + } if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } + + /* Calculate the worst-case size of the rmapbt. */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) + + mp->m_rmap_maxlevels; + return rval; } From bb6e0ebed7f7474e0e44cf6b778ee2fda759ddd6 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Mon, 28 Nov 2016 14:57:42 +1100 Subject: [PATCH 31/63] fs: xfs: xfs_icreate_item: constify xfs_item_ops structure Declare the structure xfs_item_ops as const as it is only passed as an argument to the function xfs_log_item_init. As this argument is of type const struct xfs_item_ops *, so xfs_item_ops structures having this property can be declared as const. Done using Coccinelle: @r1 disable optional_qualifier @ identifier i; position p; @@ static struct xfs_item_ops i@p = {...}; @ok1@ identifier r1.i; position p; expression e1,e2,e3; @@ xfs_log_item_init(e1,e2,e3,&i@p) @bad@ position p!={r1.p,ok1.p}; identifier r1.i; @@ i@p @depends on !bad disable optional_qualifier@ identifier r1.i; @@ static +const struct xfs_item_ops i={...}; @depends on !bad disable optional_qualifier@ identifier r1.i; @@ +const struct xfs_item_ops i; File size before: text data bss dec hex filename 737 64 8 809 329 fs/xfs/xfs_icreate_item.o File size after: text data bss dec hex filename 801 0 8 809 329 fs/xfs/xfs_icreate_item.o Signed-off-by: Bhumika Goyal Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_icreate_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d45ca72af6fbcb..865ad1373e5eb5 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -133,7 +133,7 @@ xfs_icreate_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_icreate_item_ops = { +static const struct xfs_item_ops xfs_icreate_item_ops = { .iop_size = xfs_icreate_item_size, .iop_format = xfs_icreate_item_format, .iop_pin = xfs_icreate_item_pin, From cf7841c12d85d1fe0ad33fb8bc5746809a882010 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Mon, 28 Nov 2016 14:57:42 +1100 Subject: [PATCH 32/63] fs: xfs: libxfs: constify xfs_nameops structures Declare the structure xfs_nameops as const as it is only stored in the m_dirnameops field of a xfs_mount structure. This field is of type const struct xfs_nameops *, so xfs_nameops structures having this property can be declared as const. Done using Coccinelle: @r1 disable optional_qualifier @ identifier i; position p; @@ static struct xfs_nameops i@p = {...}; @ok1@ identifier r1.i; position p; struct xfs_mount mp; @@ mp.m_dirnameops=&i@p @bad@ position p!={r1.p,ok1.p}; identifier r1.i; @@ i@p @depends on !bad disable optional_qualifier@ identifier r1.i; @@ static +const struct xfs_nameops i={...}; @depends on !bad disable optional_qualifier@ identifier r1.i; @@ +const struct xfs_nameops i; File size before: text data bss dec hex filename 5302 85 0 5387 150b fs/xfs/libxfs/xfs_dir2.o File size after: text data bss dec hex filename 5318 69 0 5387 150b fs/xfs/libxfs/xfs_dir2.o Signed-off-by: Bhumika Goyal Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 20a96dd5af7eb6..c58d72c220f585 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -93,7 +93,7 @@ xfs_ascii_ci_compname( return result; } -static struct xfs_nameops xfs_ascii_ci_nameops = { +static const struct xfs_nameops xfs_ascii_ci_nameops = { .hashname = xfs_ascii_ci_hashname, .compname = xfs_ascii_ci_compname, }; From fba3e594ef0ad911fa8f559732d588172f212d71 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 28 Nov 2016 14:57:42 +1100 Subject: [PATCH 33/63] xfs: always succeed when deduping zero bytes It turns out that btrfs and xfs had differing interpretations of what to do when the dedupe length is zero. Change xfs to follow btrfs' semantics so that the userland interface is consistent. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 56372bee08c534..c58371fde08def 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1317,8 +1317,14 @@ xfs_reflink_remap_range( goto out_unlock; } - if (len == 0) + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (len == 0) { + if (is_dedupe) { + ret = 0; + goto out_unlock; + } len = isize - pos_in; + } /* Ensure offsets don't wrap and the input is inside i_size */ if (pos_in + len < pos_in || pos_out + len < pos_out || From 974ae922efd93b07b6cdf989ae959883f6f05fd8 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 28 Nov 2016 14:57:42 +1100 Subject: [PATCH 34/63] xfs: track preallocation separately in xfs_bmapi_reserve_delalloc() Speculative preallocation is currently processed entirely by the callers of xfs_bmapi_reserve_delalloc(). The caller determines how much preallocation to include, adjusts the extent length and passes down the resulting request. While this works fine for post-eof speculative preallocation, it is not as reliable for COW fork preallocation. COW fork preallocation is implemented via the cowextszhint, which aligns the start offset as well as the length of the extent. Further, it is difficult for the caller to accurately identify when preallocation occurs because the returned extent could have been merged with neighboring extents in the fork. To simplify this situation and facilitate further COW fork preallocation enhancements, update xfs_bmapi_reserve_delalloc() to take a separate preallocation parameter to incorporate into the allocation request. The preallocation blocks value is tacked onto the end of the request and adjusted to accommodate neighboring extents and extent size limits. Since xfs_bmapi_reserve_delalloc() now knows precisely how much preallocation was included in the allocation, it can also tag the inodes appropriately to support preallocation reclaim. Note that xfs_bmapi_reserve_delalloc() callers are not yet updated to use the preallocation mechanism. This patch should not change behavior outside of correctly tagging reflink inodes when start offset preallocation occurs (which the caller does not handle correctly). Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 23 +++++++++++++++++++++-- fs/xfs/libxfs/xfs_bmap.h | 2 +- fs/xfs/xfs_iomap.c | 2 +- fs/xfs/xfs_reflink.c | 2 +- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 164790c1b41984..6b7e6eb2941453 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -50,6 +50,7 @@ #include "xfs_ag_resv.h" #include "xfs_refcount.h" #include "xfs_rmap_btree.h" +#include "xfs_icache.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -4154,8 +4155,9 @@ int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, + xfs_fileoff_t off, xfs_filblks_t len, + xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof) @@ -4167,10 +4169,17 @@ xfs_bmapi_reserve_delalloc( char rt = XFS_IS_REALTIME_INODE(ip); xfs_extlen_t extsz; int error; + xfs_fileoff_t aoff = off; - alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. + */ + alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; /* Figure out the extent size, adjust alen */ if (whichfork == XFS_COW_FORK) @@ -4236,6 +4245,16 @@ xfs_bmapi_reserve_delalloc( */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length. + */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + ASSERT(got->br_startoff <= aoff); ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); ASSERT(isnullstartblock(got->br_startblock)); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index ffed1f9208ced9..cecd094404cc53 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -238,7 +238,7 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, xfs_filblks_t len, + xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); enum xfs_bmap_intent_type { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2272190b70aeec..e4bfde212cc287 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -620,7 +620,7 @@ xfs_file_iomap_begin_delay( retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, &got, &idx, eof); + end_fsb - offset_fsb, 0, &got, &idx, eof); switch (error) { case 0: break; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c58371fde08def..a5b027a43990e6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -295,7 +295,7 @@ xfs_reflink_reserve_cow( retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - end_fsb - imap->br_startoff, &got, &idx, eof); + end_fsb - imap->br_startoff, 0, &got, &idx, eof); switch (error) { case 0: break; From 0260d8ff5f76617e3a55a1c471383ecb4404c3ad Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 28 Nov 2016 14:57:42 +1100 Subject: [PATCH 35/63] xfs: clean up cow fork reservation and tag inodes correctly COW fork reservation is implemented via delayed allocation. The code is modeled after the traditional delalloc allocation code, but is slightly different in terms of how preallocation occurs. Rather than post-eof speculative preallocation, COW fork preallocation is implemented via a COW extent size hint that is designed to minimize fragmentation as a reflinked file is split over time. xfs_reflink_reserve_cow() still uses logic that is oriented towards dealing with post-eof speculative preallocation, however, and is stale or not necessarily correct. First, the EOF alignment to the COW extent size hint is implemented in xfs_bmapi_reserve_delalloc() (which does so correctly by aligning the start and end offsets) and so is not necessary in xfs_reflink_reserve_cow(). The backoff and retry logic on ENOSPC is also ineffective for the same reason, as xfs_bmapi_reserve_delalloc() will simply perform the same allocation request on the retry. Finally, since the COW extent size hint aligns the start and end offset of the range to allocate, the end_fsb != orig_end_fsb logic is not sufficient. Indeed, if a write request happens to end on an aligned offset, it is possible that we do not tag the inode for COW preallocation even though xfs_bmapi_reserve_delalloc() may have preallocated at the start offset. Kill the unnecessary, duplicate code in xfs_reflink_reserve_cow(). Remove the inode tag logic as well since xfs_bmapi_reserve_delalloc() has been updated to tag the inode correctly. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index a5b027a43990e6..becf2465dd2312 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -245,11 +245,9 @@ xfs_reflink_reserve_cow( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got; - xfs_fileoff_t end_fsb, orig_end_fsb; int error = 0; bool eof = false, trimmed; xfs_extnum_t idx; - xfs_extlen_t align; /* * Search the COW fork extent list first. This serves two purposes: @@ -287,33 +285,12 @@ xfs_reflink_reserve_cow( if (error) return error; - end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount; - - align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); - if (align) - end_fsb = roundup_64(end_fsb, align); - -retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - end_fsb - imap->br_startoff, 0, &got, &idx, eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ + imap->br_blockcount, 0, &got, &idx, eof); + if (error == -ENOSPC || error == -EDQUOT) trace_xfs_reflink_cow_enospc(ip, imap); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; - goto retry; - } - /*FALLTHRU*/ - default: + if (error) return error; - } - - if (end_fsb != orig_end_fsb) - xfs_inode_set_cowblocks_tag(ip); trace_xfs_reflink_cow_alloc(ip, &got); return 0; From f782088c9e5d08e9494c63e68b4e85716df3e5f8 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 28 Nov 2016 14:57:42 +1100 Subject: [PATCH 36/63] xfs: pass post-eof speculative prealloc blocks to bmapi xfs_file_iomap_begin_delay() implements post-eof speculative preallocation by extending the block count of the requested delayed allocation. Now that xfs_bmapi_reserve_delalloc() has been updated to handle prealloc blocks separately and tag the inode, update xfs_file_iomap_begin_delay() to use the new parameter and rely on the former to tag the inode. Note that this patch does not change behavior. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_iomap.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index e4bfde212cc287..15a83813b7085a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -536,10 +536,11 @@ xfs_file_iomap_begin_delay( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - xfs_fileoff_t end_fsb, orig_end_fsb; + xfs_fileoff_t end_fsb; int error = 0, eof = 0; struct xfs_bmbt_irec got; xfs_extnum_t idx; + xfs_fsblock_t prealloc_blocks = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -594,33 +595,32 @@ xfs_file_iomap_begin_delay( * the lower level functions are updated. */ count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = orig_end_fsb = - min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); if (eof) { - xfs_fsblock_t prealloc_blocks; - prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; + xfs_fileoff_t p_end_fsb; end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); - end_fsb = XFS_B_TO_FSBT(mp, end_offset) + - prealloc_blocks; + p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; align = xfs_eof_alignment(ip, 0); if (align) - end_fsb = roundup_64(end_fsb, align); + p_end_fsb = roundup_64(p_end_fsb, align); - end_fsb = min(end_fsb, maxbytes_fsb); - ASSERT(end_fsb > offset_fsb); + p_end_fsb = min(p_end_fsb, maxbytes_fsb); + ASSERT(p_end_fsb > offset_fsb); + prealloc_blocks = p_end_fsb - end_fsb; } } retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, 0, &got, &idx, eof); + end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof); switch (error) { case 0: break; @@ -628,8 +628,8 @@ xfs_file_iomap_begin_delay( case -EDQUOT: /* retry without any preallocation */ trace_xfs_delalloc_enospc(ip, offset, count); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; + if (prealloc_blocks) { + prealloc_blocks = 0; goto retry; } /*FALLTHRU*/ @@ -637,13 +637,6 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - /* - * Tag the inode as speculatively preallocated so we can reclaim this - * space on demand, if necessary. - */ - if (end_fsb != orig_end_fsb) - xfs_inode_set_eofblocks_tag(ip); - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) From 3816199506c7826983096fc65ed46f2733a47bb8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Oct 2016 11:59:24 -0600 Subject: [PATCH 37/63] block: add bio_iov_iter_get_pages() This is a helper that pins down a range from an iov_iter and adds it to a bio without requiring a separate memory allocation for the page array. It will be used for upcoming direct I/O implementations for block devices and iomap based file systems. Signed-off-by: Kent Overstreet [hch: ported to the iov_iter interface, renamed and added comments. All blame should be directed to me and all fame should go to Kent after this!] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe (cherry picked from commit 9cd56d916aa481ce8f56d9c5302a6ed90c2e0b5f) --- block/bio.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 1 + 2 files changed, 50 insertions(+) diff --git a/block/bio.c b/block/bio.c index db85c5753a7656..2cf6ebabc68c5c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -847,6 +847,55 @@ int bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_add_page); +/** + * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * @bio: bio to add pages to + * @iter: iov iterator describing the region to be mapped + * + * Pins as many pages from *iter and appends them to @bio's bvec array. The + * pages will have to be released using put_page() when done. + */ +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + size_t offset, diff; + ssize_t size; + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE; + + /* + * Deep magic below: We need to walk the pinned pages backwards + * because we are abusing the space allocated for the bio_vecs + * for the page array. Because the bio_vecs are larger than the + * page pointers by definition this will always work. But it also + * means we can't use bio_add_page, so any changes to it's semantics + * need to be reflected here as well. + */ + bio->bi_iter.bi_size += size; + bio->bi_vcnt += nr_pages; + + diff = (nr_pages * PAGE_SIZE - offset) - size; + while (nr_pages--) { + bv[nr_pages].bv_page = pages[nr_pages]; + bv[nr_pages].bv_len = PAGE_SIZE; + bv[nr_pages].bv_offset = 0; + } + + bv[0].bv_offset += offset; + bv[0].bv_len -= offset; + if (diff) + bv[bio->bi_vcnt - 1].bv_len -= diff; + + iov_iter_advance(iter, size); + return 0; +} +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); + struct submit_bio_ret { struct completion event; int error; diff --git a/include/linux/bio.h b/include/linux/bio.h index 97cb48f03dc73c..66228c28c621c0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -430,6 +430,7 @@ void bio_chain(struct bio *, struct bio *); extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, const struct iov_iter *, gfp_t); From f8319483f57f1ca22370f4150bb990aca7728a67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Nov 2016 14:32:25 +1100 Subject: [PATCH 38/63] locking/lockdep: Provide a type check for lock_is_held Christoph requested lockdep_assert_held() variants that distinguish between held-for-read or held-for-write. Provide: int lock_is_held_type(struct lockdep_map *lock, int read) which takes the same argument as lock_acquire(.read) and matches it to the held_lock instance. Use of this function should be gated by the debug_locks variable. When that is 0 the return value of the lock_is_held_type() function is undefined. This is done to allow both negative and positive tests for holding locks. By default we provide (positive) lockdep_assert_held{,_exclusive,_read}() macros. Requested-by: Christoph Hellwig Signed-off-by: Peter Zijlstra (Intel) Tested-by: Jens Axboe Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- include/linux/lockdep.h | 25 +++++++++++++++++++++++-- kernel/locking/lockdep.c | 20 ++++++++++++-------- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c1458fede1f964..1e327bb80838d8 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -338,9 +338,18 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); -#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) +/* + * Same "read" as for lock_acquire(), except -1 means any. + */ +extern int lock_is_held_type(struct lockdep_map *lock, int read); + +static inline int lock_is_held(struct lockdep_map *lock) +{ + return lock_is_held_type(lock, -1); +} -extern int lock_is_held(struct lockdep_map *lock); +#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) +#define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, @@ -372,6 +381,14 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); WARN_ON(debug_locks && !lockdep_is_held(l)); \ } while (0) +#define lockdep_assert_held_exclusive(l) do { \ + WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \ + } while (0) + +#define lockdep_assert_held_read(l) do { \ + WARN_ON(debug_locks && !lockdep_is_held_type(l, 1)); \ + } while (0) + #define lockdep_assert_held_once(l) do { \ WARN_ON_ONCE(debug_locks && !lockdep_is_held(l)); \ } while (0) @@ -428,7 +445,11 @@ struct lock_class_key { }; #define lockdep_depth(tsk) (0) +#define lockdep_is_held_type(l, r) (1) + #define lockdep_assert_held(l) do { (void)(l); } while (0) +#define lockdep_assert_held_exclusive(l) do { (void)(l); } while (0) +#define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_recursing(tsk) (0) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b395..cff580a6edf965 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3188,7 +3188,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; } -static int __lock_is_held(struct lockdep_map *lock); +static int __lock_is_held(struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. @@ -3329,7 +3329,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock)) + if (nest_lock && !__lock_is_held(nest_lock, -1)) return print_lock_nested_lock_not_held(curr, hlock, ip); if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) @@ -3576,7 +3576,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) return 1; } -static int __lock_is_held(struct lockdep_map *lock) +static int __lock_is_held(struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3584,8 +3584,12 @@ static int __lock_is_held(struct lockdep_map *lock) for (i = 0; i < curr->lockdep_depth; i++) { struct held_lock *hlock = curr->held_locks + i; - if (match_held_lock(hlock, lock)) - return 1; + if (match_held_lock(hlock, lock)) { + if (read == -1 || hlock->read == read) + return 1; + + return 0; + } } return 0; @@ -3769,7 +3773,7 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held(struct lockdep_map *lock) +int lock_is_held_type(struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; @@ -3781,13 +3785,13 @@ int lock_is_held(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - ret = __lock_is_held(lock); + ret = __lock_is_held(lock, read); current->lockdep_recursion = 0; raw_local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(lock_is_held); +EXPORT_SYMBOL_GPL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { From 6552321831dce87ff5c466a55b58d472732caadc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Nov 2016 14:33:25 +1100 Subject: [PATCH 39/63] xfs: remove i_iolock and use i_rwsem in the VFS inode instead This patch drops the XFS-own i_iolock and uses the VFS i_rwsem which recently replaced i_mutex instead. This means we only have to take one lock instead of two in many fast path operations, and we can also shrink the xfs_inode structure. Thanks to the xfs_ilock family there is very little churn, the only thing of note is that we need to switch to use the lock_two_directory helper for taking the i_rwsem on two inodes in a few places to make sure our lock order matches the one used in the VFS. Signed-off-by: Christoph Hellwig Tested-by: Jens Axboe Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 7 +--- fs/xfs/xfs_bmap_util.c | 12 +++--- fs/xfs/xfs_dir2_readdir.c | 2 - fs/xfs/xfs_file.c | 79 +++++++++++-------------------------- fs/xfs/xfs_icache.c | 6 +-- fs/xfs/xfs_inode.c | 82 ++++++++++++++++----------------------- fs/xfs/xfs_inode.h | 7 +--- fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_iops.c | 14 +++---- fs/xfs/xfs_pnfs.c | 7 +--- fs/xfs/xfs_pnfs.h | 4 +- fs/xfs/xfs_reflink.c | 14 +++---- fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_symlink.c | 7 ++-- 14 files changed, 86 insertions(+), 159 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index ab266d66124da0..e8f6c2bcd4a4f8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1585,7 +1585,6 @@ xfs_vm_bmap( struct xfs_inode *ip = XFS_I(inode); trace_xfs_vm_bmap(XFS_I(inode)); - xfs_ilock(ip, XFS_IOLOCK_SHARED); /* * The swap code (ab-)uses ->bmap to get a block mapping and then @@ -1593,12 +1592,10 @@ xfs_vm_bmap( * that on reflinks inodes, so we have to skip out here. And yes, * 0 is the magic code for a bmap error.. */ - if (xfs_is_reflink_inode(ip)) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (xfs_is_reflink_inode(ip)) return 0; - } + filemap_write_and_wait(mapping); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0670a8bd5818aa..b9abce524c33b1 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1935,8 +1935,8 @@ xfs_swap_extents( * page cache safely. Once we have done this we can take the ilocks and * do the rest of the checks. */ - lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); + lock_flags = XFS_MMAPLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ @@ -2076,15 +2076,13 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); + unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - -out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); - return error; + goto out_unlock; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 29816981b50a92..003a99b83bd884 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -677,7 +677,6 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; - xfs_ilock(dp, XFS_IOLOCK_SHARED); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) @@ -686,7 +685,6 @@ xfs_readdir( rval = xfs_dir2_block_getdents(&args, ctx); else rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return rval; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d818c160451f50..d054b73b56fbba 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -47,40 +47,6 @@ static const struct vm_operations_struct xfs_file_vm_ops; -/* - * Locking primitives for read and write IO paths to ensure we consistently use - * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. - */ -static inline void -xfs_rw_ilock( - struct xfs_inode *ip, - int type) -{ - if (type & XFS_IOLOCK_EXCL) - inode_lock(VFS_I(ip)); - xfs_ilock(ip, type); -} - -static inline void -xfs_rw_iunlock( - struct xfs_inode *ip, - int type) -{ - xfs_iunlock(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -static inline void -xfs_rw_ilock_demote( - struct xfs_inode *ip, - int type) -{ - xfs_ilock_demote(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - /* * Clear the specified ranges to zero through either the pagecache or DAX. * Holes and unwritten extents will be left as-is as they already are zeroed. @@ -273,7 +239,7 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); if (mapping->nrpages) { ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); if (ret) @@ -299,7 +265,7 @@ xfs_file_dio_aio_read( } out_unlock: - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -317,9 +283,9 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); return ret; @@ -335,9 +301,9 @@ xfs_file_buffered_aio_read( trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -418,15 +384,18 @@ xfs_file_aio_write_checks( if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock, true); + error = xfs_break_layouts(inode, iolock); if (error) return error; - /* For changing security info in file_remove_privs() we need i_mutex */ + /* + * For changing security info in file_remove_privs() we need i_rwsem + * exclusively. + */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); goto restart; } /* @@ -451,9 +420,9 @@ xfs_file_aio_write_checks( spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } /* @@ -559,7 +528,7 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -591,7 +560,7 @@ xfs_file_dio_aio_write( if (unaligned_io) inode_dio_wait(inode); else if (iolock == XFS_IOLOCK_EXCL) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -621,7 +590,7 @@ xfs_file_dio_aio_write( iov_iter_advance(from, ret); } out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); /* * No fallback to buffered IO on errors for XFS, direct IO will either @@ -643,7 +612,7 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -652,15 +621,13 @@ xfs_file_dax_write( count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); - ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } - out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return error ? error : ret; } @@ -677,7 +644,7 @@ xfs_file_buffered_aio_write( int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -721,7 +688,7 @@ xfs_file_buffered_aio_write( current->backing_dev_info = NULL; out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return ret; } @@ -797,7 +764,7 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9c3e5c6ddf20d1..ff4d6311c7f4b4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -70,8 +70,6 @@ xfs_inode_alloc( ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -394,8 +392,8 @@ xfs_iget_cache_hit( xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + init_rwsem(&inode->i_rwsem); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4e560e6a12c1cc..e9ab42d8965ba3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and - * the i_lock. This routine allows various combinations of the locks to be - * obtained. + * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 + * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, * the mmap lock second and the ilock last in order to prevent deadlock. * * Basic locking order: * - * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock * * mmap_sem locking order: * - * i_iolock -> page lock -> mmap_sem + * i_rwsem -> page lock -> mmap_sem * mmap_sem -> i_mmap_lock -> page_lock * * The difference in mmap_sem locking order mean that we cannot hold the * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can * fault in pages during copy in/out (for buffered IO) or require the mmap_sem * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because * page faults already hold the mmap_sem. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both * taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). @@ -191,10 +191,13 @@ xfs_ilock( (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_IOLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_IOLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_MMAPLOCK_EXCL) mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) + if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) goto out; } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) + if (!down_read_trylock(&VFS_I(ip)->i_rwsem)) goto out; } @@ -271,9 +274,9 @@ xfs_ilock_nowait( mrunlock_shared(&ip->i_mmaplock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); out: return 0; } @@ -310,9 +313,9 @@ xfs_iunlock( ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) mrunlock_excl(&ip->i_mmaplock); @@ -345,7 +348,7 @@ xfs_ilock_demote( if (lock_flags & XFS_MMAPLOCK_EXCL) mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); + downgrade_write(&VFS_I(ip)->i_rwsem); trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } @@ -370,8 +373,9 @@ xfs_isilocked( if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); + return !debug_locks || + lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); + return rwsem_is_locked(&VFS_I(ip)->i_rwsem); } ASSERT(0); @@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); - ASSERT(xfs_lockdep_subclass_ok(subclass + - XFS_IOLOCK_PARENT_VAL)); class += subclass << XFS_IOLOCK_SHIFT; - if (lock_mode & XFS_IOLOCK_PARENT) - class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { @@ -477,8 +477,6 @@ xfs_lock_inodes( XFS_ILOCK_EXCL)); ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | XFS_ILOCK_SHARED))); - ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || - inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || @@ -581,10 +579,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); - ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); @@ -715,7 +711,6 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - xfs_ilock(dp, XFS_IOLOCK_SHARED); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) goto out_unlock; @@ -724,14 +719,12 @@ xfs_lookup( if (error) goto out_free_name; - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return 0; out_free_name: if (ci_name) kmem_free(ci_name->name); out_unlock: - xfs_iunlock(dp, XFS_IOLOCK_SHARED); *ipp = NULL; return error; } @@ -1215,8 +1208,7 @@ xfs_create( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; xfs_defer_init(&dfops, &first_block); @@ -1252,7 +1244,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1325,7 +1317,7 @@ xfs_create( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -1466,11 +1458,10 @@ xfs_link( if (error) goto std_return; - xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow hard link @@ -2579,10 +2570,9 @@ xfs_remove( goto std_return; } - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* @@ -2963,12 +2953,6 @@ xfs_rename( * whether the target directory is the same as the source * directory, we can lock from 2 to 4 inodes. */ - if (!new_parent) - xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - else - xfs_lock_two_inodes(src_dp, target_dp, - XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* @@ -2976,9 +2960,9 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 71e8a81c91a371..10dcf27b4c85ae 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,7 +56,6 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_iolock; /* inode IO lock */ mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ @@ -333,7 +332,7 @@ static inline void xfs_ifunlock(struct xfs_inode *ip) * IOLOCK values * * 0-3 subclass value - * 4-7 PARENT subclass values + * 4-7 unused * * MMAPLOCK values * @@ -348,10 +347,8 @@ static inline void xfs_ifunlock(struct xfs_inode *ip) * */ #define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT_VAL 4 -#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1) +#define XFS_IOLOCK_MAX_SUBCLASS 3 #define XFS_IOLOCK_DEP_MASK 0x000f0000 -#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT) #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a39197501a7cd8..fc563b82aea65a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -639,7 +639,7 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 405a65cd9d6bcf..c962999a87ab7c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -983,15 +983,13 @@ xfs_vn_setattr( struct xfs_inode *ip = XFS_I(d_inode(dentry)); uint iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); - error = xfs_break_layouts(d_inode(dentry), &iolock, true); - if (!error) { - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_break_layouts(d_inode(dentry), &iolock); + if (error) + return error; - error = xfs_vn_setattr_size(dentry, iattr); - } - xfs_iunlock(ip, iolock); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_setattr_size(ip, iattr); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { error = xfs_vn_setattr_nonsize(dentry, iattr); } diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 93a7aafa56d6fd..2f2dc3c09ad008 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -32,8 +32,7 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock, - bool with_imutex) + uint *iolock) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -42,12 +41,8 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); - if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; - if (with_imutex) - inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index e8339f74966b18..b587cb99b2b761 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); +int xfs_break_layouts(struct inode *inode, uint *iolock); #else static inline int -xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) +xfs_break_layouts(struct inode *inode, uint *iolock) { return 0; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index becf2465dd2312..88fd03c66e990a 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1251,13 +1251,11 @@ xfs_reflink_remap_range( return -EIO; /* Lock both files against IO */ - if (same_inode) { - xfs_ilock(src, XFS_IOLOCK_EXCL); + lock_two_nondirectories(inode_in, inode_out); + if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); - } else { - xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + else xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); - } /* Don't touch certain kinds of inodes */ ret = -EPERM; @@ -1402,11 +1400,9 @@ xfs_reflink_remap_range( out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - xfs_iunlock(src, XFS_IOLOCK_EXCL); - if (src->i_ino != dest->i_ino) { + if (!same_inode) xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - xfs_iunlock(dest, XFS_IOLOCK_EXCL); - } + unlock_two_nondirectories(inode_in, inode_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ade4691e3f7403..563d1d146b8c3b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -943,7 +943,7 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 58142aeeeea69d..f2cb45ed1d54c3 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -238,8 +238,7 @@ xfs_symlink( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; /* @@ -287,7 +286,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; /* @@ -412,7 +411,7 @@ xfs_symlink( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } From ec1b826097f30858f9ed201cb78f1a762c50d0aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Nov 2016 14:33:53 +1100 Subject: [PATCH 40/63] fs: make sb_init_dio_done_wq available outside of direct-io.c We want to use the per-sb completion workqueue from the new iomap direct I/O code. Signed-off-by: Christoph Hellwig Tested-by: Jens Axboe Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/direct-io.c | 2 +- fs/internal.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index fb9aa16a772728..19aa448fde6aa0 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ -static int sb_init_dio_done_wq(struct super_block *sb) +int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", diff --git a/fs/internal.h b/fs/internal.h index f4da3341b4a37d..4fcf51766d4a6b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap_ops *ops, void *data, iomap_actor_t actor); + +/* direct-io.c: */ +int sb_init_dio_done_wq(struct super_block *sb); From ff6a9292e6f633d596826be5ba70d3ef90cc3300 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Nov 2016 14:36:01 +1100 Subject: [PATCH 41/63] iomap: implement direct I/O This adds a full fledget direct I/O implementation using the iomap interface. Full fledged in this case means all features are supported: AIO, vectored I/O, any iov_iter type including kernel pointers, bvecs and pipes, support for hole filling and async apending writes. It does not mean supporting all the warts of the old generic code. We expect i_rwsem to be held over the duration of the call, and we expect to maintain i_dio_count ourselves, and we pass on any kinds of mapping to the file system for now. The algorithm used is very simple: We use iomap_apply to iterate over the range of the I/O, and then we use the new bio_iov_iter_get_pages helper to lock down the user range for the size of the extent. bio_iov_iter_get_pages can currently lock down twice as many pages as the old direct I/O code did, which means that we will have a better batch factor for everything but overwrites of badly fragmented files. Signed-off-by: Christoph Hellwig Reviewed-by: Kent Overstreet Tested-by: Jens Axboe Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/iomap.c | 373 ++++++++++++++++++++++++++++++++++++++++++ include/linux/iomap.h | 11 ++ 2 files changed, 384 insertions(+) diff --git a/fs/iomap.c b/fs/iomap.c index 13dd413b2b9c6a..fc244624293540 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "internal.h" @@ -584,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); + +/* + * Private flags for iomap_dio, must not overlap with the public ones in + * iomap.h: + */ +#define IOMAP_DIO_WRITE (1 << 30) +#define IOMAP_DIO_DIRTY (1 << 31) + +struct iomap_dio { + struct kiocb *iocb; + iomap_dio_end_io_t *end_io; + loff_t i_size; + loff_t size; + atomic_t ref; + unsigned flags; + int error; + + union { + /* used during submission and for synchronous completion: */ + struct { + struct iov_iter *iter; + struct task_struct *waiter; + struct request_queue *last_queue; + blk_qc_t cookie; + } submit; + + /* used for aio completion: */ + struct { + struct work_struct work; + } aio; + }; +}; + +static ssize_t iomap_dio_complete(struct iomap_dio *dio) +{ + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (dio->end_io) { + ret = dio->end_io(iocb, + dio->error ? dio->error : dio->size, + dio->flags); + } else { + ret = dio->error; + } + + if (likely(!ret)) { + ret = dio->size; + /* check for short read */ + if (iocb->ki_pos + ret > dio->i_size && + !(dio->flags & IOMAP_DIO_WRITE)) + ret = dio->i_size - iocb->ki_pos; + iocb->ki_pos += ret; + } + + inode_dio_end(file_inode(iocb->ki_filp)); + kfree(dio); + + return ret; +} + +static void iomap_dio_complete_work(struct work_struct *work) +{ + struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); + struct kiocb *iocb = dio->iocb; + bool is_write = (dio->flags & IOMAP_DIO_WRITE); + ssize_t ret; + + ret = iomap_dio_complete(dio); + if (is_write && ret > 0) + ret = generic_write_sync(iocb, ret); + iocb->ki_complete(iocb, ret, 0); +} + +/* + * Set an error in the dio if none is set yet. We have to use cmpxchg + * as the submission context and the completion context(s) can race to + * update the error. + */ +static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) +{ + cmpxchg(&dio->error, 0, ret); +} + +static void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_error) + iomap_dio_set_error(dio, bio->bi_error); + + if (atomic_dec_and_test(&dio->ref)) { + if (is_sync_kiocb(dio->iocb)) { + struct task_struct *waiter = dio->submit.waiter; + + WRITE_ONCE(dio->submit.waiter, NULL); + wake_up_process(waiter); + } else if (dio->flags & IOMAP_DIO_WRITE) { + struct inode *inode = file_inode(dio->iocb->ki_filp); + + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + } else { + iomap_dio_complete_work(&dio->aio.work); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); + bio_put(bio); + } +} + +static blk_qc_t +iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, + unsigned len) +{ + struct page *page = ZERO_PAGE(0); + struct bio *bio; + + bio = bio_alloc(GFP_KERNEL, 1); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + get_page(page); + if (bio_add_page(bio, page, len, 0) != len) + BUG(); + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT); + + atomic_inc(&dio->ref); + return submit_bio(bio); +} + +static loff_t +iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct iomap_dio *dio = data; + unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); + unsigned fs_block_size = (1 << inode->i_blkbits), pad; + unsigned align = iov_iter_alignment(dio->submit.iter); + struct iov_iter iter; + struct bio *bio; + bool need_zeroout = false; + int nr_pages, ret; + + if ((pos | length | align) & ((1 << blkbits) - 1)) + return -EINVAL; + + switch (iomap->type) { + case IOMAP_HOLE: + if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) + return -EIO; + /*FALLTHRU*/ + case IOMAP_UNWRITTEN: + if (!(dio->flags & IOMAP_DIO_WRITE)) { + iov_iter_zero(length, dio->submit.iter); + dio->size += length; + return length; + } + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + break; + case IOMAP_MAPPED: + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; + if (iomap->flags & IOMAP_F_NEW) + need_zeroout = true; + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + /* + * Operate on a partial iter trimmed to the extent we were called for. + * We'll update the iter in the dio once we're done with this extent. + */ + iter = *dio->submit.iter; + iov_iter_truncate(&iter, length); + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + if (nr_pages <= 0) + return nr_pages; + + if (need_zeroout) { + /* zero out from the start of the block to the write offset */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos - pad, pad); + } + + do { + if (dio->error) + return 0; + + bio = bio_alloc(GFP_KERNEL, nr_pages); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, &iter); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + + if (dio->flags & IOMAP_DIO_WRITE) { + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT); + task_io_account_write(bio->bi_iter.bi_size); + } else { + bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); + } + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + + atomic_inc(&dio->ref); + + dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.cookie = submit_bio(bio); + } while (nr_pages); + + if (need_zeroout) { + /* zero out from the end of the write to the end of the block */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + } + + iov_iter_advance(dio->submit.iter, length); + return length; +} + +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops, + iomap_dio_end_io_t end_io) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = file_inode(iocb->ki_filp); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0; + unsigned int flags = IOMAP_DIRECT; + struct blk_plug plug; + struct iomap_dio *dio; + + lockdep_assert_held(&inode->i_rwsem); + + if (!count) + return 0; + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + if (!dio) + return -ENOMEM; + + dio->iocb = iocb; + atomic_set(&dio->ref, 1); + dio->size = 0; + dio->i_size = i_size_read(inode); + dio->end_io = end_io; + dio->error = 0; + dio->flags = 0; + + dio->submit.iter = iter; + if (is_sync_kiocb(iocb)) { + dio->submit.waiter = current; + dio->submit.cookie = BLK_QC_T_NONE; + dio->submit.last_queue = NULL; + } + + if (iov_iter_rw(iter) == READ) { + if (pos >= dio->i_size) + goto out_free_dio; + + if (iter->type == ITER_IOVEC) + dio->flags |= IOMAP_DIO_DIRTY; + } else { + dio->flags |= IOMAP_DIO_WRITE; + flags |= IOMAP_WRITE; + } + + if (mapping->nrpages) { + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); + if (ret) + goto out_free_dio; + + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; + } + + inode_dio_begin(inode); + + blk_start_plug(&plug); + do { + ret = iomap_apply(inode, pos, count, flags, ops, dio, + iomap_dio_actor); + if (ret <= 0) { + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) + ret = 0; + break; + } + pos += ret; + } while ((count = iov_iter_count(iter)) > 0); + blk_finish_plug(&plug); + + if (ret < 0) + iomap_dio_set_error(dio, ret); + + if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + iomap_dio_set_error(dio, ret); + } + + if (!atomic_dec_and_test(&dio->ref)) { + if (!is_sync_kiocb(iocb)) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->submit.waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !dio->submit.last_queue || + !blk_poll(dio->submit.last_queue, + dio->submit.cookie)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + } + + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (iov_iter_rw(iter) == WRITE && mapping->nrpages) { + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + return iomap_dio_complete(dio); + +out_free_dio: + kfree(dio); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f185156de74d82..a4c94b86401e16 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -50,6 +50,7 @@ struct iomap { #define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ +#define IOMAP_DIRECT (1 << 4) /* direct I/O */ struct iomap_ops { /* @@ -83,4 +84,14 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, struct iomap_ops *ops); +/* + * Flags for direct I/O ->end_io: + */ +#define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ +#define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ +typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, + unsigned flags); +ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops, iomap_dio_end_io_t end_io); + #endif /* LINUX_IOMAP_H */ From acdda3aae146d9b69d30e9d8a32a8d8937055523 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Nov 2016 14:37:15 +1100 Subject: [PATCH 42/63] xfs: use iomap_dio_rw Straight switch over to using iomap for direct I/O - we already have the non-COW dio path in write_begin for DAX and files with extent size hints, so nothing to add there. The COW path is ported over from the old get_blocks version and a bit of a mess, but I have some work in progress to make it look more like the buffered I/O COW path. This gets rid of xfs_get_blocks_direct and the last caller of xfs_get_blocks with the create flag set, so all that code can be removed. Last but not least I've removed a comment in xfs_filemap_fault that refers to xfs_get_blocks entirely instead of updating it - while the reference is correct, the whole DAX fault path looks different than the non-DAX one, so it seems rather pointless. Signed-off-by: Christoph Hellwig Tested-by: Jens Axboe Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 291 ++------------------------------------------- fs/xfs/xfs_aops.h | 6 - fs/xfs/xfs_file.c | 149 ++++++++++------------- fs/xfs/xfs_iomap.c | 50 ++++++-- 4 files changed, 110 insertions(+), 386 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e8f6c2bcd4a4f8..265000a093277e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -37,11 +37,6 @@ #include #include -/* flags for direct write completions */ -#define XFS_DIO_FLAG_UNWRITTEN (1 << 0) -#define XFS_DIO_FLAG_APPEND (1 << 1) -#define XFS_DIO_FLAG_COW (1 << 2) - /* * structure owned by writepages passed to individual writepage calls */ @@ -1175,45 +1170,6 @@ xfs_vm_releasepage( return try_to_free_buffers(page); } -/* - * When we map a DIO buffer, we may need to pass flags to - * xfs_end_io_direct_write to tell it what kind of write IO we are doing. - * - * Note that for DIO, an IO to the highest supported file block offset (i.e. - * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 - * bit variable. Hence if we see this overflow, we have to assume that the IO is - * extending the file size. We won't know for sure until IO completion is run - * and the actual max write offset is communicated to the IO completion - * routine. - */ -static void -xfs_map_direct( - struct inode *inode, - struct buffer_head *bh_result, - struct xfs_bmbt_irec *imap, - xfs_off_t offset, - bool is_cow) -{ - uintptr_t *flags = (uintptr_t *)&bh_result->b_private; - xfs_off_t size = bh_result->b_size; - - trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : - XFS_IO_OVERWRITE, imap); - - if (ISUNWRITTEN(imap)) { - *flags |= XFS_DIO_FLAG_UNWRITTEN; - set_buffer_defer_completion(bh_result); - } else if (is_cow) { - *flags |= XFS_DIO_FLAG_COW; - set_buffer_defer_completion(bh_result); - } - if (offset + size > i_size_read(inode) || offset + size < 0) { - *flags |= XFS_DIO_FLAG_APPEND; - set_buffer_defer_completion(bh_result); - } -} - /* * If this is O_DIRECT or the mpage code calling tell them how large the mapping * is, so that we can avoid repeated get_blocks calls. @@ -1254,51 +1210,12 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } -/* Bounce unaligned directio writes to the page cache. */ static int -xfs_bounce_unaligned_dio_write( - struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap) -{ - struct xfs_bmbt_irec irec; - xfs_fileoff_t delta; - bool shared; - bool x; - int error; - - irec = *imap; - if (offset_fsb > irec.br_startoff) { - delta = offset_fsb - irec.br_startoff; - irec.br_blockcount -= delta; - irec.br_startblock += delta; - irec.br_startoff = offset_fsb; - } - error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); - if (error) - return error; - - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If any part - * of the extent is shared, fall back to buffered mode to handle - * the RMW. This is done by returning -EREMCHG ("remote addr - * changed"), which is caught further up the call stack. - */ - if (shared) { - trace_xfs_reflink_bounce_dio_write(ip, imap); - return -EREMCHG; - } - return 0; -} - -STATIC int -__xfs_get_blocks( +xfs_get_blocks( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, - int create, - bool direct) + int create) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1309,10 +1226,8 @@ __xfs_get_blocks( int nimaps = 1; xfs_off_t offset; ssize_t size; - int new = 0; - bool is_cow = false; - BUG_ON(create && !direct); + BUG_ON(create); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1321,7 +1236,7 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && offset >= i_size_read(inode)) + if (offset >= i_size_read(inode)) return 0; /* @@ -1336,73 +1251,12 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - if (create && direct && xfs_is_reflink_inode(ip)) { - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); - ASSERT(!is_cow || !isnullstartblock(imap.br_startblock)); - } - - if (!is_cow) { - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); - /* - * Truncate an overwrite extent if there's a pending CoW - * reservation before the end of this extent. This - * forces us to come back to get_blocks to take care of - * the CoW. - */ - if (create && direct && nimaps && - imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(&imap)) - xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, - &imap); - } + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; - /* - * The only time we can ever safely find delalloc blocks on direct I/O - * is a dio write to post-eof speculative preallocation. All other - * scenarios are indicative of a problem or misuse (such as mixing - * direct and mapped I/O). - * - * The file may be unmapped by the time we get here so we cannot - * reliably fail the I/O based on mapping. Instead, fail the I/O if this - * is a read or a write within eof. Otherwise, carry on but warn as a - * precuation if the file happens to be mapped. - */ - if (direct && imap.br_startblock == DELAYSTARTBLOCK) { - if (!create || offset < i_size_read(VFS_I(ip))) { - WARN_ON_ONCE(1); - error = -EIO; - goto out_unlock; - } - WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping)); - } - - /* for DAX, we convert unwritten extents directly */ - if (create && - (!nimaps || - (imap.br_startblock == HOLESTARTBLOCK || - imap.br_startblock == DELAYSTARTBLOCK) || - (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; - - trace_xfs_get_blocks_alloc(ip, offset, size, - ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN - : XFS_IO_DELALLOC, &imap); - } else if (nimaps) { + if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); @@ -1412,12 +1266,6 @@ __xfs_get_blocks( goto out_unlock; } - if (IS_DAX(inode) && create) { - ASSERT(!ISUNWRITTEN(&imap)); - /* zeroing is not needed at a higher layer */ - new = 0; - } - /* trim mapping down to size requested */ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); @@ -1427,43 +1275,14 @@ __xfs_get_blocks( */ if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && - (create || !ISUNWRITTEN(&imap))) { - if (create && direct && !is_cow) { - error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, - &imap); - if (error) - return error; - } - + !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); - if (ISUNWRITTEN(&imap)) - set_buffer_unwritten(bh_result); - /* direct IO needs special help */ - if (create) - xfs_map_direct(inode, bh_result, &imap, offset, is_cow); - } /* * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ bh_result->b_bdev = xfs_find_bdev_for_inode(inode); - - /* - * If we previously allocated a block out beyond eof and we are now - * coming back to use it then we will need to flag it as new even if it - * has a disk address. - * - * With sub-block writes into unwritten extents we also need to mark - * the buffer as new so that the unwritten parts of the buffer gets - * correctly zeroed. - */ - if (create && - ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || - (offset >= i_size_read(inode)) || - (new || ISUNWRITTEN(&imap)))) - set_buffer_new(bh_result); - return 0; out_unlock: @@ -1471,100 +1290,6 @@ __xfs_get_blocks( return error; } -int -xfs_get_blocks( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, false); -} - -int -xfs_get_blocks_direct( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true); -} - -/* - * Complete a direct I/O write request. - * - * xfs_map_direct passes us some flags in the private data to tell us what to - * do. If no flags are set, then the write IO is an overwrite wholly within - * the existing allocated file size and so there is nothing for us to do. - * - * Note that in this case the completion can be called in interrupt context, - * whereas if we have flags set we will always be called in task context - * (i.e. from a workqueue). - */ -int -xfs_end_io_direct_write( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - uintptr_t flags = (uintptr_t)private; - int error = 0; - - trace_xfs_end_io_direct_write(ip, offset, size); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - if (size <= 0) - return size; - - /* - * The flags tell us whether we are doing unwritten extent conversions - * or an append transaction that updates the on-disk file size. These - * cases are the only cases where we should *potentially* be needing - * to update the VFS inode size. - */ - if (flags == 0) { - ASSERT(offset + size <= i_size_read(inode)); - return 0; - } - - /* - * We need to update the in-core inode size here so that we don't end up - * with the on-disk inode size being outside the in-core inode size. We - * have no other method of updating EOF for AIO, so always do it here - * if necessary. - * - * We need to lock the test/set EOF update as we can be racing with - * other IO completions here to update the EOF. Failing to serialise - * here can result in EOF moving backwards and Bad Things Happen when - * that occurs. - */ - spin_lock(&ip->i_flags_lock); - if (offset + size > i_size_read(inode)) - i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); - - if (flags & XFS_DIO_FLAG_COW) - error = xfs_reflink_end_cow(ip, offset, size); - if (flags & XFS_DIO_FLAG_UNWRITTEN) { - trace_xfs_end_io_direct_write_unwritten(ip, offset, size); - - error = xfs_iomap_write_unwritten(ip, offset, size); - } - if (flags & XFS_DIO_FLAG_APPEND) { - trace_xfs_end_io_direct_write_append(ip, offset, size); - - error = xfs_setfilesize(ip, offset, size); - } - - return error; -} - STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 34dc00dfb91d80..cc174ec6c2fd08 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -55,12 +55,6 @@ struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; -int xfs_get_blocks(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_direct(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private); int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d054b73b56fbba..f5effa68e037ae 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -210,62 +210,21 @@ xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - loff_t isize = i_size_read(inode); + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); size_t count = iov_iter_count(to); - loff_t end = iocb->ki_pos + count - 1; - struct iov_iter data; - struct xfs_buftarg *target; - ssize_t ret = 0; + ssize_t ret; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); if (!count) return 0; /* skip atime */ - if (XFS_IS_REALTIME_INODE(ip)) - target = ip->i_mount->m_rtdev_targp; - else - target = ip->i_mount->m_ddev_targp; - - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } - file_accessed(iocb->ki_filp); xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out_unlock; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. - */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } - - data = *to; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); - if (ret >= 0) { - iocb->ki_pos += ret; - iov_iter_advance(to, ret); - } - -out_unlock: + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; } @@ -465,6 +424,58 @@ xfs_file_aio_write_checks( return 0; } +static int +xfs_dio_write_end_io( + struct kiocb *iocb, + ssize_t size, + unsigned flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + loff_t offset = iocb->ki_pos; + bool update_size = false; + int error = 0; + + trace_xfs_end_io_direct_write(ip, offset, size); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (size <= 0) + return size; + + /* + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + */ + spin_lock(&ip->i_flags_lock); + if (offset + size > i_size_read(inode)) { + i_size_write(inode, offset + size); + update_size = true; + } + spin_unlock(&ip->i_flags_lock); + + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + if (flags & IOMAP_DIO_UNWRITTEN) + error = xfs_iomap_write_unwritten(ip, offset, size); + else if (update_size) + error = xfs_setfilesize(ip, offset, size); + + return error; +} + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -504,9 +515,7 @@ xfs_file_dio_aio_write( int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); - loff_t end; - struct iov_iter data; - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ @@ -534,23 +543,6 @@ xfs_file_dio_aio_write( if (ret) goto out; count = iov_iter_count(from); - end = iocb->ki_pos + count - 1; - - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. - */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } /* * If we are doing unaligned IO, wait for all other IO to drain, @@ -573,22 +565,7 @@ xfs_file_dio_aio_write( goto out; } - data = *from; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - - /* see generic_file_direct_write() for why this is necessary */ - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } - - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(from, ret); - } + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: xfs_iunlock(ip, iolock); @@ -1468,15 +1445,9 @@ xfs_filemap_fault( return xfs_filemap_page_mkwrite(vma, vmf); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) { - /* - * we do not want to trigger unwritten extent conversion on read - * faults - that is unnecessary overhead and would also require - * changes to xfs_get_blocks_direct() to map unwritten extent - * ioend for conversion on read-only mappings. - */ + if (IS_DAX(inode)) ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); - } else + else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 15a83813b7085a..0d147428971e0c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -950,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && ISUNWRITTEN(imap)); } +static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) +{ + /* + * COW writes will allocate delalloc space, so we need to make sure + * to take the lock exclusively here. + */ + if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) + return true; + if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) + return true; + return false; +} + static int xfs_file_iomap_begin( struct inode *inode, @@ -969,18 +982,14 @@ xfs_file_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && - !xfs_get_extsz_hint(ip)) { + if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && + !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ return xfs_file_iomap_begin_delay(inode, offset, length, flags, iomap); } - /* - * COW writes will allocate delalloc space, so we need to make sure - * to take the lock exclusively here. - */ - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + if (need_excl_ilock(ip, flags)) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); } else { @@ -993,17 +1002,41 @@ xfs_file_iomap_begin( offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); + if (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) { + shared = xfs_reflink_find_cow_mapping(ip, offset, &imap); + if (shared) { + xfs_iunlock(ip, lockmode); + goto alloc_done; + } + ASSERT(!isnullstartblock(imap.br_startblock)); + } + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if (flags & IOMAP_REPORT) { + if ((flags & IOMAP_REPORT) || + (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) { /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) goto out_unlock; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If the + * extent is shared, fall back to buffered mode to handle the + * RMW. + */ + if (!(flags & IOMAP_REPORT) && shared) { + trace_xfs_reflink_bounce_dio_write(ip, &imap); + error = -EREMCHG; + goto out_unlock; + } } if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { @@ -1038,6 +1071,7 @@ xfs_file_iomap_begin( if (error) return error; +alloc_done: iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { From 6b10b23ca94451fae153a5cc8d62fd721bec2019 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:31:06 +1100 Subject: [PATCH 43/63] xfs: set AGI buffer type in xlog_recover_clear_agi_bucket xlog_recover_clear_agi_bucket didn't set the type to XFS_BLFT_AGI_BUF, so we got a warning during log replay (or an ASSERT on a debug build). XFS (md0): Unknown buffer type 0! XFS (md0): _xfs_buf_ioapply: no ops on block 0xaea8802/0x1 Fix this, as was done in f19b872b for 2 other locations with the same problem. cc: # 3.10 to current Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d9c..2d91f5ab753811 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4929,6 +4929,7 @@ xlog_recover_clear_agi_bucket( agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); From 200237d6746faaeaf7f4ff4abbf13f3917cee60a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:31:31 +1100 Subject: [PATCH 44/63] xfs: Move AGI buffer type setting to xfs_read_agi We've missed properly setting the buffer type for an AGI transaction in 3 spots now, so just move it into xfs_read_agi() and set it if we are in a transaction to avoid the problem in the future. This is similar to how it is done in i.e. the dir3 and attr3 read functions. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_ialloc.c | 4 ++-- fs/xfs/xfs_inode.c | 2 -- fs/xfs/xfs_log_recover.c | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 51b4e0de1fdc42..c482b9716347c6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2450,8 +2450,6 @@ xfs_ialloc_log_agi( ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); - /* * Compute byte offsets for the first and last fields in the first * region and log the agi buffer. This only logs up through @@ -2592,6 +2590,8 @@ xfs_read_agi( XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; + if (tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4e560e6a12c1cc..512ff13ed66a95 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2041,7 +2041,6 @@ xfs_iunlink( agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); return 0; @@ -2133,7 +2132,6 @@ xfs_iunlink_remove( agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); } else { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 2d91f5ab753811..9b3d7c76915d9c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4929,7 +4929,6 @@ xlog_recover_clear_agi_bucket( agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); From 7710517fc37b1899722707883b54694ea710b3c0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:31:50 +1100 Subject: [PATCH 45/63] xfs: pass state not whichfork to trace_xfs_extlist When xfs_bmap_trace_exlist called trace_xfs_extlist, it sent in the "whichfork" var instead of the bmap "state" as expected (even though state was already set up for this purpose). As a result, the xfs_bmap_class in tracing code used "whichfork" not state in xfs_iext_state_to_fork(), and got the wrong ifork pointer. It all goes downhill from there, including an ASSERT when ifp_bytes is empty by the time it reaches xfs_iext_get_ext(): XFS: Assertion failed: idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6b7e6eb2941453..e4120fcefcc893 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -531,7 +531,7 @@ xfs_bmap_trace_exlist( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(cnt == xfs_iext_count(ifp)); for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); + trace_xfs_extlist(ip, idx, state, caller_ip); } /* From c44a1f22626c153976289e1cd67bdcdfefc16e1f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:32:00 +1100 Subject: [PATCH 46/63] xfs: handle cow fork in xfs_bmap_trace_exlist By inspection, xfs_bmap_trace_exlist isn't handling cow forks, and will trace the data fork instead. Fix this by setting state appropriately if whichfork == XFS_COW_FORK. ()___() < @ @ > | | {o_o} (|) Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e4120fcefcc893..23aa70b2790cfa 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -518,7 +518,7 @@ void xfs_bmap_trace_exlist( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ + int whichfork, /* data or attr or cow fork */ unsigned long caller_ip) { xfs_extnum_t idx; /* extent record index */ @@ -527,6 +527,8 @@ xfs_bmap_trace_exlist( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(cnt == xfs_iext_count(ifp)); From f7a136aee3c1c3f7daf87197b3b3c361744a2812 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:32:14 +1100 Subject: [PATCH 47/63] xfs: several xattr functions can be void There are a handful of xattr functions which now return nothing but zero. They can be made void, chased through calling functions, and error handling etc can be removed. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.h | 2 +- fs/xfs/xfs_attr.h | 4 +-- fs/xfs/xfs_attr_list.c | 59 +++++++++++++---------------------- fs/xfs/xfs_xattr.c | 23 ++++++-------- 4 files changed, 35 insertions(+), 53 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 4f2aed04f8273b..91f51637f8af07 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -77,7 +77,7 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr3_leaf_list_int(struct xfs_buf *bp, +void xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e3da5d448bcff5..d14691aa02b44f 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,8 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ -/* Return 0 on success, or -errno; other state communicated via *context */ -typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, +/* void; state communicated via *context */ +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); typedef struct xfs_attr_list_context { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 25e76cd6c05336..97c45b6eb91ec3 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -74,7 +74,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_sf_entry_t *sfe; xfs_inode_t *dp; int sbsize, nsbuf, count, i; - int error; ASSERT(context != NULL); dp = context->dp; @@ -102,13 +101,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) (XFS_ISRESET_CURSOR(cursor) && (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - error = context->put_listent(context, - sfe->flags, - sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen); - if (error) - return error; + context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen); /* * Either search callback finished early or * didn't fit it all in the buffer after all. @@ -193,15 +190,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor->hashval = sbp->hash; cursor->offset = 0; } - error = context->put_listent(context, - sbp->flags, - sbp->name, - sbp->namelen, - sbp->valuelen); - if (error) { - kmem_free(sbuf); - return error; - } + context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen); if (context->seen_enough) break; cursor->offset++; @@ -335,11 +328,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - error = xfs_attr3_leaf_list_int(bp, context); - if (error) { - xfs_trans_brelse(NULL, bp); - return error; - } + xfs_attr3_leaf_list_int(bp, context); xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; @@ -356,7 +345,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ -int +void xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) @@ -366,7 +355,6 @@ xfs_attr3_leaf_list_int( struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; struct xfs_attr_leaf_entry *entry; - int retval; int i; struct xfs_mount *mp = context->dp->i_mount; @@ -399,7 +387,7 @@ xfs_attr3_leaf_list_int( } if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return 0; + return; } } else { entry = &entries[0]; @@ -410,7 +398,6 @@ xfs_attr3_leaf_list_int( /* * We have found our place, start copying out the new attributes. */ - retval = 0; for (; i < ichdr.count; entry++, i++) { char *name; int namelen, valuelen; @@ -439,16 +426,14 @@ xfs_attr3_leaf_list_int( valuelen = be32_to_cpu(name_rmt->valuelen); } - retval = context->put_listent(context, entry->flags, + context->put_listent(context, entry->flags, name, namelen, valuelen); - if (retval) - break; if (context->seen_enough) break; cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return retval; + return; } /* @@ -467,9 +452,9 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) if (error) return error; - error = xfs_attr3_leaf_list_int(bp, context); + xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); - return error; + return 0; } int @@ -513,7 +498,7 @@ xfs_attr_list_int( * Take care to check values and protect against them changing later, * we may be reading them directly out of a user buffer. */ -STATIC int +STATIC void xfs_attr_put_listent( xfs_attr_list_context_t *context, int flags, @@ -536,10 +521,10 @@ xfs_attr_put_listent( */ if (((context->flags & ATTR_SECURE) == 0) != ((flags & XFS_ATTR_SECURE) == 0)) - return 0; + return; if (((context->flags & ATTR_ROOT) == 0) != ((flags & XFS_ATTR_ROOT) == 0)) - return 0; + return; arraytop = sizeof(*alist) + context->count * sizeof(alist->al_offset[0]); @@ -548,7 +533,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 0; + return; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; @@ -558,7 +543,7 @@ xfs_attr_put_listent( alist->al_offset[context->count++] = context->firstu; alist->al_count = context->count; trace_xfs_attr_list_add(context); - return 0; + return; } /* diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 62900938f26d33..0594db43597250 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -130,7 +130,7 @@ const struct xattr_handler *xfs_xattr_handlers[] = { NULL }; -static int +static void __xfs_xattr_put_listent( struct xfs_attr_list_context *context, char *prefix, @@ -148,7 +148,7 @@ __xfs_xattr_put_listent( if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ context->seen_enough = 1; - return 0; + return; } offset = (char *)context->alist + context->count; strncpy(offset, prefix, prefix_len); @@ -159,10 +159,10 @@ __xfs_xattr_put_listent( compute_size: context->count += prefix_len + namelen + 1; - return 0; + return; } -static int +static void xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, @@ -180,23 +180,19 @@ xfs_xattr_put_listent( if (namelen == SGI_ACL_FILE_SIZE && strncmp(name, SGI_ACL_FILE, SGI_ACL_FILE_SIZE) == 0) { - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_ACCESS, strlen(XATTR_POSIX_ACL_ACCESS)); - if (ret) - return ret; } else if (namelen == SGI_ACL_DEFAULT_SIZE && strncmp(name, SGI_ACL_DEFAULT, SGI_ACL_DEFAULT_SIZE) == 0) { - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_DEFAULT, strlen(XATTR_POSIX_ACL_DEFAULT)); - if (ret) - return ret; } #endif @@ -205,7 +201,7 @@ xfs_xattr_put_listent( * see them. */ if (!capable(CAP_SYS_ADMIN)) - return 0; + return; prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; @@ -217,8 +213,9 @@ xfs_xattr_put_listent( prefix_len = XATTR_USER_PREFIX_LEN; } - return __xfs_xattr_put_listent(context, prefix, prefix_len, name, - namelen); + __xfs_xattr_put_listent(context, prefix, prefix_len, name, + namelen); + return; } ssize_t From d2a047f31e86941fa896e0e3271536d50aba415e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:32:50 +1100 Subject: [PATCH 48/63] xfs: forbid AG btrees with level == 0 There is no such thing as a zero-level AG btree since even a single-node zero-records btree has one level. Btree cursor constructors read cur_nlevels straight from disk and then access things like cur_bufs[cur_nlevels - 1] which is /really/ bad if cur_nlevels is zero! Therefore, strengthen the verifiers to prevent this possibility. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 10 +++++++--- fs/xfs/libxfs/xfs_ialloc.c | 9 ++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index effb64cf714fee..5050056a0b0644 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2455,12 +2455,15 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) return false; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return false; /* @@ -2477,7 +2480,8 @@ xfs_agf_verify( return false; if (xfs_sb_version_hasreflink(&mp->m_sb) && - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return false; return true;; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index c482b9716347c6..d45c03779dae39 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2510,8 +2510,15 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return false; - if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + if (be32_to_cpu(agi->agi_level) < 1 || + be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) return false; + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + (be32_to_cpu(agi->agi_free_level) < 1 || + be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) + return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't From bb3be7e7c1c18e1b141d4cadeb98cc89ecf78099 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:33:54 +1100 Subject: [PATCH 49/63] xfs: check for bogus values in btree block headers When we're reading a btree block, make sure that what we retrieved matches the owner and level; and has a plausible number of records. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0e80993c8a5914..21e6a6ab6b9a40 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block( if (error) return error; + /* Check the inode owner since the verifiers don't. */ + if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + be64_to_cpu((*blkp)->bb_u.l.bb_owner) != + cur->bc_private.b.ip->i_ino) + goto out_bad; + + /* Did we get the level we were looking for? */ + if (be16_to_cpu((*blkp)->bb_level) != level) + goto out_bad; + + /* Check that internal nodes have at least one record. */ + if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) + goto out_bad; + xfs_btree_setbuf(cur, level, bp); return 0; + +out_bad: + *blkp = NULL; + xfs_trans_brelse(cur->bc_tp, bp); + return -EFSCORRUPTED; } /* From 356a3225222e5bc4df88aef3419fb6424f18ab69 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:36:56 +1100 Subject: [PATCH 50/63] xfs: complain if we don't get nextents bmap records When reading into memory all extents of a btree-format inode fork, complain if the number of extents we find is not the same as the number of extents reported in the inode core. This is needed to stop an IO action from accessing the garbage areas of the in-core fork. [dchinner: removed redundant assert] Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 23aa70b2790cfa..829ad632533b80 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1377,8 +1377,9 @@ xfs_bmap_read_extents( return error; block = XFS_BUF_TO_BLOCK(bp); } + if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) + return -EFSCORRUPTED; ASSERT(i == xfs_iext_count(ifp)); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; error0: From 96a3aefb8ffde23180130460b0b2407b328eb727 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:37:47 +1100 Subject: [PATCH 51/63] xfs: don't crash if reading a directory results in an unexpected hole In xfs_dir3_data_read, we can encounter the situation where err == 0 and *bpp == NULL if the given bno offset happens to be a hole; this leads to a crash if we try to set the buffer type after the _da_read_buf call. Holes can happen due to corrupt or malicious entries in the bmbt data, so be a little more careful when we're handling buffers. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2_data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 725fc7841fdeb3..e526f5a5f0beae 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -329,7 +329,7 @@ xfs_dir3_data_read( err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } From 0f352f8ee8412bd9d34fb2a6411241da61175c0e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:38:11 +1100 Subject: [PATCH 52/63] xfs: error out if trying to add attrs and anextents > 0 We shouldn't assert if somehow we end up trying to add an attr fork to an inode that apparently already has attr extents because this is an indication of on-disk corruption. Instead, return an error code to userspace. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 829ad632533b80..29ffc0569ce19e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1153,6 +1153,10 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; + if (ip->i_d.di_anextents != 0) { + error = -EFSCORRUPTED; + goto trans_cancel; + } if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. @@ -1160,7 +1164,6 @@ xfs_bmap_add_attrfork( ASSERT(ip->i_d.di_aformat == 0); ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } - ASSERT(ip->i_d.di_anextents == 0); xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); From ef388e2054feedaeb05399ed654bdb06f385d294 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:38:38 +1100 Subject: [PATCH 53/63] xfs: don't allow di_size with high bit set The on-disk field di_size is used to set i_size, which is a signed integer of loff_t. If the high bit of di_size is set, we'll end up with a negative i_size, which will cause all sorts of problems. Since the VFS won't let us create a file with such length, we should catch them here in the verifier too. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 134424fac434fd..c906e50515f031 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -392,6 +392,14 @@ xfs_dinode_verify( if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; + /* don't allow invalid i_size */ + if (be64_to_cpu(dip->di_size) & (1ULL << 63)) + return false; + + /* No zero-length symlinks. */ + if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0) + return false; + /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return true; From 1bb33a98702d8360947f18a44349df75ba555d5d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:38:57 +1100 Subject: [PATCH 54/63] xfs: don't cap maximum dedupe request length After various discussions on linux-fsdevel, it has been decided that it is not necessary to cap the length of a dedupe request, and that correctly-written userspace client programs will be able to absorb the change. Therefore, remove the length clamping behavior. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6e4f7f900fea4c..9a5d64b5f35a1d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -939,7 +939,6 @@ xfs_file_clone_range( len, false); } -#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) STATIC ssize_t xfs_file_dedupe_range( struct file *src_file, @@ -950,14 +949,6 @@ xfs_file_dedupe_range( { int error; - /* - * Limit the total length we will dedupe for each operation. - * This is intended to bound the total time spent in this - * ioctl to something sane. - */ - if (len > XFS_MAX_DEDUPE_LEN) - len = XFS_MAX_DEDUPE_LEN; - error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) From 11ef38afe98cc7ad1a46ef24945232ec1760d5e2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 5 Dec 2016 14:38:58 +1100 Subject: [PATCH 55/63] xfs: make xfs btree stats less huge Embedding a switch statement in every btree stats inc/add adds a lot of code overhead to the core btree infrastructure paths. Stats are supposed to be small and lightweight, but the btree stats have become big and bloated as we've added more btrees. It needs fixing because the reflink code will just add more overhead again. Convert the v2 btree stats to arrays instead of independent variables, and instead use the type to index the specific btree array via an enum. This allows us to use array based indexing to update the stats, rather than having to derefence variables specific to the btree type. If we then wrap the xfsstats structure in a union and place uint32_t array beside it, and calculate the correct btree stats array base array index when creating a btree cursor, we can easily access entries in the stats structure without having to switch names based on the btree type. We then replace with the switch statement with a simple set of stats wrapper macros, resulting in a significant simplification of the btree stats code, and: text data bss dec hex filename 48905 144 8 49057 bfa1 fs/xfs/libxfs/xfs_btree.o.old 36793 144 8 36945 9051 fs/xfs/libxfs/xfs_btree.o it reduces the core btree infrastructure code size by close to 25%! Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc_btree.c | 4 + fs/xfs/libxfs/xfs_bmap_btree.c | 1 + fs/xfs/libxfs/xfs_btree.h | 43 +------ fs/xfs/libxfs/xfs_ialloc_btree.c | 2 + fs/xfs/libxfs/xfs_refcount_btree.c | 1 + fs/xfs/libxfs/xfs_rmap_btree.c | 1 + fs/xfs/xfs_stats.c | 10 +- fs/xfs/xfs_stats.h | 200 ++++++++++++----------------- 8 files changed, 99 insertions(+), 163 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 5ba2dac5e67c49..44cfcd03c451d1 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -428,6 +428,10 @@ xfs_allocbt_init_cursor( cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_BNO) + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + else + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8007d2ba9aef9c..94ad31d372ab95 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -803,6 +803,7 @@ xfs_bmbt_init_cursor( cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_btnum = XFS_BTNUM_BMAP; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c2b01d1c79ee3e..b69b947c4c1bd5 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -96,46 +96,10 @@ union xfs_btree_rec { /* * Generic stats interface */ -#define __XFS_BTREE_STATS_INC(mp, type, stat) \ - XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat) #define XFS_BTREE_STATS_INC(cur, stat) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ - case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ - case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) - -#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \ - XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val) -#define XFS_BTREE_STATS_ADD(cur, stat, val) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: \ - __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \ - case XFS_BTNUM_CNT: \ - __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \ - case XFS_BTNUM_BMAP: \ - __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \ - case XFS_BTNUM_INO: \ - __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ - case XFS_BTNUM_FINO: \ - __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ - case XFS_BTNUM_RMAP: \ - __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ - case XFS_BTNUM_REFC: \ - __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) + XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ + XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) #define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ @@ -253,6 +217,7 @@ typedef struct xfs_btree_cur __uint8_t bc_nlevels; /* number of levels in the tree */ __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ + int bc_statoff; /* offset of btre stats array */ union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index eab68ae2e01184..e7ff8ef0e5a7f8 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -365,9 +365,11 @@ xfs_inobt_init_cursor( if (btnum == XFS_BTNUM_INO) { cur->bc_nlevels = be32_to_cpu(agi->agi_level); cur->bc_ops = &xfs_inobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); } else { cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); cur->bc_ops = &xfs_finobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); } cur->bc_blocklog = mp->m_sb.sb_blocklog; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 453bb2757ec23f..6fb2215f8ff77b 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor( cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_refcountbt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 83e672ff7577e9..de25771764bac3 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_rmapbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 12d48cd8f8a423..f11282c96887ac 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -80,9 +80,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) } /* extra precision counters */ for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes; - xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes; - xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes; + xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; + xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; + xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; } len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", @@ -106,9 +106,9 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) for_each_possible_cpu(c) { preempt_disable(); /* save vn_active, it's a universal truth! */ - vn_active = per_cpu_ptr(stats, c)->vn_active; + vn_active = per_cpu_ptr(stats, c)->s.vn_active; memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); - per_cpu_ptr(stats, c)->vn_active = vn_active; + per_cpu_ptr(stats, c)->s.vn_active = vn_active; preempt_enable(); } } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 79ad2e69fc33b6..375840f5a99aa1 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -21,10 +21,38 @@ #include +/* + * The btree stats arrays have fixed offsets for the different stats. We + * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and + * that allows us to use fixed offsets into the stats array for each btree + * stat. These index offsets are defined in the order they will be emitted + * in the stats files, so it is possible to add new btree stat types by + * appending to the enum list below. + */ +enum { + __XBTS_lookup = 0, + __XBTS_compare = 1, + __XBTS_insrec = 2, + __XBTS_delrec = 3, + __XBTS_newroot = 4, + __XBTS_killroot = 5, + __XBTS_increment = 6, + __XBTS_decrement = 7, + __XBTS_lshift = 8, + __XBTS_rshift = 9, + __XBTS_split = 10, + __XBTS_join = 11, + __XBTS_alloc = 12, + __XBTS_free = 13, + __XBTS_moves = 14, + + __XBTS_MAX = 15, +}; + /* * XFS global statistics */ -struct xfsstats { +struct __xfsstats { # define XFSSTAT_END_EXTENT_ALLOC 4 __uint32_t xs_allocx; __uint32_t xs_allocb; @@ -117,118 +145,20 @@ struct xfsstats { __uint32_t xb_page_found; __uint32_t xb_get_read; /* Version 2 btree counters */ -#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) - __uint32_t xs_abtb_2_lookup; - __uint32_t xs_abtb_2_compare; - __uint32_t xs_abtb_2_insrec; - __uint32_t xs_abtb_2_delrec; - __uint32_t xs_abtb_2_newroot; - __uint32_t xs_abtb_2_killroot; - __uint32_t xs_abtb_2_increment; - __uint32_t xs_abtb_2_decrement; - __uint32_t xs_abtb_2_lshift; - __uint32_t xs_abtb_2_rshift; - __uint32_t xs_abtb_2_split; - __uint32_t xs_abtb_2_join; - __uint32_t xs_abtb_2_alloc; - __uint32_t xs_abtb_2_free; - __uint32_t xs_abtb_2_moves; -#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) - __uint32_t xs_abtc_2_lookup; - __uint32_t xs_abtc_2_compare; - __uint32_t xs_abtc_2_insrec; - __uint32_t xs_abtc_2_delrec; - __uint32_t xs_abtc_2_newroot; - __uint32_t xs_abtc_2_killroot; - __uint32_t xs_abtc_2_increment; - __uint32_t xs_abtc_2_decrement; - __uint32_t xs_abtc_2_lshift; - __uint32_t xs_abtc_2_rshift; - __uint32_t xs_abtc_2_split; - __uint32_t xs_abtc_2_join; - __uint32_t xs_abtc_2_alloc; - __uint32_t xs_abtc_2_free; - __uint32_t xs_abtc_2_moves; -#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) - __uint32_t xs_bmbt_2_lookup; - __uint32_t xs_bmbt_2_compare; - __uint32_t xs_bmbt_2_insrec; - __uint32_t xs_bmbt_2_delrec; - __uint32_t xs_bmbt_2_newroot; - __uint32_t xs_bmbt_2_killroot; - __uint32_t xs_bmbt_2_increment; - __uint32_t xs_bmbt_2_decrement; - __uint32_t xs_bmbt_2_lshift; - __uint32_t xs_bmbt_2_rshift; - __uint32_t xs_bmbt_2_split; - __uint32_t xs_bmbt_2_join; - __uint32_t xs_bmbt_2_alloc; - __uint32_t xs_bmbt_2_free; - __uint32_t xs_bmbt_2_moves; -#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) - __uint32_t xs_ibt_2_lookup; - __uint32_t xs_ibt_2_compare; - __uint32_t xs_ibt_2_insrec; - __uint32_t xs_ibt_2_delrec; - __uint32_t xs_ibt_2_newroot; - __uint32_t xs_ibt_2_killroot; - __uint32_t xs_ibt_2_increment; - __uint32_t xs_ibt_2_decrement; - __uint32_t xs_ibt_2_lshift; - __uint32_t xs_ibt_2_rshift; - __uint32_t xs_ibt_2_split; - __uint32_t xs_ibt_2_join; - __uint32_t xs_ibt_2_alloc; - __uint32_t xs_ibt_2_free; - __uint32_t xs_ibt_2_moves; -#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) - __uint32_t xs_fibt_2_lookup; - __uint32_t xs_fibt_2_compare; - __uint32_t xs_fibt_2_insrec; - __uint32_t xs_fibt_2_delrec; - __uint32_t xs_fibt_2_newroot; - __uint32_t xs_fibt_2_killroot; - __uint32_t xs_fibt_2_increment; - __uint32_t xs_fibt_2_decrement; - __uint32_t xs_fibt_2_lshift; - __uint32_t xs_fibt_2_rshift; - __uint32_t xs_fibt_2_split; - __uint32_t xs_fibt_2_join; - __uint32_t xs_fibt_2_alloc; - __uint32_t xs_fibt_2_free; - __uint32_t xs_fibt_2_moves; -#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15) - __uint32_t xs_rmap_2_lookup; - __uint32_t xs_rmap_2_compare; - __uint32_t xs_rmap_2_insrec; - __uint32_t xs_rmap_2_delrec; - __uint32_t xs_rmap_2_newroot; - __uint32_t xs_rmap_2_killroot; - __uint32_t xs_rmap_2_increment; - __uint32_t xs_rmap_2_decrement; - __uint32_t xs_rmap_2_lshift; - __uint32_t xs_rmap_2_rshift; - __uint32_t xs_rmap_2_split; - __uint32_t xs_rmap_2_join; - __uint32_t xs_rmap_2_alloc; - __uint32_t xs_rmap_2_free; - __uint32_t xs_rmap_2_moves; -#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15) - __uint32_t xs_refcbt_2_lookup; - __uint32_t xs_refcbt_2_compare; - __uint32_t xs_refcbt_2_insrec; - __uint32_t xs_refcbt_2_delrec; - __uint32_t xs_refcbt_2_newroot; - __uint32_t xs_refcbt_2_killroot; - __uint32_t xs_refcbt_2_increment; - __uint32_t xs_refcbt_2_decrement; - __uint32_t xs_refcbt_2_lshift; - __uint32_t xs_refcbt_2_rshift; - __uint32_t xs_refcbt_2_split; - __uint32_t xs_refcbt_2_join; - __uint32_t xs_refcbt_2_alloc; - __uint32_t xs_refcbt_2_free; - __uint32_t xs_refcbt_2_moves; +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) + __uint32_t xs_abtb_2[__XBTS_MAX]; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) + __uint32_t xs_abtc_2[__XBTS_MAX]; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) + __uint32_t xs_bmbt_2[__XBTS_MAX]; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) + __uint32_t xs_ibt_2[__XBTS_MAX]; +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) + __uint32_t xs_fibt_2[__XBTS_MAX]; +#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) + __uint32_t xs_rmap_2[__XBTS_MAX]; +#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) + __uint32_t xs_refcbt_2[__XBTS_MAX]; #define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; @@ -245,26 +175,58 @@ struct xfsstats { __uint64_t xs_read_bytes; }; +struct xfsstats { + union { + struct __xfsstats s; + uint32_t a[XFSSTAT_END_XQMSTAT]; + }; +}; + +/* + * simple wrapper for getting the array index of s struct member offset + */ +#define XFS_STATS_CALC_INDEX(member) \ + (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t)) + + int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); void xfs_stats_clearall(struct xfsstats __percpu *stats); extern struct xstats xfsstats; #define XFS_STATS_INC(mp, v) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++; \ } while (0) #define XFS_STATS_DEC(mp, v) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--; \ } while (0) #define XFS_STATS_ADD(mp, v, inc) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc); \ +} while (0) + +#define XFS_STATS_INC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++; \ +} while (0) + +#define XFS_STATS_DEC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]; \ +} while (0) + +#define XFS_STATS_ADD_OFF(mp, off, inc) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc); \ } while (0) #if defined(CONFIG_PROC_FS) From cae028df53449905c944603df624ac94bc619661 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 5 Dec 2016 14:40:32 +1100 Subject: [PATCH 56/63] xfs: optimise CRC updates Nick Piggin reported that the CRC overhead in an fsync heavy workload was higher than expected on a Power8 machine. Part of this was to do with the fact that the power8 CRC implementation is not efficient for CRC lengths of less than 512 bytes, and so the way we split the CRCs over the CRC field means a lot of the CRCs are reduced to being less than than optimal size. To optimise this, change the CRC update mechanism to zero the CRC field first, and then compute the CRC in one pass over the buffer and write the result back into the buffer. We can do this safely because anything writing a CRC has exclusive access to the buffer the CRC is being calculated over. We leave the CRC verify code the same - it still splits the CRC calculation - because we do not want read-only operations modifying the underlying buffer. This is because read-only operations may not have an exclusive access to the buffer guaranteed, and so temporary modifications could leak out to to other processes accessing the buffer concurrently. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_cksum.h | 26 ++++++++++++++++++++++---- fs/xfs/libxfs/xfs_inode_buf.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 12 +++++++----- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index fad1676ad8cd25..a416c7cb23ea57 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -6,10 +6,11 @@ /* * Calculate the intermediate checksum for a buffer that has the CRC field * inside it. The offset of the 32bit crc fields is passed as the - * cksum_offset parameter. + * cksum_offset parameter. We do not modify the buffer during verification, + * hence we have to split the CRC calculation across the cksum_offset. */ static inline __uint32_t -xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) { __uint32_t zero = 0; __uint32_t crc; @@ -25,6 +26,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) length - (cksum_offset + sizeof(__be32))); } +/* + * Fast CRC method where the buffer is modified. Callers must have exclusive + * access to the buffer while the calculation takes place. + */ +static inline __uint32_t +xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) +{ + /* zero the CRC field */ + *(__le32 *)(buffer + cksum_offset) = 0; + + /* single pass CRC calculation for the entire buffer */ + return crc32c(XFS_CRC_SEED, buffer, length); +} + /* * Convert the intermediate checksum to the final ondisk format. * @@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc) /* * Helper to generate the checksum for a buffer. + * + * This modifies the buffer temporarily - callers must have exclusive + * access to the buffer while the calculation takes place. */ static inline void xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); } @@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) static inline int xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index c906e50515f031..9004665d679a3b 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -444,7 +444,7 @@ xfs_dinode_calc_crc( return; ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); - crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3b74fa011bb156..3ebe444eb60fa5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1668,7 +1668,7 @@ xlog_cksum( __uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum((char *)rhead, + crc = xfs_start_cksum_update((char *)rhead, sizeof(struct xlog_rec_header), offsetof(struct xlog_rec_header, h_crc)); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d9c..56b7a2f6aaf28e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -5113,19 +5113,21 @@ xlog_recover_process( struct list_head *buffer_list) { int error; + __le32 old_crc = rhead->h_crc; __le32 crc; + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets h_crc to 0 so we must consider this valid even on v5 supers. + * sets old_crc to 0 so we must consider this valid even on v5 supers. * Otherwise, return EFSBADCRC on failure so the callers up the stack * know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (rhead->h_crc && crc != rhead->h_crc) + if (old_crc && crc != old_crc) return -EFSBADCRC; return 0; } @@ -5136,11 +5138,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != rhead->h_crc) { - if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (crc != old_crc) { + if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(rhead->h_crc), + le32_to_cpu(old_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } From 6031e73a5b3f85ec45cac08ef90995b2d3f941c7 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 7 Dec 2016 17:36:36 +1100 Subject: [PATCH 57/63] xfs: use rhashtable to track buffer cache On filesystems with a lot of metadata and in metadata intensive workloads xfs_buf_find() is showing up at the top of the CPU cycles trace. Most of the CPU time is spent on CPU cache misses while traversing the rbtree. As the buffer cache does not need any kind of ordering, but fast lookups a hashtable is the natural data structure to use. The rhashtable infrastructure provides a self-scaling hashtable implementation and allows lookups to proceed while the table is going through a resize operation. This reduces the CPU-time spent for the lookups to 1/3 even for small filesystems with a relatively small number of cached buffers, with possibly much larger gains on higher loaded filesystems. [dchinner: reduce minimum hash size to an acceptable size for large filesystems with many AGs with no active use.] [dchinner: remove stale rbtree asserts.] [dchinner: use xfs_buf_map for compare function argument.] [dchinner: make functions static.] [dchinner: remove redundant comments.] Signed-off-by: Lucas Stach Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 120 +++++++++++++++++++++++++++------------------ fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_mount.c | 7 ++- fs/xfs/xfs_mount.h | 7 ++- 5 files changed, 85 insertions(+), 52 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b5b9bffe352074..516109424c9604 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -219,7 +219,6 @@ _xfs_buf_alloc( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); - RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); @@ -473,6 +472,62 @@ _xfs_buf_map_pages( /* * Finding and Reading Buffers */ +static int +_xfs_buf_obj_cmp( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct xfs_buf_map *map = arg->key; + const struct xfs_buf *bp = obj; + + /* + * The key hashing in the lookup path depends on the key being the + * first element of the compare_arg, make sure to assert this. + */ + BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); + + if (bp->b_bn != map->bm_bn) + return 1; + + if (unlikely(bp->b_length != map->bm_len)) { + /* + * found a block number match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching for an exact match. + */ + ASSERT(bp->b_flags & XBF_STALE); + return 1; + } + return 0; +} + +static const struct rhashtable_params xfs_buf_hash_params = { + .min_size = 32, /* empty AGs have minimal footprint */ + .nelem_hint = 16, + .key_len = sizeof(xfs_daddr_t), + .key_offset = offsetof(struct xfs_buf, b_bn), + .head_offset = offsetof(struct xfs_buf, b_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = _xfs_buf_obj_cmp, +}; + +int +xfs_buf_hash_init( + struct xfs_perag *pag) +{ + spin_lock_init(&pag->pag_buf_lock); + return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); +} + +void +xfs_buf_hash_destroy( + struct xfs_perag *pag) +{ + rhashtable_destroy(&pag->pag_buf_hash); +} /* * Look up, and creates if absent, a lockable buffer for @@ -488,27 +543,24 @@ _xfs_buf_find( xfs_buf_t *new_bp) { struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent; xfs_buf_t *bp; - xfs_daddr_t blkno = map[0].bm_bn; + struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; xfs_daddr_t eofs; - int numblks = 0; int i; for (i = 0; i < nmaps; i++) - numblks += map[i].bm_len; + cmap.bm_len += map[i].bm_len; /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (blkno < 0 || blkno >= eofs) { + if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { /* * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports @@ -516,53 +568,29 @@ _xfs_buf_find( */ xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", - __func__, blkno, eofs); + __func__, cmap.bm_bn, eofs); WARN_ON(1); return NULL; } - /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, blkno)); + xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); - /* walk tree */ spin_lock(&pag->pag_buf_lock); - rbp = &pag->pag_buf_tree.rb_node; - parent = NULL; - bp = NULL; - while (*rbp) { - parent = *rbp; - bp = rb_entry(parent, struct xfs_buf, b_rbnode); - - if (blkno < bp->b_bn) - rbp = &(*rbp)->rb_left; - else if (blkno > bp->b_bn) - rbp = &(*rbp)->rb_right; - else { - /* - * found a block number match. If the range doesn't - * match, the only way this is allowed is if the buffer - * in the cache is stale and the transaction that made - * it stale has not yet committed. i.e. we are - * reallocating a busy extent. Skip this buffer and - * continue searching to the right for an exact match. - */ - if (bp->b_length != numblks) { - ASSERT(bp->b_flags & XBF_STALE); - rbp = &(*rbp)->rb_right; - continue; - } - atomic_inc(&bp->b_hold); - goto found; - } + bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, + xfs_buf_hash_params); + if (bp) { + atomic_inc(&bp->b_hold); + goto found; } /* No match found */ if (new_bp) { - rb_link_node(&new_bp->b_rbnode, parent, rbp); - rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, + &new_bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); } else { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); @@ -930,7 +958,6 @@ xfs_buf_rele( if (!pag) { ASSERT(list_empty(&bp->b_lru)); - ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) { xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); @@ -938,8 +965,6 @@ xfs_buf_rele( return; } - ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); - ASSERT(atomic_read(&bp->b_hold) > 0); release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); @@ -983,7 +1008,8 @@ xfs_buf_rele( } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); freebuf = true; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 38c9a95bda7d7c..8a9d3a9599f00f 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -151,7 +151,7 @@ typedef struct xfs_buf { * which is the only bit that is touched if we hit the semaphore * fast-path on locking. */ - struct rb_node b_rbnode; /* rbtree node */ + struct rhash_head b_rhash_head; /* pag buffer hash node */ xfs_daddr_t b_bn; /* block number of buffer */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 68640fb63a5423..a415f822f2c1c1 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -78,6 +78,7 @@ typedef __u32 xfs_nlink_t; #include #include #include +#include #include #include diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b341f10cf4810b..9b9540db17a613 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -157,6 +157,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -212,8 +213,8 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - spin_lock_init(&pag->pag_buf_lock); - pag->pag_buf_tree = RB_ROOT; + if (xfs_buf_hash_init(pag)) + goto out_unwind; if (radix_tree_preload(GFP_NOFS)) goto out_unwind; @@ -239,9 +240,11 @@ xfs_initialize_perag( return 0; out_unwind: + xfs_buf_hash_destroy(pag); kmem_free(pag); for (; index > first_initialised; index--) { pag = radix_tree_delete(&mp->m_perag_tree, index); + xfs_buf_hash_destroy(pag); kmem_free(pag); } return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 819b80b15bfb11..84f78521890743 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -393,8 +393,8 @@ typedef struct xfs_perag { unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ - struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ + struct rhashtable pag_buf_hash; /* for rcu-safe freeing */ struct rcu_head rcu_head; @@ -424,6 +424,9 @@ xfs_perag_resv( } } +int xfs_buf_hash_init(xfs_perag_t *pag); +void xfs_buf_hash_destroy(xfs_perag_t *pag); + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); From 2e1d23370e75d7d89350d41b4ab58c7f6a0e26b2 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 9 Dec 2016 16:49:47 +1100 Subject: [PATCH 58/63] xfs: ignore leaf attr ichdr.count in verifier during log replay When we create a new attribute, we first create a shortform attribute, and try to fit the new attribute into it. If that fails, we copy the (empty) attribute into a leaf attribute, and do the copy again. Thus there can be a transient state where we have an empty leaf attribute. If we encounter this during log replay, the verifier will fail. So add a test to ignore this part of the leaf attr verification during log replay. Thanks as usual to dchinner for spotting the problem. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 8ea91f3630938a..2852521fc8ecf7 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -253,6 +253,7 @@ xfs_attr3_leaf_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_perag *pag = bp->b_pag; struct xfs_attr3_icleaf_hdr ichdr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -273,7 +274,12 @@ xfs_attr3_leaf_verify( if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) return false; } - if (ichdr.count == 0) + /* + * In recovery there is a transient state where count == 0 is valid + * because we may have transitioned an empty shortform attr to a leaf + * if the attr didn't fit in shortform. + */ + if (pag && pag->pagf_init && ichdr.count == 0) return false; /* XXX: need to range check rest of attr header values */ From 2291dab2c9d1880efd19469df2042e2277c8b7a4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 9 Dec 2016 16:49:54 +1100 Subject: [PATCH 59/63] xfs: Always flush caches when integrity is required There is no reason anymore for not issuing device integrity operations when teh filesystem requires ordering or data integrity guarantees. We should always issue cache flushes and FUA writes where necessary and let the underlying storage optimise them as necessary for correct integrity operation. Signed-Off-By: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 3 +-- fs/xfs/xfs_file.c | 29 ++++++++++++----------------- fs/xfs/xfs_log.c | 39 ++++++++++++++++----------------------- 3 files changed, 29 insertions(+), 42 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b5b9bffe352074..7d3afa0bddd392 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1711,8 +1711,7 @@ xfs_free_buftarg( percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_blkdev_issue_flush(btp); + xfs_blkdev_issue_flush(btp); kmem_free(btp); } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f5effa68e037ae..2951c483b24bb8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -149,19 +149,16 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { - /* - * If we have an RT and/or log subvolume we need to make sure - * to flush the write cache the device used for file data - * first. This is to ensure newly written file data make - * it to disk before logging the new inode size in case of - * an extending write. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(mp->m_rtdev_targp); - else if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); - } + /* + * If we have an RT and/or log subvolume we need to make sure to flush + * the write cache the device used for file data first. This is to + * ensure newly written file data make it to disk before logging the new + * inode size in case of an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); /* * All metadata updates are logged, which means that we just have to @@ -196,10 +193,8 @@ xfs_file_fsync( * an already allocated file and thus do not have any metadata to * commit. */ - if ((mp->m_flags & XFS_MOUNT_BARRIER) && - mp->m_logdev_targp == mp->m_ddev_targp && - !XFS_IS_REALTIME_INODE(ip) && - !log_flushed) + if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && + mp->m_logdev_targp == mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); return error; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3b74fa011bb156..25fa31d973c8e3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1862,26 +1862,21 @@ xlog_sync( bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { - bp->b_flags |= XBF_FUA; - - /* - * Flush the data device before flushing the log to make - * sure all meta data written back from the AIL actually made - * it to disk before stamping the new log tail LSN into the - * log buffer. For an external log we need to issue the - * flush explicitly, and unfortunately synchronously here; - * for an internal log we can simply use the block layer - * state machine for preflushes. - */ - if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - else - bp->b_flags |= XBF_FLUSH; - } + /* + * Flush the data device before flushing the log to make sure all meta + * data written back from the AIL actually made it to disk before + * stamping the new log tail LSN into the log buffer. For an external + * log we need to issue the flush explicitly, and unfortunately + * synchronously here; for an internal log we can simply use the block + * layer state machine for preflushes. + */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + else + bp->b_flags |= XBF_FLUSH; ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1906,10 +1901,8 @@ xlog_sync( xfs_buf_associate_memory(bp, (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) - bp->b_flags |= XBF_FUA; + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); From 4cf4573d899cd80d8578c050061dc342f99f3a32 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 9 Dec 2016 16:49:54 +1100 Subject: [PATCH 60/63] xfs: deprecate barrier/nobarrier mount option We always perform integrity operations now, so these mount options don't do anything. Deprecate them and mark them for removal in in a year. Signed-Off-By: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- Documentation/filesystems/xfs.txt | 12 ++++-------- fs/xfs/xfs_super.c | 25 ++++++++++++++++--------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index c2d44e6e117bc7..3b9b5c149f322c 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -51,13 +51,6 @@ default behaviour. CRC enabled filesystems always use the attr2 format, and so will reject the noattr2 mount option if it is set. - barrier (*) - nobarrier - Enables/disables the use of block layer write barriers for - writes into the journal and for data integrity operations. - This allows for drive level write caching to be enabled, for - devices that support write barriers. - discard nodiscard (*) Enable/disable the issuing of commands to let the block @@ -228,7 +221,10 @@ default behaviour. Deprecated Mount Options ======================== -None at present. + Name Removal Schedule + ---- ---------------- + barrier no earlier than v4.15 + nobarrier no earlier than v4.15 Removed Mount Options diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 563d1d146b8c3b..eecbaac08ebaab 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -104,9 +104,6 @@ static const match_table_t tokens = { {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */ {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */ {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */ - {Opt_barrier, "barrier"}, /* use writer barriers for log write and - * unwritten extent conversion */ - {Opt_nobarrier, "nobarrier"}, /* .. disable */ {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */ {Opt_inode32, "inode32"}, /* inode allocation limited to * XFS_MAXINUMBER_32 */ @@ -134,6 +131,12 @@ static const match_table_t tokens = { {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */ {Opt_dax, "dax"}, /* Enable direct access to bdev pages */ + + /* Deprecated mount options scheduled for removal */ + {Opt_barrier, "barrier"}, /* use writer barriers for log write and + * unwritten extent conversion */ + {Opt_nobarrier, "nobarrier"}, /* .. disable */ + {Opt_err, NULL}, }; @@ -301,12 +304,6 @@ xfs_parseargs( case Opt_nouuid: mp->m_flags |= XFS_MOUNT_NOUUID; break; - case Opt_barrier: - mp->m_flags |= XFS_MOUNT_BARRIER; - break; - case Opt_nobarrier: - mp->m_flags &= ~XFS_MOUNT_BARRIER; - break; case Opt_ikeep: mp->m_flags |= XFS_MOUNT_IKEEP; break; @@ -374,6 +371,14 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DAX; break; #endif + case Opt_barrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); + mp->m_flags |= XFS_MOUNT_BARRIER; + break; + case Opt_nobarrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; default: xfs_warn(mp, "unknown mount option [%s].", p); return -EINVAL; @@ -1238,9 +1243,11 @@ xfs_fs_remount( token = match_token(p, tokens, args); switch (token) { case Opt_barrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); mp->m_flags |= XFS_MOUNT_BARRIER; break; case Opt_nobarrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); mp->m_flags &= ~XFS_MOUNT_BARRIER; break; case Opt_inode64: From 0c187dc508d7d8520319c0dcaa0601775f69ab5a Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Fri, 9 Dec 2016 16:49:54 +1100 Subject: [PATCH 61/63] xfs: use xfs_vn_setattr_size to check on new size Commit 6552321831dc ("xfs: remove i_iolock and use i_rwsem in the VFS inode instead") introduced a regression that truncate(2) doesn't check on new size, so it succeeds even if the new size exceeds the current resource limit. Because xfs_setattr_size() was used instead of xfs_vn_setattr_size(), and the latter calls xfs_vn_change_ok() first to do sanity check on permission and new size. This is found by truncate03 test from ltp, and the following is a simplified reproducer: #!/bin/bash dev=/dev/sda5 mnt=/mnt/xfs mkfs -t xfs -f $dev mount $dev $mnt # set max file size to 16k ulimit -f 16 truncate -s $((16 * 1024 + 1)) /mnt/xfs/testfile [ $? -eq 0 ] && echo "FAIL: truncate exceeded max file size" ulimit -f unlimited umount $mnt Signed-off-by: Eryu Guan Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c962999a87ab7c..b930be0b159659 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -988,7 +988,7 @@ xfs_vn_setattr( return error; xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - error = xfs_setattr_size(ip, iattr); + error = xfs_vn_setattr_size(dentry, iattr); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { error = xfs_vn_setattr_nonsize(dentry, iattr); From b24a978c377be5f14e798cb41238e66fe51aab2f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 9 Dec 2016 16:49:54 +1100 Subject: [PATCH 62/63] xfs: use GPF_NOFS when allocating btree cursors Use NOFS for allocating btree cursors, since they can be called under the ilock. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc_btree.c | 2 +- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 5ba2dac5e67c49..c06ec77a94182d 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -421,7 +421,7 @@ xfs_allocbt_init_cursor( ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8007d2ba9aef9c..049fa597ae91d0 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -796,7 +796,7 @@ xfs_bmbt_init_cursor( struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index eab68ae2e01184..6c6b95947e716d 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -357,7 +357,7 @@ xfs_inobt_init_cursor( struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; From 9875258ca7ab238a08bb9ad17e0c9b9984eac7df Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 9 Dec 2016 16:49:54 +1100 Subject: [PATCH 63/63] xfs: nuke unused tracepoint definitions This is all unused code, so remove it. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_trace.h | 109 --------------------------------------------- 1 file changed, 109 deletions(-) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0907752be62d3d..69c5bcd9a51be7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -355,7 +355,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_submit); DEFINE_BUF_EVENT(xfs_buf_submit_wait); -DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock_fail); @@ -367,19 +366,15 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); -DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); -DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); /* not really buffer traces, but the buf provides useful information */ DEFINE_BUF_EVENT(xfs_btree_corrupt); -DEFINE_BUF_EVENT(xfs_da_btree_corrupt); DEFINE_BUF_EVENT(xfs_reset_dqcounts); -DEFINE_BUF_EVENT(xfs_inode_item_push); /* pass flags explicitly */ DECLARE_EVENT_CLASS(xfs_buf_flags_class, @@ -541,7 +536,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); -DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); DECLARE_EVENT_CLASS(xfs_filestream_class, TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), @@ -680,7 +674,6 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); -DEFINE_INODE_EVENT(xfs_evict_inode); DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); @@ -798,7 +791,6 @@ TRACE_EVENT(xfs_irec_merge_post, DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ TP_ARGS(ip, caller_ip)) -DEFINE_IREF_EVENT(xfs_ihold); DEFINE_IREF_EVENT(xfs_irele); DEFINE_IREF_EVENT(xfs_inode_pin); DEFINE_IREF_EVENT(xfs_inode_unpin); @@ -939,7 +931,6 @@ DEFINE_DQUOT_EVENT(xfs_dqget_miss); DEFINE_DQUOT_EVENT(xfs_dqget_freeing); DEFINE_DQUOT_EVENT(xfs_dqget_dup); DEFINE_DQUOT_EVENT(xfs_dqput); -DEFINE_DQUOT_EVENT(xfs_dqput_wait); DEFINE_DQUOT_EVENT(xfs_dqput_free); DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqflush); @@ -1815,7 +1806,6 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_addname); DEFINE_ATTR_EVENT(xfs_attr_sf_create); DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); DEFINE_ATTR_EVENT(xfs_attr_sf_remove); -DEFINE_ATTR_EVENT(xfs_attr_sf_removename); DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); DEFINE_ATTR_EVENT(xfs_attr_leaf_add); @@ -1844,7 +1834,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); DEFINE_ATTR_EVENT(xfs_attr_node_get); -DEFINE_ATTR_EVENT(xfs_attr_node_lookup); DEFINE_ATTR_EVENT(xfs_attr_node_replace); DEFINE_ATTR_EVENT(xfs_attr_node_removename); @@ -2440,11 +2429,9 @@ DEFINE_DEFER_EVENT(xfs_defer_finish_done); DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); -DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error); DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work); DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); @@ -3092,87 +3079,6 @@ DEFINE_EVENT(xfs_double_io_class, name, \ struct xfs_inode *dest, xfs_off_t doffset), \ TP_ARGS(src, soffset, len, dest, doffset)) -/* two-file vfs io tracepoint class */ -DECLARE_EVENT_CLASS(xfs_double_vfs_io_class, - TP_PROTO(struct inode *src, u64 soffset, u64 len, - struct inode *dest, u64 doffset), - TP_ARGS(src, soffset, len, dest, doffset), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(unsigned long, src_ino) - __field(loff_t, src_isize) - __field(loff_t, src_offset) - __field(size_t, len) - __field(unsigned long, dest_ino) - __field(loff_t, dest_isize) - __field(loff_t, dest_offset) - ), - TP_fast_assign( - __entry->dev = src->i_sb->s_dev; - __entry->src_ino = src->i_ino; - __entry->src_isize = i_size_read(src); - __entry->src_offset = soffset; - __entry->len = len; - __entry->dest_ino = dest->i_ino; - __entry->dest_isize = i_size_read(dest); - __entry->dest_offset = doffset; - ), - TP_printk("dev %d:%d count %zd " - "ino 0x%lx isize 0x%llx offset 0x%llx -> " - "ino 0x%lx isize 0x%llx offset 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->len, - __entry->src_ino, - __entry->src_isize, - __entry->src_offset, - __entry->dest_ino, - __entry->dest_isize, - __entry->dest_offset) -) - -#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \ -DEFINE_EVENT(xfs_double_vfs_io_class, name, \ - TP_PROTO(struct inode *src, u64 soffset, u64 len, \ - struct inode *dest, u64 doffset), \ - TP_ARGS(src, soffset, len, dest, doffset)) - -/* CoW write tracepoint */ -DECLARE_EVENT_CLASS(xfs_copy_on_write_class, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, - xfs_extlen_t len, xfs_fsblock_t new_pblk), - TP_ARGS(ip, lblk, pblk, len, new_pblk), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_fsblock_t, pblk) - __field(xfs_extlen_t, len) - __field(xfs_fsblock_t, new_pblk) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->pblk = pblk; - __entry->len = len; - __entry->new_pblk = new_pblk; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx " - "len 0x%x new_pblk %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->pblk, - __entry->len, - __entry->new_pblk) -) - -#define DEFINE_COW_EVENT(name) \ -DEFINE_EVENT(xfs_copy_on_write_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \ - xfs_extlen_t len, xfs_fsblock_t new_pblk), \ - TP_ARGS(ip, lblk, pblk, len, new_pblk)) - /* inode/irec events */ DECLARE_EVENT_CLASS(xfs_inode_irec_class, TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), @@ -3292,8 +3198,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); @@ -3302,9 +3206,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); /* ioctl tracepoints */ -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink); -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range); -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same); TRACE_EVENT(xfs_ioctl_clone, TP_PROTO(struct inode *src, struct inode *dest), TP_ARGS(src, dest), @@ -3334,11 +3235,7 @@ TRACE_EVENT(xfs_ioctl_clone, /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); -DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block); -DEFINE_PAGE_EVENT(xfs_reflink_unshare_page); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); @@ -3361,14 +3258,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); -DEFINE_COW_EVENT(xfs_reflink_fork_buf); -DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error); -DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error); /* rmap swapext tracepoints */ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);