From a36b926180cda375ac2ec89e1748b47137cfc51c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 27 Jan 2017 23:22:55 -0800 Subject: xfs: pull up iolock from xfs_free_eofblocks() xfs_free_eofblocks() requires the IOLOCK_EXCL lock, but is called from different contexts where the lock may or may not be held. The need_iolock parameter exists for this reason, to indicate whether xfs_free_eofblocks() must acquire the iolock itself before it can proceed. This is ugly and confusing. Simplify the semantics of xfs_free_eofblocks() to require the caller to acquire the iolock appropriately and kill the need_iolock parameter. While here, the mp param can be removed as well as the xfs_mount is accessible from the xfs_inode structure. This patch does not change behavior. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 51 ++++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index de32f0fe47c8..edfa6a55b064 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1691,33 +1691,35 @@ xfs_release( if (xfs_can_free_eofblocks(ip, false)) { + /* + * Check if the inode is being opened, written and closed + * frequently and we have delayed allocation blocks outstanding + * (e.g. streaming writes from the NFS server), truncating the + * blocks past EOF will cause fragmentation to occur. + * + * In this case don't do the truncation, but we have to be + * careful how we detect this case. Blocks beyond EOF show up as + * i_delayed_blks even when the inode is clean, so we need to + * truncate them away first before checking for a dirty release. + * Hence on the first dirty close we will still remove the + * speculative allocation, but after that we will leave it in + * place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; /* * If we can't get the iolock just skip truncating the blocks * past EOF because we could deadlock with the mmap_sem - * otherwise. We'll get another chance to drop them once the + * otherwise. We'll get another chance to drop them once the * last reference to the inode is dropped, so we'll never leak * blocks permanently. - * - * Further, check if the inode is being opened, written and - * closed frequently and we have delayed allocation blocks - * outstanding (e.g. streaming writes from the NFS server), - * truncating the blocks past EOF will cause fragmentation to - * occur. - * - * In this case don't do the truncation, either, but we have to - * be careful how we detect this case. Blocks beyond EOF show - * up as i_delayed_blks even when the inode is clean, so we - * need to truncate them away first before checking for a dirty - * release. Hence on the first dirty close we will still remove - * the speculative allocation, but after that we will leave it - * in place. */ - if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) - return 0; - - error = xfs_free_eofblocks(mp, ip, true); - if (error && error != -EAGAIN) - return error; + if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + error = xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + if (error) + return error; + } /* delalloc blocks after truncation means it really is dirty */ if (ip->i_delayed_blks) @@ -1904,8 +1906,11 @@ xfs_inactive( * cache. Post-eof blocks must be freed, lest we end up with * broken free space accounting. */ - if (xfs_can_free_eofblocks(ip, true)) - xfs_free_eofblocks(mp, ip, false); + if (xfs_can_free_eofblocks(ip, true)) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_free_eofblocks(ip); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } return; } -- cgit From 3802a345321a08093ba2ddb1849e736f84e8d450 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2017 16:45:58 -0800 Subject: xfs: only reclaim unwritten COW extents periodically We only want to reclaim preallocations from our periodic work item. Currently this is archived by looking for a dirty inode, but that check is rather fragile. Instead add a flag to xfs_reflink_cancel_cow_* so that the caller can ask for just cancelling unwritten extents in the COW fork. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [darrick: fix typos in commit message] Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_reflink.c | 23 ++++++++++++++++------- fs/xfs/xfs_reflink.h | 4 ++-- fs/xfs/xfs_super.c | 2 +- 6 files changed, 22 insertions(+), 13 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index bf65a9ea8642..aa8a6f0d09c3 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -293,7 +293,7 @@ xfs_end_io( goto done; if (ioend->io_bio->bi_error) { error = xfs_reflink_cancel_cow_range(ip, - ioend->io_offset, ioend->io_size); + ioend->io_offset, ioend->io_size, true); goto done; } error = xfs_reflink_end_cow(ip, ioend->io_offset, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7234b9748c36..3531f8f72fa5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1608,7 +1608,7 @@ xfs_inode_free_cowblocks( xfs_ilock(ip, XFS_IOLOCK_EXCL); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index edfa6a55b064..7eaf1ef74e3c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1615,7 +1615,7 @@ xfs_itruncate_extents( /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, - last_block); + last_block, true); if (error) goto out; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index da6d08fb359c..4a84c5ea266d 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -548,14 +548,18 @@ xfs_reflink_trim_irec_to_next_cow( } /* - * Cancel all pending CoW reservations for some block range of an inode. + * Cancel CoW reservations for some block range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_blocks( struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb) + xfs_fileoff_t end_fsb, + bool cancel_real) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; @@ -579,7 +583,7 @@ xfs_reflink_cancel_cow_blocks( &idx, &got, &del); if (error) break; - } else { + } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); @@ -621,13 +625,17 @@ xfs_reflink_cancel_cow_blocks( } /* - * Cancel all pending CoW reservations for some byte range of an inode. + * Cancel CoW reservations for some byte range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_range( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool cancel_real) { struct xfs_trans *tp; xfs_fileoff_t offset_fsb; @@ -653,7 +661,8 @@ xfs_reflink_cancel_cow_range( xfs_trans_ijoin(tp, ip, 0); /* Scrape out the old CoW reservations */ - error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); + error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, + cancel_real); if (error) goto out_cancel; @@ -1450,7 +1459,7 @@ next: * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 33ac9b8db683..d29a7967f029 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -39,9 +39,9 @@ extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb); + xfs_fileoff_t end_fsb, bool cancel_real); extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count); + xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 890862f2447c..685c042a120f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -953,7 +953,7 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_remove); if (xfs_is_reflink_inode(ip)) { - error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) xfs_warn(ip->i_mount, "Error %d while evicting CoW blocks for inode %llu.", -- cgit From 630a04e79dd41ff746b545d4fc052e0abb836120 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 15 Mar 2017 00:24:25 -0700 Subject: xfs: verify inline directory data forks When we're reading or writing the data fork of an inline directory, check the contents to make sure we're not overflowing buffers or eating garbage data. xfs/348 corrupts an inline symlink into an inline directory, triggering a buffer overflow bug. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- v2: add more checks consistent with _dir2_sf_check and make the verifier usable from anywhere. --- fs/xfs/libxfs/xfs_dir2_priv.h | 2 + fs/xfs/libxfs/xfs_dir2_sf.c | 87 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_fork.c | 26 +++++++++++-- fs/xfs/libxfs/xfs_inode_fork.h | 2 +- fs/xfs/xfs_dir2_readdir.c | 11 ------ fs/xfs/xfs_inode.c | 12 ++++-- 6 files changed, 122 insertions(+), 18 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index d04547fcf274..eb00bc133bca 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -125,6 +125,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); +extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp, + int size); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index c6809ff41197..96b45cd6c63f 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -629,6 +629,93 @@ xfs_dir2_sf_check( } #endif /* DEBUG */ +/* Verify the consistency of an inline directory. */ +int +xfs_dir2_sf_verify( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int size) +{ + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next_sfep; + char *endp; + const struct xfs_dir_ops *dops; + xfs_ino_t ino; + int i; + int i8count; + int offset; + __uint8_t filetype; + + dops = xfs_dir_get_ops(mp, NULL); + + /* + * Give up if the directory is way too short. + */ + XFS_WANT_CORRUPTED_RETURN(mp, size > + offsetof(struct xfs_dir2_sf_hdr, parent)); + XFS_WANT_CORRUPTED_RETURN(mp, size >= + xfs_dir2_sf_hdr_size(sfp->i8count)); + + endp = (char *)sfp + size; + + /* Check .. entry */ + ino = dops->sf_get_parent_ino(sfp); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + offset = dops->data_first_offset; + + /* Check all reported entries */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + /* + * struct xfs_dir2_sf_entry has a variable length. + * Check the fixed-offset parts of the structure are + * within the data buffer. + */ + XFS_WANT_CORRUPTED_RETURN(mp, + ((char *)sfep + sizeof(*sfep)) < endp); + + /* Don't allow names with known bad length. */ + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0); + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN); + + /* + * Check that the variable-length part of the structure is + * within the data buffer. The next entry starts after the + * name component, so nextentry is an acceptable test. + */ + next_sfep = dops->sf_nextentry(sfp, sfep); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep); + + /* Check that the offsets always increase. */ + XFS_WANT_CORRUPTED_RETURN(mp, + xfs_dir2_sf_get_offset(sfep) >= offset); + + /* Check the inode number. */ + ino = dops->sf_get_ino(sfp, sfep); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + + /* Check the file type. */ + filetype = dops->sf_get_ftype(sfep); + XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX); + + offset = xfs_dir2_sf_get_offset(sfep) + + dops->data_entsize(sfep->namelen); + + sfep = next_sfep; + } + XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count); + XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp); + + /* Make sure this whole thing ought to be in local format. */ + XFS_WANT_CORRUPTED_RETURN(mp, offset + + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize); + + return 0; +} + /* * Create a new (shortform) directory. */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 25c1e078aef6..9653e964eda4 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -33,6 +33,8 @@ #include "xfs_trace.h" #include "xfs_attr_sf.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" kmem_zone_t *xfs_ifork_zone; @@ -320,6 +322,7 @@ xfs_iformat_local( int whichfork, int size) { + int error; /* * If the size is unreasonable, then something @@ -336,6 +339,14 @@ xfs_iformat_local( return -EFSCORRUPTED; } + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(ip->i_mount, + (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip), + size); + if (error) + return error; + } + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -856,7 +867,7 @@ xfs_iextents_copy( * In these cases, the format always takes precedence, because the * format indicates the current state of the fork. */ -void +int xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, @@ -866,6 +877,7 @@ xfs_iflush_fork( char *cp; xfs_ifork_t *ifp; xfs_mount_t *mp; + int error; static const short brootflag[2] = { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; static const short dataflag[2] = @@ -874,7 +886,7 @@ xfs_iflush_fork( { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; if (!iip) - return; + return 0; ifp = XFS_IFORK_PTR(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, @@ -882,12 +894,19 @@ xfs_iflush_fork( */ if (!ifp) { ASSERT(whichfork == XFS_ATTR_FORK); - return; + return 0; } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_LOCAL: + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(mp, + (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data, + ifp->if_bytes); + if (error) + return error; + } if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); @@ -940,6 +959,7 @@ xfs_iflush_fork( ASSERT(0); break; } + return 0; } /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fb8365326d1..132dc59fdde6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -140,7 +140,7 @@ typedef struct xfs_ifork { struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); -void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, +int xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); void xfs_idata_realloc(struct xfs_inode *, int, int); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 003a99b83bd8..ad9396e516f6 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -71,22 +71,11 @@ xfs_dir2_sf_getdents( struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Give up if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) - return -EFSCORRUPTED; - /* * If the block number in the offset is out of range, we're done. */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7eaf1ef74e3c..c7fe2c2123ab 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3475,6 +3475,7 @@ xfs_iflush_int( struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); @@ -3557,9 +3558,14 @@ xfs_iflush_int( if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); - if (XFS_IFORK_Q(ip)) - xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); + if (error) + return error; + if (XFS_IFORK_Q(ip)) { + error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + if (error) + return error; + } xfs_inobp_check(mp, bp); /* -- cgit