From a36b926180cda375ac2ec89e1748b47137cfc51c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 27 Jan 2017 23:22:55 -0800 Subject: xfs: pull up iolock from xfs_free_eofblocks() xfs_free_eofblocks() requires the IOLOCK_EXCL lock, but is called from different contexts where the lock may or may not be held. The need_iolock parameter exists for this reason, to indicate whether xfs_free_eofblocks() must acquire the iolock itself before it can proceed. This is ugly and confusing. Simplify the semantics of xfs_free_eofblocks() to require the caller to acquire the iolock appropriately and kill the need_iolock parameter. While here, the mp param can be removed as well as the xfs_mount is accessible from the xfs_inode structure. This patch does not change behavior. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'fs/xfs/xfs_icache.c') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 70ca4f608321..c6b698f0fed9 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1322,7 +1322,7 @@ xfs_inode_free_eofblocks( int flags, void *args) { - int ret; + int ret = 0; struct xfs_eofblocks *eofb = args; bool need_iolock = true; int match; @@ -1358,19 +1358,25 @@ xfs_inode_free_eofblocks( return 0; /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. This also eliminates - * the possibility of EAGAIN being returned. + * A scan owner implies we already hold the iolock. Skip it here + * to avoid deadlock. 
*/ if (eofb->eof_scan_owner == ip->i_ino) need_iolock = false; } - ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); - - /* don't revisit the inode if we're not waiting */ - if (ret == -EAGAIN && !(flags & SYNC_WAIT)) - ret = 0; + /* + * If the caller is waiting, return -EAGAIN to keep the background + * scanner moving and revisit the inode in a subsequent pass. + */ + if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (flags & SYNC_WAIT) + ret = -EAGAIN; + return ret; + } + ret = xfs_free_eofblocks(ip); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } -- cgit From c3155097ad89a956579bc305856a1f2878494e52 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 27 Jan 2017 23:22:56 -0800 Subject: xfs: sync eofblocks scans under iolock are livelock prone The xfs_eofblocks.eof_scan_owner field is an internal field to facilitate invoking eofb scans from the kernel while under the iolock. This is necessary because the eofb scan acquires the iolock of each inode. Synchronous scans are invoked on certain buffered write failures while under iolock. In such cases, the scan owner indicates that the context for the scan already owns the particular iolock and prevents a double lock deadlock. eofblocks scans while under iolock are still livelock prone in the event of multiple parallel scans, however. If multiple buffered writes to different inodes fail and invoke eofblocks scans at the same time, each scan avoids a deadlock with its own inode by virtue of the eof_scan_owner field, but will never be able to acquire the iolock of the inode from the parallel scan. Because the low free space scans are invoked with SYNC_WAIT, the scan will not return until it has processed every tagged inode and thus both scans will spin indefinitely on the iolock being held across the opposite scan. This problem can be reproduced reliably by generic/224 on systems with higher cpu counts (x16). 
To avoid this problem, simplify the semantics of eofblocks scans to never invoke a scan while under iolock. This means that the buffered write context must drop the iolock before the scan. It must reacquire the lock before the write retry and also repeat the initial write checks, as the original state might no longer be valid once the iolock was dropped. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 13 +++++++++---- fs/xfs/xfs_icache.c | 45 +++++++-------------------------------------- fs/xfs/xfs_icache.h | 2 -- 3 files changed, 16 insertions(+), 44 deletions(-) (limited to 'fs/xfs/xfs_icache.c') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index bbb9eb6811b2..0a29739f785e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -614,8 +614,10 @@ xfs_file_buffered_aio_write( struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; - int iolock = XFS_IOLOCK_EXCL; + int iolock; +write_retry: + iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); @@ -625,7 +627,6 @@ xfs_file_buffered_aio_write( /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); -write_retry: trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) @@ -641,18 +642,21 @@ write_retry: * running at the same time. 
*/ if (ret == -EDQUOT && !enospc) { + xfs_iunlock(ip, iolock); enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; enospc = xfs_inode_free_quota_cowblocks(ip); if (enospc) goto write_retry; + iolock = 0; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; enospc = 1; xfs_flush_inodes(ip->i_mount); - eofb.eof_scan_owner = ip->i_ino; /* for locking */ + + xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); goto write_retry; @@ -660,7 +664,8 @@ write_retry: current->backing_dev_info = NULL; out: - xfs_iunlock(ip, iolock); + if (iolock) + xfs_iunlock(ip, iolock); return ret; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c6b698f0fed9..7234b9748c36 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1324,11 +1324,8 @@ xfs_inode_free_eofblocks( { int ret = 0; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ trace_xfs_inode_free_eofblocks_invalid(ip); @@ -1356,27 +1353,19 @@ xfs_inode_free_eofblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it here - * to avoid deadlock. - */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } /* * If the caller is waiting, return -EAGAIN to keep the background * scanner moving and revisit the inode in a subsequent pass. 
*/ - if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { if (flags & SYNC_WAIT) ret = -EAGAIN; return ret; } ret = xfs_free_eofblocks(ip); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -1423,15 +1412,10 @@ __xfs_inode_free_quota_eofblocks( struct xfs_eofblocks eofb = {0}; struct xfs_dquot *dq; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - /* - * Set the scan owner to avoid a potential livelock. Otherwise, the scan - * can repeatedly trylock on the inode we're currently processing. We - * run a sync scan to increase effectiveness and use the union filter to + * Run a sync scan to increase effectiveness and use the union filter to * cover all applicable quotas in a single scan. */ - eofb.eof_scan_owner = ip->i_ino; eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { @@ -1583,12 +1567,9 @@ xfs_inode_free_cowblocks( { int ret; struct xfs_eofblocks *eofb = args; - bool need_iolock = true; int match; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - /* * Just clear the tag if we have an empty cow fork or none at all. It's * possible the inode was fully unshared since it was originally tagged. @@ -1621,28 +1602,16 @@ xfs_inode_free_cowblocks( if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && XFS_ISIZE(ip) < eofb->eof_min_file_size) return 0; - - /* - * A scan owner implies we already hold the iolock. Skip it in - * xfs_free_eofblocks() to avoid deadlock. This also eliminates - * the possibility of EAGAIN being returned. 
- */ - if (eofb->eof_scan_owner == ip->i_ino) - need_iolock = false; } /* Free the CoW blocks */ - if (need_iolock) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - } + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); - if (need_iolock) { - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index a1e02f4708ab..8a7c849b4dea 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -27,7 +27,6 @@ struct xfs_eofblocks { kgid_t eof_gid; prid_t eof_prid; __u64 eof_min_file_size; - xfs_ino_t eof_scan_owner; }; #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ @@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user( dst->eof_flags = src->eof_flags; dst->eof_prid = src->eof_prid; dst->eof_min_file_size = src->eof_min_file_size; - dst->eof_scan_owner = NULLFSINO; dst->eof_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { -- cgit From 3802a345321a08093ba2ddb1849e736f84e8d450 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2017 16:45:58 -0800 Subject: xfs: only reclaim unwritten COW extents periodically We only want to reclaim preallocations from our periodic work item. Currently this is achieved by looking for a dirty inode, but that check is rather fragile. Instead add a flag to xfs_reflink_cancel_cow_* so that the caller can ask for just cancelling unwritten extents in the COW fork. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [darrick: fix typos in commit message] Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_reflink.c | 23 ++++++++++++++++------- fs/xfs/xfs_reflink.h | 4 ++-- fs/xfs/xfs_super.c | 2 +- 6 files changed, 22 insertions(+), 13 deletions(-) (limited to 'fs/xfs/xfs_icache.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index bf65a9ea8642..aa8a6f0d09c3 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -293,7 +293,7 @@ xfs_end_io( goto done; if (ioend->io_bio->bi_error) { error = xfs_reflink_cancel_cow_range(ip, - ioend->io_offset, ioend->io_size); + ioend->io_offset, ioend->io_size, true); goto done; } error = xfs_reflink_end_cow(ip, ioend->io_offset, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7234b9748c36..3531f8f72fa5 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1608,7 +1608,7 @@ xfs_inode_free_cowblocks( xfs_ilock(ip, XFS_IOLOCK_EXCL); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index edfa6a55b064..7eaf1ef74e3c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1615,7 +1615,7 @@ xfs_itruncate_extents( /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, - last_block); + last_block, true); if (error) goto out; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index da6d08fb359c..4a84c5ea266d 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -548,14 +548,18 @@ xfs_reflink_trim_irec_to_next_cow( } /* - * Cancel all pending CoW reservations for some block range of an inode. + * Cancel CoW reservations for some block range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. 
*/ int xfs_reflink_cancel_cow_blocks( struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb) + xfs_fileoff_t end_fsb, + bool cancel_real) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; @@ -579,7 +583,7 @@ xfs_reflink_cancel_cow_blocks( &idx, &got, &del); if (error) break; - } else { + } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); @@ -621,13 +625,17 @@ xfs_reflink_cancel_cow_blocks( } /* - * Cancel all pending CoW reservations for some byte range of an inode. + * Cancel CoW reservations for some byte range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_range( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool cancel_real) { struct xfs_trans *tp; xfs_fileoff_t offset_fsb; @@ -653,7 +661,8 @@ xfs_reflink_cancel_cow_range( xfs_trans_ijoin(tp, ip, 0); /* Scrape out the old CoW reservations */ - error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); + error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, + cancel_real); if (error) goto out_cancel; @@ -1450,7 +1459,7 @@ next: * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. 
*/ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 33ac9b8db683..d29a7967f029 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -39,9 +39,9 @@ extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb); + xfs_fileoff_t end_fsb, bool cancel_real); extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count); + xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 890862f2447c..685c042a120f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -953,7 +953,7 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_remove); if (xfs_is_reflink_inode(ip)) { - error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) xfs_warn(ip->i_mount, "Error %d while evicting CoW blocks for inode %llu.", -- cgit