From f88ae46b09e93ef07ac9efaf85df62adb5ba58e6 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 27 Jan 2017 23:16:37 -0800 Subject: xfs: glean crc status from mp not flags in xfs_btree_init_block_int xfs_btree_init_block_int() can determine whether crcs are in effect without the passed-in XFS_BTREE_CRC_BLOCKS flag; the mp argument allows us to determine this from the superblock. Remove the flag from callers, and use xfs_sb_version_hascrc(&mp->m_sb) internally instead. This removes one difference between the if & else cases in the callers. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index bfc00de5c6f1..1d4b8d5edaaf 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -743,7 +743,7 @@ xfs_bmap_extents_to_btree( if (xfs_sb_version_hascrc(&mp->m_sb)) xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, - XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + XFS_BTREE_LONG_PTRS); else xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, XFS_BMAP_MAGIC, 1, 1, ip->i_ino, @@ -820,7 +820,7 @@ try_another_ag: if (xfs_sb_version_hascrc(&mp->m_sb)) xfs_btree_init_block_int(mp, ablock, abp->b_bn, XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + XFS_BTREE_LONG_PTRS); else xfs_btree_init_block_int(mp, ablock, abp->b_bn, XFS_BMAP_MAGIC, 0, 0, ip->i_ino, -- cgit From b6f41e448277ff080fea734b93121e6cd7513f0c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 27 Jan 2017 23:16:39 -0800 Subject: xfs: remove boilerplate around xfs_btree_init_block Now that xfs_btree_init_block_int is able to determine crc status from the passed-in mp, we can determine the proper magic as well if we are given a btree number, rather than an explicit magic value. Change xfs_btree_init_block[_int] callers to pass in the btree number, and let xfs_btree_init_block_int use the xfs_magics array via the xfs_btree_magic macro to determine which magic value is needed. This makes all of the if (crc) / else stanzas identical, and the if/else can be removed, leading to a single, common init_block call. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 19 ++++--------------- fs/xfs/libxfs/xfs_bmap_btree.c | 10 ++-------- fs/xfs/libxfs/xfs_btree.c | 11 +++++------ fs/xfs/libxfs/xfs_btree.h | 4 ++-- fs/xfs/xfs_fsops.c | 31 ++++++------------------------- 5 files changed, 19 insertions(+), 56 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1d4b8d5edaaf..d3da53e6a927 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -740,15 +740,9 @@ xfs_bmap_extents_to_btree( * Fill in the root. */ block = ifp->if_broot; - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, - XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BTNUM_BMAP, 1, 1, ip->i_ino, XFS_BTREE_LONG_PTRS); - else - xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, - XFS_BMAP_MAGIC, 1, 1, ip->i_ino, - XFS_BTREE_LONG_PTRS); - /* * Need a cursor. Can't allocate until bb_level is filled in. */ @@ -817,13 +811,8 @@ try_another_ag: */ abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block_int(mp, ablock, abp->b_bn, - XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); - else - xfs_btree_init_block_int(mp, ablock, abp->b_bn, - XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index a80bf8080b1c..f93072b58a58 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -71,15 +71,9 @@ xfs_bmdr_to_bmbt( xfs_bmbt_key_t *tkp; __be64 *tpp; - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, - XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); - else - xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, - XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); - rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 18afab315445..421efa0ef778 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1100,13 +1100,14 @@ xfs_btree_init_block_int( struct xfs_mount *mp, struct xfs_btree_block *buf, xfs_daddr_t blkno, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, unsigned int flags) { int crc = xfs_sb_version_hascrc(&mp->m_sb); + __u32 magic = xfs_btree_magic(crc, btnum); buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); @@ -1141,14 +1142,14 @@ void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, unsigned int flags) { xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - magic, level, numrecs, owner, flags); + btnum, level, numrecs, owner, flags); } STATIC void @@ -1159,8 +1160,6 @@ xfs_btree_init_block_cur( int numrecs) { __u64 owner; - int crc = xfs_sb_version_hascrc(&cur->bc_mp->m_sb); - xfs_btnum_t btnum = cur->bc_btnum; /* * we can pull the owner from the cursor right now as the different @@ -1174,7 +1173,7 @@ xfs_btree_init_block_cur( owner = cur->bc_private.a.agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - xfs_btree_magic(crc, btnum), level, numrecs, + cur->bc_btnum, level, numrecs, owner, cur->bc_flags); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 95ea6ed0c14b..cdd4f05a5976 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -380,7 +380,7 @@ void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, @@ -391,7 +391,7 @@ xfs_btree_init_block_int( struct xfs_mount *mp, struct xfs_btree_block *buf, xfs_daddr_t blkno, - __u32 magic, + xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 21e3cdbaebbc..6ccaae9eb0ee 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -352,12 +352,7 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, - agno, 0); - else - xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, - agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); @@ -381,12 +376,7 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, - agno, 0); - else - xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, - agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); @@ -413,7 +403,7 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0, + xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0, agno, 0); block = XFS_BUF_TO_BLOCK(bp); @@ -488,12 +478,7 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, - agno, 0); - else - xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, - agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -513,12 +498,8 @@ xfs_growfs_data_private( goto error0; } - if (xfs_sb_version_hascrc(&mp->m_sb)) - xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, + xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO, 0, 0, agno, 0); - else - xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, - 0, agno, 0); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -539,7 +520,7 @@ xfs_growfs_data_private( goto error0; } - xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, + xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC, 0, 0, agno, 0); error = xfs_bwrite(bp); -- cgit From d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Feb 2017 15:13:58 -0800 Subject: xfs: filter out obviously bad btree pointers Don't let anybody load an obviously bad btree pointer. Since the values come from disk, we must return an error, not just ASSERT. Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen --- fs/xfs/libxfs/xfs_bmap.c | 5 +---- fs/xfs/libxfs/xfs_btree.c | 3 ++- fs/xfs/libxfs/xfs_btree.h | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d3da53e6a927..2e91eb66d32f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1267,7 +1267,6 @@ xfs_bmap_read_extents( /* REFERENCED */ xfs_extnum_t room; /* number of entries there's room for */ - bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : @@ -1280,9 +1279,7 @@ xfs_bmap_read_extents( ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); - ASSERT(bno != NULLFSBLOCK); - ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 421efa0ef778..c3decedc9455 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -826,7 +826,8 @@ xfs_btree_read_bufl( xfs_daddr_t d; /* real disk block address */ int error; - ASSERT(fsbno != NULLFSBLOCK); + if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) + return -EFSCORRUPTED; d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, mp->m_bsize, lock, &bp, ops); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index cdd4f05a5976..4bb62580a7fd 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -458,7 +458,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) #define XFS_FSB_SANITY_CHECK(mp,fsb) \ - (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) /* -- cgit From 05a630d76bd3f39baf0eecfa305bed2820796dee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 2 Feb 2017 15:14:01 -0800 Subject: xfs: allow unwritten extents in the CoW fork In the data fork, we only allow extents to perform the following state transitions: delay -> real <-> unwritten There's no way to move directly from a delalloc reservation to an /unwritten/ allocated extent. However, for the CoW fork we want to be able to do the following to each extent: delalloc -> unwritten -> written -> remapped to data fork This will help us to avoid a race in the speculative CoW preallocation code between a first thread that is allocating a CoW extent and a second thread that is remapping part of a file after a write. In order to do this, however, we need two things: first, we have to be able to transition from da to unwritten, and second the function that converts between real and unwritten has to be made aware of the cow fork. Do both of those things. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 80 ++++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 30 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2e91eb66d32f..dcffbb09444e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1850,6 +1850,7 @@ xfs_bmap_add_extent_delay_real( */ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_state(ep, new->br_state); trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); (*nextents)++; @@ -2188,6 +2189,7 @@ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ + int whichfork, xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ @@ -2207,12 +2209,14 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - struct xfs_mount *mp = tp->t_mountp; + struct xfs_mount *mp = ip->i_mount; *logflagsp = 0; cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ASSERT(*idx >= 0); ASSERT(*idx <= xfs_iext_count(ifp)); @@ -2271,7 +2275,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < xfs_iext_count(&ip->i_df) - 1) { + if (*idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2311,7 +2315,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 2, state); - ip->i_d.di_nextents -= 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2354,7 +2359,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2389,7 +2395,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_set_state(ep, newext); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx + 1, 1, state); - ip->i_d.di_nextents--; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2501,7 +2508,8 @@ xfs_bmap_add_extent_unwritten_real( trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2579,7 +2587,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 1, new, state); - ip->i_d.di_nextents++; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2627,7 +2636,8 @@ xfs_bmap_add_extent_unwritten_real( ++*idx; xfs_iext_insert(ip, *idx, 2, &r[0], state); - ip->i_d.di_nextents += 2; + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2681,17 +2691,17 @@ xfs_bmap_add_extent_unwritten_real( } /* update reverse mappings */ - error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new); + error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new); if (error) goto done; /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur, - 0, &tmp_logflags, XFS_DATA_FORK); + 0, &tmp_logflags, whichfork); *logflagsp |= tmp_logflags; if (error) goto done; @@ -2703,7 +2713,7 @@ xfs_bmap_add_extent_unwritten_real( *curp = cur; } - xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); + xfs_bmap_check_leaf_extents(*curp, ip, whichfork); done: *logflagsp |= rval; return error; @@ -4354,10 +4364,16 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_NORM; /* - * A wasdelay extent has been initialized, so shouldn't be flagged - * as unwritten. + * In the data fork, a wasdelay extent has been initialized, so + * shouldn't be flagged as unwritten. + * + * For the cow fork, however, we convert delalloc reservations + * (extents allocated for speculative preallocation) to + * allocated unwritten extents, and only convert the unwritten + * extents to real extents when we're about to write the data. */ - if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && + if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && + (bma->flags & XFS_BMAPI_PREALLOC) && xfs_sb_version_hasextflgbit(&mp->m_sb)) bma->got.br_state = XFS_EXT_UNWRITTEN; @@ -4408,8 +4424,6 @@ xfs_bmapi_convert_unwritten( (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; - ASSERT(whichfork != XFS_COW_FORK); - /* * Modify (by adding) the state flag, if writing. */ @@ -4434,8 +4448,8 @@ xfs_bmapi_convert_unwritten( return error; } - error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, - &bma->cur, mval, bma->firstblock, bma->dfops, + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, + &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops, &tmp_logflags); /* * Log the inode core unconditionally in the unwritten extent conversion @@ -4444,8 +4458,12 @@ xfs_bmapi_convert_unwritten( * in the transaction for the sake of fsync(), even if nothing has * changed, because fsync() will not force the log for this transaction * unless it sees the inode pinned. + * + * Note: If we're only converting cow fork extents, there aren't + * any on-disk updates to make, so we don't need to log anything. */ - bma->logflags |= tmp_logflags | XFS_ILOG_CORE; + if (whichfork != XFS_COW_FORK) + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -4519,15 +4537,15 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & XFS_BMAPI_IGSTATE)); - ASSERT(tp != NULL); + ASSERT(tp != NULL || + (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == + (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); - ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); - ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -5542,8 +5560,8 @@ __xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, - &lastx, &cur, &del, firstblock, dfops, - &logflags); + whichfork, &lastx, &cur, &del, + firstblock, dfops, &logflags); if (error) goto error0; goto nodelete; @@ -5596,8 +5614,9 @@ __xfs_bunmapi( prev.br_state = XFS_EXT_UNWRITTEN; lastx--; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &prev, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &prev, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; @@ -5605,8 +5624,9 @@ __xfs_bunmapi( ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, - ip, &lastx, &cur, &del, - firstblock, dfops, &logflags); + ip, whichfork, &lastx, &cur, + &del, firstblock, dfops, + &logflags); if (error) goto error0; goto nodelete; -- cgit From a14234c72bf41ac96bc8c98e96e2c84b6d4bd4f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Feb 2017 10:50:49 -0800 Subject: xfs: go straight to real allocations for direct I/O COW writes When we allocate COW fork blocks for direct I/O writes we currently first create a delayed allocation, and then convert it to a real allocation once we've got the delayed one. As there is no good reason for that this patch instead makes use call xfs_bmapi_write from the COW allocation path. The only interesting bits are a few tweaks the low-level allocator to allow for this, most notably the need to remove the call to xfs_bmap_extsize_align for the cowextsize in xfs_bmap_btalloc - for the existing convert case it's a no-op, but for the direct allocation case it would blow up our block reservation way beyond what we reserved for the transaction. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 3 +- fs/xfs/xfs_reflink.c | 94 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 68 insertions(+), 29 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index dcffbb09444e..1cee514de19e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2895,13 +2895,14 @@ xfs_bmap_add_extent_hole_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); - ASSERT(whichfork != XFS_COW_FORK); XFS_STATS_INC(mp, xs_add_exlist); state = 0; if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; /* * Check and set flags if this segment has a left neighbor. diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 219bc96bfc71..9bba084c1436 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -389,62 +389,100 @@ __xfs_reflink_allocate_cow( xfs_fileoff_t end_fsb) { struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap; + struct xfs_bmbt_irec imap, got; struct xfs_defer_ops dfops; struct xfs_trans *tp; xfs_fsblock_t first_block; - int nimaps = 1, error; - bool shared; + int nimaps, error, lockmode; + bool shared, trimmed; + xfs_filblks_t resaligned; + xfs_extlen_t resblks; + xfs_extnum_t idx; - xfs_defer_init(&dfops, &first_block); + resaligned = xfs_aligned_fsb_count(*offset_fsb, end_fsb - *offset_fsb, + xfs_get_cowextsz_hint(ip)); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, - XFS_TRANS_RESERVE, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); - /* Read extent from the source file. */ - nimaps = 1; - error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, - &imap, &nimaps, 0); - if (error) - goto out_unlock; - ASSERT(nimaps == 1); + /* + * Even if the extent is not shared we might have a preallocation for + * it in the COW fork. If so use it. + */ + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, *offset_fsb, &idx, &got) && + got.br_startoff <= *offset_fsb) { + /* If we have a real allocation in the COW fork we're done. */ + if (!isnullstartblock(got.br_startblock)) { + xfs_trim_extent(&got, *offset_fsb, + end_fsb - *offset_fsb); + *offset_fsb = got.br_startoff + got.br_blockcount; + goto out_trans_cancel; + } + } else { + nimaps = 1; + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, + &imap, &nimaps, 0); + if (error) + goto out_trans_cancel; + ASSERT(nimaps == 1); + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, + &trimmed); + if (error) + goto out_trans_cancel; + + if (!shared) { + *offset_fsb = imap.br_startoff + imap.br_blockcount; + goto out_trans_cancel; + } - /* Make sure there's a CoW reservation for it. */ - error = xfs_reflink_reserve_cow(ip, &imap, &shared); + *offset_fsb = imap.br_startoff; + end_fsb = imap.br_startoff + imap.br_blockcount; + } + + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, + XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; - if (!shared) { - *offset_fsb = imap.br_startoff + imap.br_blockcount; - goto out_trans_cancel; - } + xfs_trans_ijoin(tp, ip, 0); + + xfs_defer_init(&dfops, &first_block); + nimaps = 1; /* Allocate the entire reservation as unwritten blocks. */ - xfs_trans_ijoin(tp, ip, 0); - error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, + error = xfs_bmapi_write(tp, ip, *offset_fsb, end_fsb - *offset_fsb, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), - &imap, &nimaps, &dfops); + resblks, &imap, &nimaps, &dfops); if (error) - goto out_trans_cancel; + goto out_bmap_cancel; /* Finish up. */ error = xfs_defer_finish(&tp, &dfops, NULL); if (error) - goto out_trans_cancel; + goto out_bmap_cancel; error = xfs_trans_commit(tp); + if (error) + goto out_unlock; *offset_fsb = imap.br_startoff + imap.br_blockcount; + out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; -out_trans_cancel: + +out_bmap_cancel: xfs_defer_cancel(&dfops); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, + XFS_QMOPT_RES_REGBLKS); +out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; } -- cgit From 0e339ef8556d9e567aa7925f8892c263d79430d9 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 13 Feb 2017 22:48:18 -0800 Subject: xfs: handle indlen shortage on delalloc extent merge When a delalloc extent is created, it can be merged with pre-existing, contiguous, delalloc extents. When this occurs, xfs_bmap_add_extent_hole_delay() merges the extents along with the associated indirect block reservations. The expectation here is that the combined worst case indlen reservation is always less than or equal to the indlen reservation for the individual extents. This is not always the case, however, as existing extents can less than the expected indlen reservation if the extent was previously split due to a hole punch. If a new extent merges with such an extent, the total indlen requirement may be larger than the sum of the indlen reservations held by both extents. xfs_bmap_add_extent_hole_delay() assumes that the worst case indlen reservation is always available and assigns it to the merged extent without consideration for the indlen held by the pre-existing extent. As a result, the subsequent xfs_mod_fdblocks() call can attempt an unintentional allocation rather than a free (indicated by an ASSERT() failure). Further, if the allocation happens to fail in this context, the failure goes unhandled and creates a filesystem wide block accounting inconsistency. Fix xfs_bmap_add_extent_hole_delay() to function as designed. Cap the indlen reservation assigned to the merged extent to the sum of the indlen reservations held by each of the individual extents. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1cee514de19e..73c95466c225 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2805,7 +2805,8 @@ xfs_bmap_add_extent_hole_delay( oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock) + startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2826,7 +2827,8 @@ xfs_bmap_add_extent_hole_delay( xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); oldlen = startblockval(left.br_startblock) + startblockval(new->br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), nullstartblock((int)newlen)); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); @@ -2842,7 +2844,8 @@ xfs_bmap_add_extent_hole_delay( temp = new->br_blockcount + right.br_blockcount; oldlen = startblockval(new->br_startblock) + startblockval(right.br_startblock); - newlen = xfs_bmap_worst_indlen(ip, temp); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, nullstartblock((int)newlen), temp, right.br_state); -- cgit From 75d65361cf3c0dae2af970c305e19c727b28a510 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 13 Feb 2017 22:48:30 -0800 Subject: xfs: split indlen reservations fairly when under reserved Certain workoads that punch holes into speculative preallocation can cause delalloc indirect reservation splits when the delalloc extent is split in two. If further splits occur, an already short-handed extent can be split into two in a manner that leaves zero indirect blocks for one of the two new extents. This occurs because the shortage is large enough that the xfs_bmap_split_indlen() algorithm completely drains the requested indlen of one of the extents before it honors the existing reservation. This ultimately results in a warning from xfs_bmap_del_extent(). This has been observed during file copies of large, sparse files using 'cp --sparse=always.' To avoid this problem, update xfs_bmap_split_indlen() to explicitly apply the reservation shortage fairly between both extents. This smooths out the overall indlen shortage and defers the situation where we end up with a delalloc extent with zero indlen reservation to extreme circumstances. Reported-by: Patrick Dung Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 61 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 18 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 73c95466c225..2ae55db8c977 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4795,34 +4795,59 @@ xfs_bmap_split_indlen( xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ xfs_filblks_t stolen = 0; + xfs_filblks_t resfactor; /* * Steal as many blocks as we can to try and satisfy the worst case * indlen for both new extents. */ - while (nres > ores && avail) { - nres--; - avail--; - stolen++; - } + if (ores < nres && avail) + stolen = XFS_FILBLKS_MIN(nres - ores, avail); + ores += stolen; + + /* nothing else to do if we've satisfied the new reservation */ + if (ores >= nres) + return stolen; + + /* + * We can't meet the total required reservation for the two extents. + * Calculate the percent of the overall shortage between both extents + * and apply this percentage to each of the requested indlen values. + * This distributes the shortage fairly and reduces the chances that one + * of the two extents is left with nothing when extents are repeatedly + * split. + */ + resfactor = (ores * 100); + do_div(resfactor, nres); + len1 *= resfactor; + do_div(len1, 100); + len2 *= resfactor; + do_div(len2, 100); + ASSERT(len1 + len2 <= ores); + ASSERT(len1 < *indlen1 && len2 < *indlen2); /* - * The only blocks available are those reserved for the original - * extent and what we can steal from the extent being removed. - * If this still isn't enough to satisfy the combined - * requirements for the two new extents, skim blocks off of each - * of the new reservations until they match what is available. + * Hand out the remainder to each extent. If one of the two reservations + * is zero, we want to make sure that one gets a block first. The loop + * below starts with len1, so hand len2 a block right off the bat if it + * is zero. */ - while (nres > ores) { - if (len1) { - len1--; - nres--; + ores -= (len1 + len2); + ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores); + if (ores && !len2 && *indlen2) { + len2++; + ores--; + } + while (ores) { + if (len1 < *indlen1) { + len1++; + ores--; } - if (nres == ores) + if (!ores) break; - if (len2) { - len2--; - nres--; + if (len2 < *indlen2) { + len2++; + ores--; } } -- cgit From 410d17f67e583559be3a922f8b6cc336331893f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Feb 2017 17:12:51 -0800 Subject: xfs: tune down agno asserts in the bmap code In various places we currently assert that xfs_bmap_btalloc allocates from the same as the firstblock value passed in, unless it's either NULLAGNO or the dop_low flag is set. But the reflink code does not fully follow this convention as it passes in firstblock purely as a hint for the allocator without actually having previous allocations in the transaction, and without having a minleft check on the current AG, leading to the assert firing on a very full and heavily used file system. As even the reflink code only allocates from equal or higher AGs for now we can simply the check to always allow for equal or higher AGs. Note that we need to eventually split the two meanings of the firstblock value. At that point we can also allow the reflink code to allocate from any AG instead of limiting it in any way. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2ae55db8c977..a9c66d47757a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -798,9 +798,7 @@ try_another_ag: */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (dfops->dop_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); *firstblock = cur->bc_private.b.firstblock = args.fsbno; cur->bc_private.b.allocated++; ip->i_d.di_nblocks++; @@ -3822,17 +3820,13 @@ xfs_bmap_btalloc( * the first block that was allocated. */ ASSERT(*ap->firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *ap->firstblock) == - XFS_FSB_TO_AGNO(mp, args.fsbno) || - (ap->dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *ap->firstblock) < - XFS_FSB_TO_AGNO(mp, args.fsbno))); + XFS_FSB_TO_AGNO(mp, *ap->firstblock) <= + XFS_FSB_TO_AGNO(mp, args.fsbno)); ap->blkno = args.fsbno; if (*ap->firstblock == NULLFSBLOCK) *ap->firstblock = args.fsbno; - ASSERT(nullfb || fb_agno == args.agno || - (ap->dfops->dop_low && fb_agno < args.agno)); + ASSERT(nullfb || fb_agno <= args.agno); ap->length = args.len; if (!(ap->flags & XFS_BMAPI_COWFORK)) ap->ip->i_d.di_nblocks += args.len; @@ -4754,13 +4748,9 @@ error0: if (bma.cur) { if (!error) { ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, *firstblock) <= XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock) || - (dfops->dop_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, - bma.cur->bc_private.b.firstblock))); + bma.cur->bc_private.b.firstblock)); *firstblock = bma.cur->bc_private.b.firstblock; } xfs_btree_del_cursor(bma.cur, -- cgit From f65e6fad293b3a5793b7fa2044800506490e7a2e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 8 Mar 2017 09:58:08 -0800 Subject: xfs: use iomap new flag for newly allocated delalloc blocks Commit fa7f138 ("xfs: clear delalloc and cache on buffered write failure") fixed one regression in the iomap error handling code and exposed another. The fundamental problem is that if a buffered write is a rewrite of preexisting delalloc blocks and the write fails, the failure handling code can punch out preexisting blocks with valid file data. This was reproduced directly by sub-block writes in the LTP kernel/syscalls/write/write03 test. A first 100 byte write allocates a single block in a file. A subsequent 100 byte write fails and punches out the block, including the data successfully written by the previous write. To address this problem, update the ->iomap_begin() handler to distinguish newly allocated delalloc blocks from preexisting delalloc blocks via the IOMAP_F_NEW flag. Use this flag in the ->iomap_end() handler to decide when a failed or short write should punch out delalloc blocks. This introduces the subtle requirement that ->iomap_begin() should never combine newly allocated delalloc blocks with existing blocks in the resulting iomap descriptor. This can occur when a new delalloc reservation merges with a neighboring extent that is part of the current write, for example. Therefore, drop the post-allocation extent lookup from xfs_bmapi_reserve_delalloc() and just return the record inserted into the fork. This ensures only new blocks are returned and thus that preexisting delalloc blocks are always handled as "found" blocks and not punched out on a failed rewrite. Reported-by: Xiong Zhou Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 24 ++++++++++++++---------- fs/xfs/xfs_iomap.c | 25 ++++++++++++++++++------- 2 files changed, 32 insertions(+), 17 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a9c66d47757a..bfa59a1a2d09 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4150,6 +4150,19 @@ xfs_bmapi_read( return 0; } +/* + * Add a delayed allocation extent to an inode. Blocks are reserved from the + * global pool and the extent inserted into the inode in-core extent tree. + * + * On entry, got refers to the first extent beyond the offset of the extent to + * allocate or eof is specified if no such extent exists. On return, got refers + * to the extent record that was inserted to the inode fork. + * + * Note that the allocated extent may have been merged with contiguous extents + * during insertion into the inode fork. Thus, got does not reflect the current + * state of the inode fork on return. If necessary, the caller can use lastx to + * look up the updated record in the inode fork. + */ int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, @@ -4236,13 +4249,8 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); - /* - * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay - * might have merged it into one of the neighbouring ones. - */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); /* * Tag the inode if blocks were preallocated. Note that COW fork @@ -4254,10 +4262,6 @@ xfs_bmapi_reserve_delalloc( if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) xfs_inode_set_cowblocks_tag(ip); - ASSERT(got->br_startoff <= aoff); - ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); - ASSERT(isnullstartblock(got->br_startblock)); - ASSERT(got->br_state == XFS_EXT_NORM); return 0; out_unreserve_blocks: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 41662fb14e87..288ee5b840d7 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -630,6 +630,11 @@ retry: goto out_unlock; } + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. + */ + iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -1071,16 +1076,22 @@ xfs_file_iomap_end_delalloc( struct xfs_inode *ip, loff_t offset, loff_t length, - ssize_t written) + ssize_t written, + struct iomap *iomap) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_fsb; xfs_fileoff_t end_fsb; int error = 0; - /* behave as if the write failed if drop writes is enabled */ - if (xfs_mp_drop_writes(mp)) + /* + * Behave as if the write failed if drop writes is enabled. Set the NEW + * flag to force delalloc cleanup. + */ + if (xfs_mp_drop_writes(mp)) { + iomap->flags |= IOMAP_F_NEW; written = 0; + } /* * start_fsb refers to the first unused block after a short write. If @@ -1094,14 +1105,14 @@ xfs_file_iomap_end_delalloc( end_fsb = XFS_B_TO_FSB(mp, offset + length); /* - * Trim back delalloc blocks if we didn't manage to write the whole - * range reserved. + * Trim delalloc blocks if they were allocated by this write and we + * didn't manage to write the whole range. * * We don't need to care about racing delalloc as we hold i_mutex * across the reserve/allocate/unreserve calls. If there are delalloc * blocks in the range, they are ours. */ - if (start_fsb < end_fsb) { + if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), XFS_FSB_TO_B(mp, end_fsb) - 1); @@ -1131,7 +1142,7 @@ xfs_file_iomap_end( { if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, - length, written); + length, written, iomap); return 0; } -- cgit From 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Mar 2017 10:38:53 -0800 Subject: xfs: try any AG when allocating the first btree block when reflinking When a reflink operation causes the bmap code to allocate a btree block we're currently doing single-AG allocations due to having ->firstblock set and then try any higher AG due a little reflink quirk we've put in when adding the reflink code. But given that we do not have a minleft reservation of any kind in this AG we can still not have any space in the same or higher AG even if the file system has enough free space. To fix this use a XFS_ALLOCTYPE_FIRST_AG allocation in this fall back path instead. [And yes, we need to redo this properly instead of piling hacks over hacks. I'm working on that, but it's not going to be a small series. In the meantime this fixes the customer reported issue] Also add a warning for failing allocations to make it easier to debug. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 10 +++++++--- fs/xfs/libxfs/xfs_bmap_btree.c | 6 +++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs/xfs/libxfs/xfs_bmap.c') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index bfa59a1a2d09..9bd104f32908 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -763,8 +763,8 @@ xfs_bmap_extents_to_btree( args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { -try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; +try_another_ag: args.fsbno = *firstblock; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -790,13 +790,17 @@ try_another_ag: if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && args.fsbno == NULLFSBLOCK && args.type == XFS_ALLOCTYPE_NEAR_BNO) { - dfops->dop_low = true; + args.type = XFS_ALLOCTYPE_FIRST_AG; goto try_another_ag; } + if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return -ENOSPC; + } /* * Allocation can't fail, the space was reserved. */ - ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(*firstblock == NULLFSBLOCK || args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); *firstblock = cur->bc_private.b.firstblock = args.fsbno; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index f93072b58a58..fd55db479385 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -447,8 +447,8 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); -try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; +try_another_ag: /* * Make sure there is sufficient room left in the AG to * complete a full tree split for an extent insert. If @@ -488,8 +488,8 @@ try_another_ag: if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && args.fsbno == NULLFSBLOCK && args.type == XFS_ALLOCTYPE_NEAR_BNO) { - cur->bc_private.b.dfops->dop_low = true; args.fsbno = cur->bc_private.b.firstblock; + args.type = XFS_ALLOCTYPE_FIRST_AG; goto try_another_ag; } @@ -506,7 +506,7 @@ try_another_ag: goto error0; cur->bc_private.b.dfops->dop_low = true; } - if (args.fsbno == NULLFSBLOCK) { + if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; -- cgit