diff options
Diffstat (limited to 'fs/xfs')
104 files changed, 3233 insertions, 2114 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 9345802c99f7..1ef8acf35e7d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -339,14 +339,14 @@ xfs_ag_init_headers( { /* BNO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_cntbt_buf_ops, .work = &xfs_cntroot_init, .need_init = true }, @@ -361,7 +361,7 @@ xfs_ag_init_headers( { /* FINO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_inobt_buf_ops, + .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) @@ -414,7 +414,6 @@ xfs_ag_extend_space( struct aghdr_init_data *id, xfs_extlen_t len) { - struct xfs_owner_info oinfo; struct xfs_buf *bp; struct xfs_agi *agi; struct xfs_agf *agf; @@ -448,17 +447,17 @@ xfs_ag_extend_space( /* * Free the new space. * - * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that + * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); error = xfs_rmap_free(tp, bp, id->agno, be32_to_cpu(agf->agf_length) - len, - len, &oinfo); + len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) return error; return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno, be32_to_cpu(agf->agf_length) - len), - len, &oinfo, XFS_AG_RESV_NONE); + len, &XFS_RMAP_OINFO_SKIP_UPDATE, + XFS_AG_RESV_NONE); } diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e701ebc36c06..e2ba2a3b63b2 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -281,7 +281,7 @@ xfs_ag_resv_init( */ ask = used = 0; - mp->m_inotbt_nores = true; + mp->m_finobt_nores = true; error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index e1c0c0d2f1b0..bc3367b8b7bb 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -568,9 +568,9 @@ xfs_agfl_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return NULL; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) + if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) return __this_address; - if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -643,6 +643,7 @@ xfs_agfl_write_verify( const struct xfs_buf_ops xfs_agfl_buf_ops = { .name = "xfs_agfl", + .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, .verify_struct = xfs_agfl_verify, @@ -1594,7 +1595,6 @@ xfs_alloc_ag_vextent_small( xfs_extlen_t *flenp, /* result length */ int *stat) /* status: 0-freelist, 1-normal/none */ { - struct xfs_owner_info oinfo; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1648,9 +1648,8 @@ xfs_alloc_ag_vextent_small( * doesn't live in the free space, we need to clear * out the OWN_AG rmap. */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_free(args->tp, args->agbp, args->agno, - fbno, 1, &oinfo); + fbno, 1, &XFS_RMAP_OINFO_AG); if (error) goto error0; @@ -1694,28 +1693,28 @@ error0: */ STATIC int xfs_free_ag_extent( - xfs_trans_t *tp, - xfs_buf_t *agbp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) { - xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ - xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ - int error; /* error return value */ - xfs_agblock_t gtbno; /* start of right neighbor block */ - xfs_extlen_t gtlen; /* length of right neighbor block */ - int haveleft; /* have a left neighbor block */ - int haveright; /* have a right neighbor block */ - int i; /* temp, result code */ - xfs_agblock_t ltbno; /* start of left neighbor block */ - xfs_extlen_t ltlen; /* length of left neighbor block */ - xfs_mount_t *mp; /* mount point struct for filesystem */ - xfs_agblock_t nbno; /* new starting block of freespace */ - xfs_extlen_t nlen; /* new length of freespace */ - xfs_perag_t *pag; /* per allocation group data */ + struct xfs_mount *mp; + struct xfs_perag *pag; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + xfs_agblock_t gtbno; /* start of right neighbor */ + xfs_extlen_t gtlen; /* length of right neighbor */ + xfs_agblock_t ltbno; /* start of left neighbor */ + xfs_extlen_t ltlen; /* length of left neighbor */ + xfs_agblock_t nbno; /* new starting block of freesp */ + xfs_extlen_t nlen; /* new length of freespace */ + int haveleft; /* have a left neighbor */ + int haveright; /* have a right neighbor */ + int i; + int error; bno_cur = cnt_cur = NULL; mp = tp->t_mountp; @@ -2314,10 +2313,11 @@ xfs_alloc_fix_freelist( * repair/rmap.c in xfsprogs for details. */ memset(&targs, 0, sizeof(targs)); + /* struct copy below */ if (flags & XFS_ALLOC_FLAG_NORMAP) - xfs_rmap_skip_owner_update(&targs.oinfo); + targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; else - xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG); + targs.oinfo = XFS_RMAP_OINFO_AG; while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) @@ -2435,7 +2435,6 @@ xfs_alloc_get_freelist( be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; - xfs_perag_put(pag); logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; if (btreeblk) { @@ -2443,6 +2442,7 @@ xfs_alloc_get_freelist( pag->pagf_btreeblks++; logflags |= XFS_AGF_BTREEBLKS; } + xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; @@ -2588,8 +2588,10 @@ xfs_agf_verify( return __this_address; } - if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + if (!xfs_verify_magic(bp, agf->agf_magicnum)) + return __this_address; + + if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && @@ -2671,6 +2673,7 @@ xfs_agf_write_verify( const struct xfs_buf_ops xfs_agf_buf_ops = { .name = "xfs_agf", + .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, .verify_struct = xfs_agf_verify, @@ -3008,21 +3011,21 @@ out: * Just break up the extent address and hand off to xfs_free_ag_extent * after fixing up the freelist. */ -int /* error */ +int __xfs_free_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo, /* extent owner */ - enum xfs_ag_resv_type type, /* block reservation type */ - bool skip_discard) + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, + bool skip_discard) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *agbp; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); - int error; - unsigned int busy_flags = 0; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + int error; + unsigned int busy_flags = 0; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 00cd5ec4cb6b..d6ed5d2c07c2 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -182,7 +182,7 @@ __xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo, /* extent owner */ + const struct xfs_owner_info *oinfo, /* extent owner */ enum xfs_ag_resv_type type, /* block reservation type */ bool skip_discard); @@ -191,7 +191,7 @@ xfs_free_extent( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_extlen_t len, - struct xfs_owner_info *oinfo, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { return __xfs_free_extent(tp, bno, len, oinfo, type, false); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 4e59cc8a2802..9fe949f6055e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -297,48 +297,34 @@ xfs_allocbt_verify( struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; + xfs_btnum_t btnum = XFS_BTNUM_BNOi; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; + } /* - * magic number and level verification - * - * During growfs operations, we can't verify the exact level or owner as - * the perag is not fully initialised and hence not attached to the - * buffer. In this case, check against the maximum tree depth. + * The perag may not be attached during grow operations or fully + * initialized from the AGF during log recovery. Therefore we can only + * check against maximum tree depth from those contexts. * - * Similarly, during log recovery we will have a perag structure - * attached, but the agf information will not yet have been initialised - * from the on disk AGF. Again, we can only check against maximum limits - * in this case. + * Otherwise check against the per-tree limit. Peek at one of the + * verifier magic values to determine the type of tree we're verifying + * against. */ level = be16_to_cpu(block->bb_level); - switch (block->bb_magic) { - case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) + if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) + btnum = XFS_BTNUM_CNTi; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[btnum]) return __this_address; - break; - case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) - return __this_address; - break; - default: + } else if (level >= mp->m_ag_maxlevels) return __this_address; - } return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } @@ -377,13 +363,23 @@ xfs_allocbt_write_verify( } -const struct xfs_buf_ops xfs_allocbt_buf_ops = { - .name = "xfs_allocbt", +const struct xfs_buf_ops xfs_bnobt_buf_ops = { + .name = "xfs_bnobt", + .magic = { cpu_to_be32(XFS_ABTB_MAGIC), + cpu_to_be32(XFS_ABTB_CRC_MAGIC) }, .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, .verify_struct = xfs_allocbt_verify, }; +const struct xfs_buf_ops xfs_cntbt_buf_ops = { + .name = "xfs_cntbt", + .magic = { cpu_to_be32(XFS_ABTC_MAGIC), + cpu_to_be32(XFS_ABTC_CRC_MAGIC) }, + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, + .verify_struct = xfs_allocbt_verify, +}; STATIC int xfs_bnobt_keys_inorder( @@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_bnobt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_bnobt_buf_ops, .diff_two_keys = xfs_bnobt_diff_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, @@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_cntbt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_cntbt_buf_ops, .diff_two_keys = xfs_cntbt_diff_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 844ed87b1900..2dd9ee2a2e08 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args) xfs_da_state_free(state); return retval; } + +/* Returns true if the attribute entry name is valid. */ +bool +xfs_attr_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any nulls here */ + return !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index bdf52a333f3f..2297d8467666 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); - +bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 6fc5425b1474..1f6e3965ff74 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -243,27 +243,16 @@ xfs_attr3_leaf_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *entries; - uint16_t end; + uint32_t end; /* must be 32bit - see below */ int i; + xfs_failaddr_t fa; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) - return __this_address; - } /* * In recovery there is a transient state where count == 0 is valid * because we may have transitioned an empty shortform attr to a leaf @@ -293,6 +282,11 @@ xfs_attr3_leaf_verify( /* * Quickly check the freemap information. Attribute data has to be * aligned to 4-byte boundaries, and likewise for the free space. + * + * Note that for 64k block size filesystems, the freemap entries cannot + * overflow as they are only be16 fields. However, when checking end + * pointer of the freemap, we have to be careful to detect overflows and + * so use uint32_t for those checks. */ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { if (ichdr.freemap[i].base > mp->m_attr_geo->blksize) @@ -303,7 +297,9 @@ xfs_attr3_leaf_verify( return __this_address; if (ichdr.freemap[i].size & 0x3) return __this_address; - end = ichdr.freemap[i].base + ichdr.freemap[i].size; + + /* be care of 16 bit overflows here */ + end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size; if (end < ichdr.freemap[i].base) return __this_address; if (end > mp->m_attr_geo->blksize) @@ -362,6 +358,8 @@ xfs_attr3_leaf_read_verify( const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", + .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, .verify_struct = xfs_attr3_leaf_verify, diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d89363c6b523..65ff600a8067 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok( static xfs_failaddr_t xfs_attr3_rmt_verify( struct xfs_mount *mp, + struct xfs_buf *bp, void *ptr, int fsbsize, xfs_daddr_t bno) @@ -87,7 +88,7 @@ xfs_attr3_rmt_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify( *failaddr = __this_address; return -EFSBADCRC; } - *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (*failaddr) return -EFSCORRUPTED; len -= blksize; @@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify( const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .name = "xfs_attr3_rmt", + .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) }, .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, .verify_struct = xfs_attr3_rmt_verify_struct, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 74d7228e755b..48502cb9990f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -536,7 +536,7 @@ __xfs_bmap_add_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo, + const struct xfs_owner_info *oinfo, bool skip_discard) { struct xfs_extent_free_item *new; /* new element */ @@ -564,7 +564,7 @@ __xfs_bmap_add_free( if (oinfo) new->xefi_oinfo = *oinfo; else - xfs_rmap_skip_owner_update(&new->xefi_oinfo); + new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; new->xefi_skip_discard = skip_discard; trace_xfs_bmap_free_defer(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, @@ -577,42 +577,44 @@ __xfs_bmap_add_free( */ /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. + * Convert the inode format to extent format if it currently is in btree format, + * but the extent list is small enough that it fits into the extent format. + * + * Since the extents are already in-core, all we have to do is give up the space + * for the btree root and pitch the leaf block. */ STATIC int /* error */ xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - /* REFERENCED */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ xfs_fsblock_t cbno; /* child block number */ xfs_buf_t *cbp; /* child block's buffer */ int error; /* error return value */ - struct xfs_ifork *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ struct xfs_owner_info oinfo; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); + /* check if we actually need the extent format first: */ + if (!xfs_bmap_wants_extents(ip, whichfork)) + return 0; + + ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); - *logflagsp = 0; #ifdef DEBUG XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, xfs_btree_check_lptr(cur, cbno, 1)); @@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -1694,10 +1696,13 @@ xfs_bmap_add_extent_delay_real( case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. - * The right neighbor is contiguous, the left is not. + * The right neighbor is contiguous, the left is not. Take care + * with delay -> unwritten extent allocation here because the + * delalloc record we are overwriting is always written. */ PREV.br_startblock = new->br_startblock; PREV.br_blockcount += RIGHT.br_blockcount; + PREV.br_state = new->br_state; xfs_iext_next(ifp, &bma->icur); xfs_iext_remove(bma->ip, &bma->icur, state); @@ -2026,7 +2031,7 @@ done: /* * Convert an unwritten allocation to a real allocation or vice versa. */ -STATIC int /* error */ +int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ @@ -3450,7 +3455,7 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; - xfs_rmap_skip_owner_update(&args.oinfo); + args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); @@ -3682,17 +3687,6 @@ xfs_trim_extent( } } -/* trim extent to within eof */ -void -xfs_trim_extent_eof( - struct xfs_bmbt_irec *irec, - struct xfs_inode *ip) - -{ - xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, - i_size_read(VFS_I(ip)))); -} - /* * Trim the returned map to the required bounds */ @@ -4200,6 +4194,44 @@ xfs_bmapi_convert_unwritten( return 0; } +static inline xfs_extlen_t +xfs_bmapi_minleft( + struct xfs_trans *tp, + struct xfs_inode *ip, + int fork) +{ + if (tp && tp->t_firstblock != NULLFSBLOCK) + return 0; + if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + return 1; + return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; +} + +/* + * Log whatever the flags say, even if error. Otherwise we might miss detecting + * a case where the data is changed, there's an error, and it's not logged so we + * don't shutdown when we should. Don't bother logging extents/btree changes if + * we converted to the other format. + */ +static void +xfs_bmapi_finish( + struct xfs_bmalloca *bma, + int whichfork, + int error) +{ + if ((bma->logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + bma->logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + bma->logflags &= ~xfs_ilog_fbroot(whichfork); + + if (bma->logflags) + xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags); + if (bma->cur) + xfs_btree_del_cursor(bma->cur, error); +} + /* * Map file blocks to filesystem blocks, and allocate blocks or convert the * extent state if necessary. Details behaviour is controlled by the flags @@ -4244,9 +4276,7 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); - ASSERT(tp != NULL || - (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == - (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); + ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -4279,25 +4309,12 @@ xfs_bmapi_write( XFS_STATS_INC(mp, xs_blk_mapw); - if (!tp || tp->t_firstblock == NULLFSBLOCK) { - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; - else - bma.minleft = 1; - } else { - bma.minleft = 0; - } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); if (error) goto error0; } - n = 0; - end = bno + len; - obno = bno; - if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) @@ -4306,7 +4323,11 @@ xfs_bmapi_write( bma.ip = ip; bma.total = total; bma.datatype = 0; + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + n = 0; + end = bno + len; + obno = bno; while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false; @@ -4320,26 +4341,7 @@ xfs_bmapi_write( ASSERT(!((flags & XFS_BMAPI_CONVERT) && (flags & XFS_BMAPI_COWFORK))); - if (flags & XFS_BMAPI_DELALLOC) { - /* - * For the COW fork we can reasonably get a - * request for converting an extent that races - * with other threads already having converted - * part of it, as there converting COW to - * regular blocks is not protected using the - * IOLOCK. - */ - ASSERT(flags & XFS_BMAPI_COWFORK); - if (!(flags & XFS_BMAPI_COWFORK)) { - error = -EIO; - goto error0; - } - - if (eof || bno >= end) - break; - } else { - need_alloc = true; - } + need_alloc = true; } else if (isnullstartblock(bma.got.br_startblock)) { wasdelay = true; } @@ -4348,8 +4350,7 @@ xfs_bmapi_write( * First, deal with the hole before the allocated space * that we found, if any. */ - if ((need_alloc || wasdelay) && - !(flags & XFS_BMAPI_CONVERT_ONLY)) { + if (need_alloc || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4417,49 +4418,130 @@ xfs_bmapi_write( } *nmap = n; - /* - * Transform from btree to extents, give it cur. - */ - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - ASSERT(bma.cur); - error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, - &tmp_logflags, whichfork); - bma.logflags |= tmp_logflags; - if (error) - goto error0; - } + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto error0; ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork)); - error = 0; + xfs_bmapi_finish(&bma, whichfork, 0); + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return 0; error0: + xfs_bmapi_finish(&bma, whichfork, error); + return error; +} + +/* + * Convert an existing delalloc extent to real blocks based on file offset. This + * attempts to allocate the entire delalloc extent and may require multiple + * invocations to allocate the target offset if a large enough physical extent + * is not available. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap, + unsigned int *seq) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmalloca bma = { NULL }; + struct xfs_trans *tp; + int error; + /* - * Log everything. Do this after conversion, there's no point in - * logging the extent records if we've converted to btree format. + * Space for the extent and indirect blocks was reserved when the + * delalloc extent was created so there's no need to do so here. */ - if ((bma.logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - bma.logflags &= ~xfs_ilog_fext(whichfork); - else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - bma.logflags &= ~xfs_ilog_fbroot(whichfork); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || + bma.got.br_startoff > offset_fsb) { + /* + * No extent found in the range we are trying to convert. This + * should only happen for the COW fork, where another thread + * might have moved the extent to the data fork in the meantime. + */ + WARN_ON_ONCE(whichfork != XFS_COW_FORK); + error = -EAGAIN; + goto out_trans_cancel; + } + /* - * Log whatever the flags say, even if error. Otherwise we might miss - * detecting a case where the data is changed, there's an error, - * and it's not logged so we don't shutdown when we should. + * If we find a real extent here we raced with another thread converting + * the extent. Just return the real extent at this offset. */ - if (bma.logflags) - xfs_trans_log_inode(tp, ip, bma.logflags); + if (!isnullstartblock(bma.got.br_startblock)) { + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + goto out_trans_cancel; + } + + bma.tp = tp; + bma.ip = ip; + bma.wasdel = true; + bma.offset = bma.got.br_startoff; + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + if (whichfork == XFS_COW_FORK) + bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + + if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; - if (bma.cur) { - xfs_btree_del_cursor(bma.cur, error); + error = xfs_bmapi_allocate(&bma); + if (error) + goto out_finish; + + error = -ENOSPC; + if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) + goto out_finish; + error = -EFSCORRUPTED; + if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) + goto out_finish; + + XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); + XFS_STATS_INC(mp, xs_xstrat_quick); + + ASSERT(!isnullstartblock(bma.got.br_startblock)); + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, + bma.length); + if (error) + goto out_finish; } - if (!error) - xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto out_finish; + + xfs_bmapi_finish(&bma, whichfork, 0); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_finish: + xfs_bmapi_finish(&bma, whichfork, error); +out_trans_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -4533,13 +4615,7 @@ xfs_bmapi_remap( if (error) goto error0; - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, whichfork); - logflags |= tmp_logflags; - } + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) @@ -5403,24 +5479,11 @@ nodelete: error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; - if (error) - goto error0; - } - /* - * transform from btree to extents, give it cur - */ - else if (xfs_bmap_wants_extents(ip, whichfork)) { - ASSERT(cur != NULL); - error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + } else { + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; } - /* - * transform from extents to local? - */ - error = 0; + error0: /* * Log everything. Do this after conversion, there's no point in diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 488dc8860fd7..8f597f9abdbe 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -95,12 +95,6 @@ struct xfs_extent_free_item /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK 0x200 -/* Only convert delalloc space, don't allocate entirely new extents */ -#define XFS_BMAPI_DELALLOC 0x400 - -/* Only convert unwritten extents, don't allocate new blocks */ -#define XFS_BMAPI_CONVERT_ONLY 0x800 - /* Skip online discard of freed extents */ #define XFS_BMAPI_NODISCARD 0x1000 @@ -117,8 +111,6 @@ struct xfs_extent_free_item { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ - { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ - { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" } @@ -181,12 +173,11 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); -void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, - xfs_filblks_t len, struct xfs_owner_info *oinfo, + xfs_filblks_t len, const struct xfs_owner_info *oinfo, bool skip_discard); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, @@ -228,13 +219,20 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, + unsigned int *seq); +int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork, + struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, int *logflagsp); static inline void xfs_bmap_add_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo) + const struct xfs_owner_info *oinfo) { __xfs_bmap_add_free(tp, bno, len, oinfo, false); } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cdb74d2e2a43..aff82ed112c9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -416,8 +416,10 @@ xfs_bmbt_verify( xfs_failaddr_t fa; unsigned int level; - switch (block->bb_magic) { - case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. @@ -425,11 +427,6 @@ xfs_bmbt_verify( fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_BMAP_MAGIC): - break; - default: - return __this_address; } /* @@ -481,6 +478,8 @@ xfs_bmbt_write_verify( const struct xfs_buf_ops xfs_bmbt_buf_ops = { .name = "xfs_bmbt", + .magic = { cpu_to_be32(XFS_BMAP_MAGIC), + cpu_to_be32(XFS_BMAP_CRC_MAGIC) }, .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, .verify_struct = xfs_bmbt_verify, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 34c6d7bd4d18..bbdae2b4559f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -330,7 +330,7 @@ xfs_btree_sblock_verify_crc( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) - return __this_address; + return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 376bee94b5dd..e2737e2ac2ae 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } +/* + * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only + * accessible on v5 filesystems. This header format is common across da node, + * attr leaf and dir leaf blocks. + */ +xfs_failaddr_t +xfs_da3_blkinfo_verify( + struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *hdr = &hdr3->hdr; + + if (!xfs_verify_magic16(bp, hdr->magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return __this_address; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return __this_address; + } + + return NULL; +} + static xfs_failaddr_t xfs_da3_node_verify( struct xfs_buf *bp) @@ -124,27 +152,16 @@ xfs_da3_node_verify( struct xfs_da_intnode *hdr = bp->b_addr; struct xfs_da3_icnode_hdr ichdr; const struct xfs_dir_ops *ops; + xfs_failaddr_t fa; ops = xfs_dir_get_ops(mp, NULL); ops->node_hdr_from_disk(&ichdr, hdr); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_DA3_NODE_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_DA_NODE_MAGIC) - return __this_address; - } if (ichdr.level == 0) return __this_address; if (ichdr.level > XFS_DA_NODE_MAXDEPTH) @@ -257,6 +274,8 @@ xfs_da3_node_verify_struct( const struct xfs_buf_ops xfs_da3_node_buf_ops = { .name = "xfs_da3_node", + .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC), + cpu_to_be16(XFS_DA3_NODE_MAGIC) }, .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, .verify_struct = xfs_da3_node_verify_struct, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5d5bf3bffc78..ae654e06b2fb 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog); } +xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3); + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index e792b167150a..94f00427de98 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -172,7 +172,13 @@ * reoccur. */ -static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; +static const struct xfs_defer_op_type *defer_op_types[] = { + [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, + [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type, + [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, + [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, + [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, +}; /* * For each pending item in the intake list, log its intent item and the @@ -185,15 +191,15 @@ xfs_defer_create_intents( { struct list_head *li; struct xfs_defer_pending *dfp; + const struct xfs_defer_op_type *ops; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { - dfp->dfp_intent = dfp->dfp_type->create_intent(tp, - dfp->dfp_count); + ops = defer_op_types[dfp->dfp_type]; + dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); trace_xfs_defer_create_intent(tp->t_mountp, dfp); - list_sort(tp->t_mountp, &dfp->dfp_work, - dfp->dfp_type->diff_items); + list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); list_for_each(li, &dfp->dfp_work) - dfp->dfp_type->log_item(tp, dfp->dfp_intent, li); + ops->log_item(tp, dfp->dfp_intent, li); } } @@ -204,14 +210,16 @@ xfs_defer_trans_abort( struct list_head *dop_pending) { struct xfs_defer_pending *dfp; + const struct xfs_defer_op_type *ops; trace_xfs_defer_trans_abort(tp, _RET_IP_); /* Abort intent items that don't have a done item. */ list_for_each_entry(dfp, dop_pending, dfp_list) { + ops = defer_op_types[dfp->dfp_type]; trace_xfs_defer_pending_abort(tp->t_mountp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { - dfp->dfp_type->abort_intent(dfp->dfp_intent); + ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; } } @@ -315,18 +323,20 @@ xfs_defer_cancel_list( struct xfs_defer_pending *pli; struct list_head *pwi; struct list_head *n; + const struct xfs_defer_op_type *ops; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. */ list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) { + ops = defer_op_types[dfp->dfp_type]; trace_xfs_defer_cancel_list(mp, dfp); list_del(&dfp->dfp_list); list_for_each_safe(pwi, n, &dfp->dfp_work) { list_del(pwi); dfp->dfp_count--; - dfp->dfp_type->cancel_item(pwi); + ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); kmem_free(dfp); @@ -350,7 +360,7 @@ xfs_defer_finish_noroll( struct list_head *n; void *state; int error = 0; - void (*cleanup_fn)(struct xfs_trans *, void *, int); + const struct xfs_defer_op_type *ops; LIST_HEAD(dop_pending); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -373,18 +383,18 @@ xfs_defer_finish_noroll( /* Log an intent-done item for the first pending item. */ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); + ops = defer_op_types[dfp->dfp_type]; trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); - dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, + dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent, dfp->dfp_count); - cleanup_fn = dfp->dfp_type->finish_cleanup; /* Finish the work items. */ state = NULL; list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; - error = dfp->dfp_type->finish_item(*tp, li, - dfp->dfp_done, &state); + error = ops->finish_item(*tp, li, dfp->dfp_done, + &state); if (error == -EAGAIN) { /* * Caller wants a fresh transaction; @@ -400,8 +410,8 @@ xfs_defer_finish_noroll( * xfs_defer_cancel will take care of freeing * all these lists and stuff. */ - if (cleanup_fn) - cleanup_fn(*tp, state, error); + if (ops->finish_cleanup) + ops->finish_cleanup(*tp, state, error); goto out; } } @@ -413,20 +423,19 @@ xfs_defer_finish_noroll( * a Fresh Transaction while Finishing * Deferred Work" above. */ - dfp->dfp_intent = dfp->dfp_type->create_intent(*tp, + dfp->dfp_intent = ops->create_intent(*tp, dfp->dfp_count); dfp->dfp_done = NULL; list_for_each(li, &dfp->dfp_work) - dfp->dfp_type->log_item(*tp, dfp->dfp_intent, - li); + ops->log_item(*tp, dfp->dfp_intent, li); } else { /* Done with the dfp, free it. */ list_del(&dfp->dfp_list); kmem_free(dfp); } - if (cleanup_fn) - cleanup_fn(*tp, state, error); + if (ops->finish_cleanup) + ops->finish_cleanup(*tp, state, error); } out: @@ -486,8 +495,10 @@ xfs_defer_add( struct list_head *li) { struct xfs_defer_pending *dfp = NULL; + const struct xfs_defer_op_type *ops; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); /* * Add the item to a pending item at the end of the intake list. @@ -497,15 +508,15 @@ xfs_defer_add( if (!list_empty(&tp->t_dfops)) { dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, dfp_list); - if (dfp->dfp_type->type != type || - (dfp->dfp_type->max_items && - dfp->dfp_count >= dfp->dfp_type->max_items)) + ops = defer_op_types[dfp->dfp_type]; + if (dfp->dfp_type != type || + (ops->max_items && dfp->dfp_count >= ops->max_items)) dfp = NULL; } if (!dfp) { dfp = kmem_alloc(sizeof(struct xfs_defer_pending), KM_SLEEP | KM_NOFS); - dfp->dfp_type = defer_op_types[type]; + dfp->dfp_type = type; dfp->dfp_intent = NULL; dfp->dfp_done = NULL; dfp->dfp_count = 0; @@ -517,14 +528,6 @@ xfs_defer_add( dfp->dfp_count++; } -/* Initialize a deferred operation list. */ -void -xfs_defer_init_op_type( - const struct xfs_defer_op_type *type) -{ - defer_op_types[type->type] = type; -} - /* * Move deferred ops from one transaction to another and reset the source to * initial state. This is primarily used to carry state forward across diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 2584a5b95b0d..7c28d7608ac6 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -9,20 +9,6 @@ struct xfs_defer_op_type; /* - * Save a log intent item and a list of extents, so that we can replay - * whatever action had to happen to the extent list and file the log done - * item. - */ -struct xfs_defer_pending { - const struct xfs_defer_op_type *dfp_type; /* function pointers */ - struct list_head dfp_list; /* pending items */ - void *dfp_intent; /* log intent item */ - void *dfp_done; /* log done item */ - struct list_head dfp_work; /* work items */ - unsigned int dfp_count; /* # extent items */ -}; - -/* * Header for deferred operation list. */ enum xfs_defer_ops_type { @@ -34,6 +20,20 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_MAX, }; +/* + * Save a log intent item and a list of extents, so that we can replay + * whatever action had to happen to the extent list and file the log done + * item. + */ +struct xfs_defer_pending { + struct list_head dfp_list; /* pending items */ + struct list_head dfp_work; /* work items */ + void *dfp_intent; /* log intent item */ + void *dfp_done; /* log done item */ + unsigned int dfp_count; /* # extent items */ + enum xfs_defer_ops_type dfp_type; +}; + void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, struct list_head *h); int xfs_defer_finish_noroll(struct xfs_trans **tp); @@ -43,8 +43,6 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { - enum xfs_defer_ops_type type; - unsigned int max_items; void (*abort_intent)(void *); void *(*create_done)(struct xfs_trans *, void *, unsigned int); int (*finish_item)(struct xfs_trans *, struct list_head *, void *, @@ -54,8 +52,13 @@ struct xfs_defer_op_type { int (*diff_items)(void *, struct list_head *, struct list_head *); void *(*create_intent)(struct xfs_trans *, uint); void (*log_item)(struct xfs_trans *, void *, struct list_head *); + unsigned int max_items; }; -void xfs_defer_init_op_type(const struct xfs_defer_op_type *type); +extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; +extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; +extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; +extern const struct xfs_defer_op_type xfs_extent_free_defer_type; +extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 229152cd1a24..156ce95c9c45 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -703,3 +703,20 @@ xfs_dir2_shrink_inode( xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } + +/* Returns true if the directory entry name is valid. */ +bool +xfs_dir2_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any slashes or nulls here */ + return !memchr(name, '/', length) && !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index c3e3f6b813d8..f54244779492 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); +bool xfs_dir2_namecheck(const void *name, size_t length); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 30ed5919da72..b7d6d78f4ce2 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -53,18 +53,16 @@ xfs_dir3_block_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -112,6 +110,8 @@ xfs_dir3_block_write_verify( const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", + .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC), + cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) }, .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, .verify_struct = xfs_dir3_block_verify, diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 01162c62ec8f..b7b9ce002cb9 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -252,18 +252,16 @@ xfs_dir3_data_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -339,6 +337,8 @@ xfs_dir3_data_write_verify( const struct xfs_buf_ops xfs_dir3_data_buf_ops = { .name = "xfs_dir3_data", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, .verify_struct = xfs_dir3_data_verify, @@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = { static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .name = "xfs_dir3_data_reada", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 1728a3e6f5cf..9a3767818c50 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int( */ static xfs_failaddr_t xfs_dir3_leaf_verify( - struct xfs_buf *bp, - uint16_t magic) + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_failaddr_t fa; - ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); - - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - uint16_t magic3; - - magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC - : XFS_DIR3_LEAFN_MAGIC; - - if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) - return __this_address; - if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) - return __this_address; - } else { - if (leaf->hdr.info.magic != cpu_to_be16(magic)) - return __this_address; - } + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); } static void -__read_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_read_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; xfs_failaddr_t fa; @@ -185,23 +166,22 @@ __read_verify( !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void -__write_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_write_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -216,60 +196,22 @@ __write_verify( xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } -static xfs_failaddr_t -xfs_dir3_leaf1_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static xfs_failaddr_t -xfs_dir3_leafn_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { .name = "xfs_dir3_leaf1", - .verify_read = xfs_dir3_leaf1_read_verify, - .verify_write = xfs_dir3_leaf1_write_verify, - .verify_struct = xfs_dir3_leaf1_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC), + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .name = "xfs_dir3_leafn", - .verify_read = xfs_dir3_leafn_read_verify, - .verify_write = xfs_dir3_leafn_write_verify, - .verify_struct = xfs_dir3_leafn_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC), + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; int diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index f1bb3434f51c..3b03703c5c3d 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -87,20 +87,18 @@ xfs_dir3_free_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; + if (!xfs_verify_magic(bp, hdr->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) - return __this_address; } /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ @@ -151,6 +149,8 @@ xfs_dir3_free_write_verify( const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .name = "xfs_dir3_free", + .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC), + cpu_to_be32(XFS_DIR3_FREE_MAGIC) }, .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, .verify_struct = xfs_dir3_free_verify, diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index d293f371dd54..fb5bd9a804f6 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify( const struct xfs_buf_ops xfs_dquot_buf_ops = { .name = "xfs_dquot", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, .verify_struct = xfs_dquot_buf_verify_struct, @@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = { const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .name = "xfs_dquot_ra", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 66077a105cbb..79e6c4fb1d8a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -54,7 +54,8 @@ #define XFS_ERRTAG_BUF_LRU_REF 31 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 -#define XFS_ERRTAG_MAX 34 +#define XFS_ERRTAG_IUNLINK_FALLBACK 34 +#define XFS_ERRTAG_MAX 35 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -93,5 +94,6 @@ #define XFS_RANDOM_BUF_LRU_REF 2 #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 +#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9995d5ae380b..9bb3c48843ec 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -916,6 +916,9 @@ static inline uint xfs_dinode_size(int version) /* * Values for di_format + * + * This enum is used in string mapping in xfs_trace.h; please keep the + * TRACE_DEFINE_ENUMs for it up to date. */ typedef enum xfs_dinode_fmt { XFS_DINODE_FMT_DEV, /* xfs_dev_t */ @@ -925,6 +928,13 @@ typedef enum xfs_dinode_fmt { XFS_DINODE_FMT_UUID /* added long ago, but never used */ } xfs_dinode_fmt_t; +#define XFS_INODE_FORMAT_STR \ + { XFS_DINODE_FMT_DEV, "dev" }, \ + { XFS_DINODE_FMT_LOCAL, "local" }, \ + { XFS_DINODE_FMT_EXTENTS, "extent" }, \ + { XFS_DINODE_FMT_BTREE, "btree" }, \ + { XFS_DINODE_FMT_UUID, "uuid" } + /* * Inode minimum and maximum sizes. */ @@ -1083,6 +1093,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) #define XFS_OFFBNO_TO_AGINO(mp,b,o) \ ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) +#define XFS_FSB_TO_INO(mp, b) ((xfs_ino_t)((b) << XFS_INO_OFFSET_BITS(mp))) +#define XFS_AGB_TO_AGINO(mp, b) ((xfs_agino_t)((b) << XFS_INO_OFFSET_BITS(mp))) #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) #define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a8f6db735d5d..fe9898875097 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -288,7 +288,7 @@ xfs_ialloc_inode_init( { struct xfs_buf *fbuf; struct xfs_dinode *free; - int nbufs, blks_per_cluster, inodes_per_cluster; + int nbufs; int version; int i, j; xfs_daddr_t d; @@ -299,9 +299,7 @@ xfs_ialloc_inode_init( * sizes, manipulate the inodes in buffers which are multiples of the * blocks size. */ - blks_per_cluster = xfs_icluster_size_fsb(mp); - inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; - nbufs = length / blks_per_cluster; + nbufs = length / mp->m_blocks_per_cluster; /* * Figure out what version number to use in the inodes we create. If @@ -312,7 +310,7 @@ xfs_ialloc_inode_init( * * For v3 inodes, we also need to write the inode number into the inode, * so calculate the first inode number of the chunk here as - * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * XFS_AGB_TO_AGINO() only works within a filesystem block, not * across multiple filesystem blocks (such as a cluster) and so cannot * be used in the cluster buffer loop below. * @@ -324,8 +322,7 @@ xfs_ialloc_inode_init( */ if (xfs_sb_version_hascrc(&mp->m_sb)) { version = 3; - ino = XFS_AGINO_TO_INO(mp, agno, - XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); /* * log the initialisation that is about to take place as an @@ -345,9 +342,10 @@ xfs_ialloc_inode_init( /* * Get the block. */ - d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); + d = XFS_AGB_TO_DADDR(mp, agno, agbno + + (j * mp->m_blocks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, + mp->m_bsize * mp->m_blocks_per_cluster, XBF_UNMAPPED); if (!fbuf) return -ENOMEM; @@ -355,7 +353,7 @@ xfs_ialloc_inode_init( /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); - for (i = 0; i < inodes_per_cluster; i++) { + for (i = 0; i < mp->m_inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; uint isize = xfs_dinode_size(version); @@ -445,7 +443,7 @@ xfs_align_sparse_ino( return; /* calculate the inode offset and align startino */ - offset = mod << mp->m_sb.sb_inopblog; + offset = XFS_AGB_TO_AGINO(mp, mod); *startino -= offset; /* @@ -641,7 +639,7 @@ xfs_ialloc_ag_alloc( args.tp = tp; args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK; - xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES); + args.oinfo = XFS_RMAP_OINFO_INODES; #ifdef DEBUG /* randomly do sparse inode allocations */ @@ -692,7 +690,7 @@ xfs_ialloc_ag_alloc( * but not to use them in the actual exact allocation. */ args.alignment = 1; - args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1; + args.minalignslop = args.mp->m_cluster_align - 1; /* Allow space for the inode btree to split. */ args.minleft = args.mp->m_in_maxlevels - 1; @@ -727,7 +725,7 @@ xfs_ialloc_ag_alloc( args.alignment = args.mp->m_dalign; isaligned = 1; } else - args.alignment = xfs_ialloc_cluster_alignment(args.mp); + args.alignment = args.mp->m_cluster_align; /* * Need to figure out where to allocate the inode blocks. * Ideally they should be spaced out through the a.g. @@ -756,7 +754,7 @@ xfs_ialloc_ag_alloc( args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); - args.alignment = xfs_ialloc_cluster_alignment(args.mp); + args.alignment = args.mp->m_cluster_align; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -797,7 +795,7 @@ sparse_alloc: if (error) return error; - newlen = args.len << args.mp->m_sb.sb_inopblog; + newlen = XFS_AGB_TO_AGINO(args.mp, args.len); ASSERT(newlen <= XFS_INODES_PER_CHUNK); allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; } @@ -825,7 +823,7 @@ sparse_alloc: /* * Convert the results. */ - newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + newino = XFS_AGB_TO_AGINO(args.mp, args.agbno); if (xfs_inobt_issparse(~allocmask)) { /* @@ -1019,7 +1017,7 @@ xfs_ialloc_ag_select( */ ineed = mp->m_ialloc_min_blks; if (flags && ineed > 1) - ineed += xfs_ialloc_cluster_alignment(mp); + ineed += mp->m_cluster_align; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -1849,14 +1847,12 @@ xfs_difree_inode_chunk( int nextbit; xfs_agblock_t agbno; int contigblk; - struct xfs_owner_info oinfo; DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), - mp->m_ialloc_blks, &oinfo); + mp->m_ialloc_blks, &XFS_RMAP_OINFO_INODES); return; } @@ -1900,7 +1896,7 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno), - contigblk, &oinfo); + contigblk, &XFS_RMAP_OINFO_INODES); /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -2292,7 +2288,6 @@ xfs_imap( xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agnumber_t agno; /* allocation group number */ - int blks_per_cluster; /* num blocks per inode cluster */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ xfs_agblock_t cluster_agbno; /* first block in inode cluster */ int error; /* error code */ @@ -2338,8 +2333,6 @@ xfs_imap( return -EINVAL; } - blks_per_cluster = xfs_icluster_size_fsb(mp); - /* * For bulkstat and handle lookups, we have an untrusted inode number * that we have to verify is valid. We cannot do this just by reading @@ -2359,7 +2352,7 @@ xfs_imap( * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. */ - if (blks_per_cluster == 1) { + if (mp->m_blocks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); @@ -2388,12 +2381,13 @@ xfs_imap( out_map: ASSERT(agbno >= chunk_agbno); cluster_agbno = chunk_agbno + - ((offset_agbno / blks_per_cluster) * blks_per_cluster); + ((offset_agbno / mp->m_blocks_per_cluster) * + mp->m_blocks_per_cluster); offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); - imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap->im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* @@ -2514,7 +2508,7 @@ xfs_agi_verify( /* * Validate the magic number of the agi block. */ - if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + if (!xfs_verify_magic(bp, agi->agi_magicnum)) return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; @@ -2588,6 +2582,7 @@ xfs_agi_write_verify( const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", + .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) }, .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, .verify_struct = xfs_agi_verify, @@ -2726,8 +2721,8 @@ xfs_ialloc_has_inodes_at_extent( xfs_agino_t low; xfs_agino_t high; - low = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno, 0); - high = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno + len, 0) - 1; + low = XFS_AGB_TO_AGINO(cur->bc_mp, bno); + high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; return xfs_ialloc_has_inode_record(cur, low, high, exists); } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 86c50208a143..1080381ff243 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -84,7 +84,7 @@ __xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT); + args.oinfo = XFS_RMAP_OINFO_INOBT; args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); args.minlen = 1; args.maxlen = 1; @@ -124,7 +124,7 @@ xfs_finobt_alloc_block( union xfs_btree_ptr *new, int *stat) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_alloc_block(cur, start, new, stat); return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_METADATA); @@ -136,12 +136,9 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { - struct xfs_owner_info oinfo; - - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, - &oinfo, resv); + &XFS_RMAP_OINFO_INOBT, resv); } STATIC int @@ -157,7 +154,7 @@ xfs_finobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_free_block(cur, bp); return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); } @@ -263,6 +260,9 @@ xfs_inobt_verify( xfs_failaddr_t fa; unsigned int level; + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + /* * During growfs operations, we can't verify the exact owner as the * perag is not fully initialised and hence not attached to the buffer. @@ -273,18 +273,10 @@ xfs_inobt_verify( * but beware of the landmine (i.e. need to check pag->pagi_init) if we * ever do. */ - switch (block->bb_magic) { - case cpu_to_be32(XFS_IBT_CRC_MAGIC): - case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (xfs_sb_version_hascrc(&mp->m_sb)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_IBT_MAGIC): - case cpu_to_be32(XFS_FIBT_MAGIC): - break; - default: - return __this_address; } /* level verification */ @@ -331,6 +323,16 @@ xfs_inobt_write_verify( const struct xfs_buf_ops xfs_inobt_buf_ops = { .name = "xfs_inobt", + .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) }, + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, + .verify_struct = xfs_inobt_verify, +}; + +const struct xfs_buf_ops xfs_finobt_buf_ops = { + .name = "xfs_finobt", + .magic = { cpu_to_be32(XFS_FIBT_MAGIC), + cpu_to_be32(XFS_FIBT_CRC_MAGIC) }, .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, .verify_struct = xfs_inobt_verify, @@ -392,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - .buf_ops = &xfs_inobt_buf_ops, + .buf_ops = &xfs_finobt_buf_ops, .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, @@ -538,15 +540,18 @@ xfs_inobt_rec_check_count( static xfs_extlen_t xfs_inobt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agnumber_t agno) { + xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); + /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_inobt_mxr[0] == 0) return 0; return xfs_btree_calc_size(mp->m_inobt_mnr, - (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / - XFS_INODES_PER_CHUNK); + (uint64_t)agblocks * mp->m_sb.sb_inopblock / + XFS_INODES_PER_CHUNK); } static int @@ -594,7 +599,7 @@ xfs_finobt_calc_reserves( if (error) return error; - *ask += xfs_inobt_max_size(mp); + *ask += xfs_inobt_max_size(mp, agno); *used += tree_len; return 0; } diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 771dd072015d..bc690f2409fa 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -614,16 +614,15 @@ xfs_iext_realloc_root( } /* - * Increment the sequence counter if we are on a COW fork. This allows - * the writeback code to skip looking for a COW extent if the COW fork - * hasn't changed. We use WRITE_ONCE here to ensure the update to the - * sequence counter is seen before the modifications to the extent - * tree itself take effect. + * Increment the sequence counter on extent tree changes. If we are on a COW + * fork, this allows the writeback code to skip looking for a COW extent if the + * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the + * sequence counter is seen before the modifications to the extent tree itself + * take effect. */ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) { - if (state & BMAP_COWFORK) - WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); + WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); } void diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 09d9c8cfa4a0..e021d5133ccb 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -97,10 +97,9 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && - (unlinked_ino == NULLAGINO || - xfs_verify_agino(mp, agno, unlinked_ino)); + xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -147,12 +146,16 @@ xfs_inode_buf_write_verify( const struct xfs_buf_ops xfs_inode_buf_ops = { .name = "xfs_inode", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { - .name = "xxfs_inode_ra", + .name = "xfs_inode_ra", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 60361d2d74a1..00c62ce170d0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -14,7 +14,7 @@ struct xfs_dinode; */ struct xfs_ifork { int if_bytes; /* bytes in if_u1 */ - unsigned int if_seq; /* cow fork mod counter */ + unsigned int if_seq; /* fork mod counter */ struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 1aaa01c97517..6f47ab876d90 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -70,7 +70,7 @@ xfs_refcountbt_alloc_block( args.type = XFS_ALLOCTYPE_NEAR_BNO; args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, xfs_refc_block(args.mp)); - xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC); + args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; args.resv = XFS_AG_RESV_METADATA; @@ -106,15 +106,13 @@ xfs_refcountbt_free_block( struct xfs_buf *agbp = cur->bc_private.a.agbp; struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); - struct xfs_owner_info oinfo; int error; trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo, + error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); if (error) return error; @@ -211,7 +209,7 @@ xfs_refcountbt_verify( xfs_failaddr_t fa; unsigned int level; - if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasreflink(&mp->m_sb)) @@ -266,6 +264,7 @@ xfs_refcountbt_write_verify( const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .name = "xfs_refcountbt", + .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) }, .verify_read = xfs_refcountbt_read_verify, .verify_write = xfs_refcountbt_write_verify, .verify_struct = xfs_refcountbt_verify, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 245af452840e..8ed885507dd8 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -458,21 +458,21 @@ out: */ STATIC int xfs_rmap_unmap( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec ltrec; - uint64_t ltoff; - int error = 0; - int i; - uint64_t owner; - uint64_t offset; - unsigned int flags; - bool ignore_off; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; + bool ignore_off; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) || @@ -653,16 +653,16 @@ out_error: */ int xfs_rmap_free( - struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo) + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *cur; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + int error; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; @@ -710,23 +710,23 @@ xfs_rmap_is_mergeable( */ STATIC int xfs_rmap_map( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec ltrec; - struct xfs_rmap_irec gtrec; - int have_gt; - int have_lt; - int error = 0; - int i; - uint64_t owner; - uint64_t offset; - unsigned int flags = 0; - bool ignore_off; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; + bool ignore_off; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); ASSERT(owner != 0); @@ -890,16 +890,16 @@ out_error: */ int xfs_rmap_alloc( - struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo) + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *cur; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + int error; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; @@ -929,16 +929,16 @@ xfs_rmap_alloc( */ STATIC int xfs_rmap_convert( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec r[4]; /* neighbor extent entries */ - /* left is 0, right is 1, prev is 2 */ - /* new is 3 */ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, */ + /* prev is 2, new is 3 */ uint64_t owner; uint64_t offset; uint64_t new_endoff; @@ -1354,16 +1354,16 @@ done: */ STATIC int xfs_rmap_convert_shared( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec r[4]; /* neighbor extent entries */ - /* left is 0, right is 1, prev is 2 */ - /* new is 3 */ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, */ + /* prev is 2, new is 3 */ uint64_t owner; uint64_t offset; uint64_t new_endoff; @@ -1743,20 +1743,20 @@ done: */ STATIC int xfs_rmap_unmap_shared( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec ltrec; - uint64_t ltoff; - int error = 0; - int i; - uint64_t owner; - uint64_t offset; - unsigned int flags; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) @@ -1905,22 +1905,22 @@ out_error: */ STATIC int xfs_rmap_map_shared( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec ltrec; - struct xfs_rmap_irec gtrec; - int have_gt; - int have_lt; - int error = 0; - int i; - uint64_t owner; - uint64_t offset; - unsigned int flags = 0; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) @@ -2459,18 +2459,18 @@ xfs_rmap_has_record( */ int xfs_rmap_record_exists( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo, - bool *has_rmap) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + bool *has_rmap) { - uint64_t owner; - uint64_t offset; - unsigned int flags; - int has_record; - struct xfs_rmap_irec irec; - int error; + uint64_t owner; + uint64_t offset; + unsigned int flags; + int has_record; + struct xfs_rmap_irec irec; + int error; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || @@ -2530,7 +2530,7 @@ xfs_rmap_has_other_keys( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - struct xfs_owner_info *oinfo, + const struct xfs_owner_info *oinfo, bool *has_rmap) { struct xfs_rmap_irec low = {0}; @@ -2550,3 +2550,31 @@ xfs_rmap_has_other_keys( *has_rmap = rks.has_rmap; return error; } + +const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { + .oi_owner = XFS_RMAP_OWN_NULL, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER = { + .oi_owner = XFS_RMAP_OWN_UNKNOWN, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_FS = { + .oi_owner = XFS_RMAP_OWN_FS, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_LOG = { + .oi_owner = XFS_RMAP_OWN_LOG, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_AG = { + .oi_owner = XFS_RMAP_OWN_AG, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_INOBT = { + .oi_owner = XFS_RMAP_OWN_INOBT, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_INODES = { + .oi_owner = XFS_RMAP_OWN_INODES, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_REFC = { + .oi_owner = XFS_RMAP_OWN_REFC, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_COW = { + .oi_owner = XFS_RMAP_OWN_COW, +}; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 157dc722ad35..e21ed0294e5c 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -7,16 +7,6 @@ #define __XFS_RMAP_H__ static inline void -xfs_rmap_ag_owner( - struct xfs_owner_info *oi, - uint64_t owner) -{ - oi->oi_owner = owner; - oi->oi_offset = 0; - oi->oi_flags = 0; -} - -static inline void xfs_rmap_ino_bmbt_owner( struct xfs_owner_info *oi, xfs_ino_t ino, @@ -43,27 +33,13 @@ xfs_rmap_ino_owner( oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK; } -static inline void -xfs_rmap_skip_owner_update( - struct xfs_owner_info *oi) -{ - xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL); -} - static inline bool xfs_rmap_should_skip_owner_update( - struct xfs_owner_info *oi) + const struct xfs_owner_info *oi) { return oi->oi_owner == XFS_RMAP_OWN_NULL; } -static inline void -xfs_rmap_any_owner_update( - struct xfs_owner_info *oi) -{ - xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN); -} - /* Reverse mapping functions. */ struct xfs_buf; @@ -103,12 +79,12 @@ xfs_rmap_irec_offset_unpack( static inline void xfs_owner_info_unpack( - struct xfs_owner_info *oinfo, - uint64_t *owner, - uint64_t *offset, - unsigned int *flags) + const struct xfs_owner_info *oinfo, + uint64_t *owner, + uint64_t *offset, + unsigned int *flags) { - unsigned int r = 0; + unsigned int r = 0; *owner = oinfo->oi_owner; *offset = oinfo->oi_offset; @@ -137,10 +113,10 @@ xfs_owner_info_pack( int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - struct xfs_owner_info *oinfo); + const struct xfs_owner_info *oinfo); int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - struct xfs_owner_info *oinfo); + const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner, uint64_t offset, @@ -218,11 +194,21 @@ int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec, int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, struct xfs_owner_info *oinfo, + xfs_extlen_t len, const struct xfs_owner_info *oinfo, bool *has_rmap); int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, struct xfs_owner_info *oinfo, + xfs_extlen_t len, const struct xfs_owner_info *oinfo, bool *has_rmap); int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap); +extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE; +extern const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER; +extern const struct xfs_owner_info XFS_RMAP_OINFO_FS; +extern const struct xfs_owner_info XFS_RMAP_OINFO_LOG; +extern const struct xfs_owner_info XFS_RMAP_OINFO_AG; +extern const struct xfs_owner_info XFS_RMAP_OINFO_INOBT; +extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES; +extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC; +extern const struct xfs_owner_info XFS_RMAP_OINFO_COW; + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index f79cf040d745..5738e11055e6 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -310,7 +310,7 @@ xfs_rmapbt_verify( * from the on disk AGF. Again, we can only check against maximum limits * in this case. */ - if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) @@ -365,6 +365,7 @@ xfs_rmapbt_write_verify( const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .name = "xfs_rmapbt", + .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, .verify_read = xfs_rmapbt_read_verify, .verify_write = xfs_rmapbt_write_verify, .verify_struct = xfs_rmapbt_verify, diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index b228c821bae6..eaaff67e9626 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -505,6 +505,12 @@ xfs_rtmodify_summary_int( uint first = (uint)((char *)sp - (char *)bp->b_addr); *sp += delta; + if (mp->m_rsum_cache) { + if (*sp == 0 && log == mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno]++; + if (*sp != 0 && log < mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; + } xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); } if (sum) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index b5a82acd7dfe..77a3a4085de3 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -225,10 +225,11 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); uint32_t agcount = 0; uint32_t rem; - if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; } @@ -781,12 +782,14 @@ out_error: const struct xfs_buf_ops xfs_sb_buf_ops = { .name = "xfs_sb", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .name = "xfs_sb_quiet", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; @@ -874,7 +877,7 @@ xfs_initialize_perag_data( uint64_t bfreelst = 0; uint64_t btree = 0; uint64_t fdblocks; - int error; + int error = 0; for (index = 0; index < agcount; index++) { /* diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 1c5debe748f0..4e909791aeac 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; @@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_finobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 95374ab2dee7..a0ccc253c43d 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -95,7 +95,7 @@ xfs_symlink_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -159,6 +159,7 @@ xfs_symlink_write_verify( const struct xfs_buf_ops xfs_symlink_buf_ops = { .name = "xfs_symlink", + .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) }, .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, .verify_struct = xfs_symlink_verify, @@ -199,7 +200,10 @@ xfs_symlink_local_to_remote( ifp->if_bytes - 1); } -/* Verify the consistency of an inline symlink. */ +/* + * Verify the in-memory consistency of an inline symlink data fork. This + * does not do on-disk format checks. + */ xfs_failaddr_t xfs_symlink_shortform_verify( struct xfs_inode *ip) @@ -215,9 +219,12 @@ xfs_symlink_shortform_verify( size = ifp->if_bytes; endp = sfp + size; - /* Zero length symlinks can exist while we're deleting a remote one. */ - if (size == 0) - return NULL; + /* + * Zero length symlinks should never occur in memory as they are + * never alllowed to exist on disk. + */ + if (!size) + return __this_address; /* No negative sizes or overly long symlink targets. */ if (size < 0 || size > XFS_SYMLINK_MAXLEN) diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 33a5ca346baf..de310712dd6d 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -87,16 +87,15 @@ xfs_agino_range( * Calculate the first inode, which will be in the first * cluster-aligned block after the AGFL. */ - bno = round_up(XFS_AGFL_BLOCK(mp) + 1, - xfs_ialloc_cluster_alignment(mp)); - *first = XFS_OFFBNO_TO_AGINO(mp, bno, 0); + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, mp->m_cluster_align); + *first = XFS_AGB_TO_AGINO(mp, bno); /* * Calculate the last inode, which will be at the end of the * last (aligned) cluster that can be allocated in the AG. */ - bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp)); - *last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1; + bno = round_down(eoag, mp->m_cluster_align); + *last = XFS_AGB_TO_AGINO(mp, bno) - 1; } /* @@ -117,6 +116,19 @@ xfs_verify_agino( } /* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +bool +xfs_verify_agino_or_null( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); +} + +/* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. */ @@ -205,3 +217,14 @@ xfs_verify_icount( xfs_icount_range(mp, &min, &max); return icount >= min && icount <= max; } + +/* Sanity-checking of dir/attr block offsets. */ +bool +xfs_verify_dablk( + struct xfs_mount *mp, + xfs_fileoff_t dabno) +{ + xfs_dablk_t max_dablk = -1U; + + return dabno <= max_dablk; +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b9e6c89284c3..c5a25403b4db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -100,15 +100,37 @@ typedef void * xfs_failaddr_t; */ #define MAXNAMELEN 256 +/* + * This enum is used in string mapping in xfs_trace.h; please keep the + * TRACE_DEFINE_ENUMs for it up to date. + */ typedef enum { XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi } xfs_lookup_t; +#define XFS_AG_BTREE_CMP_FORMAT_STR \ + { XFS_LOOKUP_EQi, "eq" }, \ + { XFS_LOOKUP_LEi, "le" }, \ + { XFS_LOOKUP_GEi, "ge" } + +/* + * This enum is used in string mapping in xfs_trace.h and scrub/trace.h; + * please keep the TRACE_DEFINE_ENUMs for it up to date. + */ typedef enum { XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX } xfs_btnum_t; +#define XFS_BTNUM_STRINGS \ + { XFS_BTNUM_BNOi, "bnobt" }, \ + { XFS_BTNUM_CNTi, "cntbt" }, \ + { XFS_BTNUM_RMAPi, "rmapbt" }, \ + { XFS_BTNUM_BMAPi, "bmbt" }, \ + { XFS_BTNUM_INOi, "inobt" }, \ + { XFS_BTNUM_FINOi, "finobt" }, \ + { XFS_BTNUM_REFCi, "refcbt" } + struct xfs_name { const unsigned char *name; int len; @@ -161,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last); bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino); +bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); +bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 3068a9382feb..ddf06bfaa29d 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -32,7 +32,6 @@ xchk_superblock_xref( struct xfs_scrub *sc, struct xfs_buf *bp) { - struct xfs_owner_info oinfo; struct xfs_mount *mp = sc->mp; xfs_agnumber_t agno = sc->sm->sm_agno; xfs_agblock_t agbno; @@ -49,8 +48,7 @@ xchk_superblock_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); /* scrub teardown will take care of sc->sa for us */ @@ -401,7 +399,7 @@ xchk_agf_xref_cntbt( if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) return; if (!have) { - if (agf->agf_freeblks != be32_to_cpu(0)) + if (agf->agf_freeblks != cpu_to_be32(0)) xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); return; } @@ -484,7 +482,6 @@ STATIC void xchk_agf_xref( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; struct xfs_mount *mp = sc->mp; xfs_agblock_t agbno; int error; @@ -502,8 +499,7 @@ xchk_agf_xref( xchk_agf_xref_freeblks(sc); xchk_agf_xref_cntbt(sc); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_agf_xref_btreeblks(sc); xchk_xref_is_not_shared(sc, agbno, 1); xchk_agf_xref_refcblks(sc); @@ -598,7 +594,6 @@ out: /* AGFL */ struct xchk_agfl_info { - struct xfs_owner_info oinfo; unsigned int sz_entries; unsigned int nr_entries; xfs_agblock_t *entries; @@ -609,15 +604,14 @@ struct xchk_agfl_info { STATIC void xchk_agfl_block_xref( struct xfs_scrub *sc, - xfs_agblock_t agbno, - struct xfs_owner_info *oinfo) + xfs_agblock_t agbno) { if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG); xchk_xref_is_not_shared(sc, agbno, 1); } @@ -638,7 +632,7 @@ xchk_agfl_block( else xchk_block_set_corrupt(sc, sc->sa.agfl_bp); - xchk_agfl_block_xref(sc, agbno, priv); + xchk_agfl_block_xref(sc, agbno); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return XFS_BTREE_QUERY_RANGE_ABORT; @@ -662,7 +656,6 @@ STATIC void xchk_agfl_xref( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; struct xfs_mount *mp = sc->mp; xfs_agblock_t agbno; int error; @@ -678,8 +671,7 @@ xchk_agfl_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); /* @@ -732,7 +724,6 @@ xchk_agfl( } /* Check the blocks in the AGFL. */ - xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG); error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), sc->sa.agfl_bp, xchk_agfl_block, &sai); if (error == XFS_BTREE_QUERY_RANGE_ABORT) { @@ -791,7 +782,6 @@ STATIC void xchk_agi_xref( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; struct xfs_mount *mp = sc->mp; xfs_agblock_t agbno; int error; @@ -808,8 +798,7 @@ xchk_agi_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); xchk_agi_xref_icounts(sc); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); /* scrub teardown will take care of sc->sa for us */ @@ -875,19 +864,17 @@ xchk_agi( /* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); agino = be32_to_cpu(agi->agi_dirino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (agino == NULLAGINO) - continue; - if (!xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index f7568a4b5fe5..64e31f87d490 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -341,23 +341,19 @@ xrep_agf( struct xrep_find_ag_btree fab[XREP_AGF_MAX] = { [XREP_AGF_BNOBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTB_CRC_MAGIC, + .buf_ops = &xfs_bnobt_buf_ops, }, [XREP_AGF_CNTBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTC_CRC_MAGIC, + .buf_ops = &xfs_cntbt_buf_ops, }, [XREP_AGF_RMAPBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_rmapbt_buf_ops, - .magic = XFS_RMAP_CRC_MAGIC, }, [XREP_AGF_REFCOUNTBT] = { .rmap_owner = XFS_RMAP_OWN_REFC, .buf_ops = &xfs_refcountbt_buf_ops, - .magic = XFS_REFC_CRC_MAGIC, }, [XREP_AGF_END] = { .buf_ops = NULL, @@ -646,7 +642,6 @@ int xrep_agfl( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; struct xfs_bitmap agfl_extents; struct xfs_mount *mp = sc->mp; struct xfs_buf *agf_bp; @@ -708,8 +703,8 @@ xrep_agfl( goto err; /* Dump any AGFL overflow. */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); - return xrep_reap_extents(sc, &agfl_extents, &oinfo, XFS_AG_RESV_AGFL); + return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + XFS_AG_RESV_AGFL); err: xfs_bitmap_destroy(&agfl_extents); return error; @@ -876,12 +871,10 @@ xrep_agi( [XREP_AGI_INOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_IBT_CRC_MAGIC, }, [XREP_AGI_FINOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_FIBT_CRC_MAGIC, + .buf_ops = &xfs_finobt_buf_ops, }, [XREP_AGI_END] = { .buf_ops = NULL diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 376bcb585ae6..44883e9112ad 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -125,12 +125,10 @@ xchk_allocbt( struct xfs_scrub *sc, xfs_btnum_t which) { - struct xfs_owner_info oinfo; struct xfs_btree_cur *cur; - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &oinfo, NULL); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL); } int diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 81d5e90547a1..dce74ec57038 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -82,12 +82,23 @@ xchk_xattr_listent( sx = container_of(context, struct xchk_xattr, context); + if (xchk_should_terminate(sx->sc, &error)) { + context->seen_enough = 1; + return; + } + if (flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ xchk_ino_set_preen(sx->sc, context->dp->i_ino); return; } + /* Does this name make sense? */ + if (!xfs_attr_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + return; + } + args.flags = ATTR_KERNOTIME; if (flags & XFS_ATTR_ROOT) args.flags |= ATTR_ROOT; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index e1d11f3223e3..a703cd58a90e 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -281,6 +281,31 @@ xchk_bmap_extent_xref( xchk_ag_free(info->sc, &info->sc->sa); } +/* + * Directories and attr forks should never have blocks that can't be addressed + * by a xfs_dablk_t. + */ +STATIC void +xchk_bmap_dirattr_extent( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t off; + + if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK) + return; + + if (!xfs_verify_dablk(mp, irec->br_startoff)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + off = irec->br_startoff + irec->br_blockcount - 1; + if (!xfs_verify_dablk(mp, off)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, off); +} + /* Scrub a single extent record. */ STATIC int xchk_bmap_extent( @@ -305,6 +330,8 @@ xchk_bmap_extent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + xchk_bmap_dirattr_extent(ip, info, irec); + /* There should never be a "hole" extent in either extent list. */ if (irec->br_startblock == HOLESTARTBLOCK) xchk_fblock_set_corrupt(info->sc, info->whichfork, diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 4ae959f7ad2c..6f94d1f7322d 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -583,31 +583,32 @@ xchk_btree_block_keys( */ int xchk_btree( - struct xfs_scrub *sc, - struct xfs_btree_cur *cur, - xchk_btree_rec_fn scrub_fn, - struct xfs_owner_info *oinfo, - void *private) + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + xchk_btree_rec_fn scrub_fn, + const struct xfs_owner_info *oinfo, + void *private) { - struct xchk_btree bs = { NULL }; - union xfs_btree_ptr ptr; - union xfs_btree_ptr *pp; - union xfs_btree_rec *recp; - struct xfs_btree_block *block; - int level; - struct xfs_buf *bp; - struct check_owner *co; - struct check_owner *n; - int i; - int error = 0; + struct xchk_btree bs = { + .cur = cur, + .scrub_rec = scrub_fn, + .oinfo = oinfo, + .firstrec = true, + .private = private, + .sc = sc, + }; + union xfs_btree_ptr ptr; + union xfs_btree_ptr *pp; + union xfs_btree_rec *recp; + struct xfs_btree_block *block; + int level; + struct xfs_buf *bp; + struct check_owner *co; + struct check_owner *n; + int i; + int error = 0; /* Initialize scrub state */ - bs.cur = cur; - bs.scrub_rec = scrub_fn; - bs.oinfo = oinfo; - bs.firstrec = true; - bs.private = private; - bs.sc = sc; for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) bs.firstkey[i] = true; INIT_LIST_HEAD(&bs.to_check); diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index aada763cd006..5572e475f8ed 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -31,21 +31,21 @@ typedef int (*xchk_btree_rec_fn)( struct xchk_btree { /* caller-provided scrub state */ - struct xfs_scrub *sc; - struct xfs_btree_cur *cur; - xchk_btree_rec_fn scrub_rec; - struct xfs_owner_info *oinfo; - void *private; + struct xfs_scrub *sc; + struct xfs_btree_cur *cur; + xchk_btree_rec_fn scrub_rec; + const struct xfs_owner_info *oinfo; + void *private; /* internal scrub state */ - union xfs_btree_rec lastrec; - bool firstrec; - union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS]; - bool firstkey[XFS_BTREE_MAXLEVELS]; - struct list_head to_check; + union xfs_btree_rec lastrec; + bool firstrec; + union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS]; + bool firstkey[XFS_BTREE_MAXLEVELS]; + struct list_head to_check; }; int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur, - xchk_btree_rec_fn scrub_fn, struct xfs_owner_info *oinfo, + xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo, void *private); #endif /* __XFS_SCRUB_BTREE_H__ */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 346b02abccf7..0c54ff55b901 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -313,8 +313,8 @@ xchk_set_incomplete( */ struct xchk_rmap_ownedby_info { - struct xfs_owner_info *oinfo; - xfs_filblks_t *blocks; + const struct xfs_owner_info *oinfo; + xfs_filblks_t *blocks; }; STATIC int @@ -347,15 +347,15 @@ int xchk_count_rmap_ownedby_ag( struct xfs_scrub *sc, struct xfs_btree_cur *cur, - struct xfs_owner_info *oinfo, + const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks) { - struct xchk_rmap_ownedby_info sroi; + struct xchk_rmap_ownedby_info sroi = { + .oinfo = oinfo, + .blocks = blocks, + }; - sroi.oinfo = oinfo; *blocks = 0; - sroi.blocks = blocks; - return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec, &sroi); } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 2d4324d12f9a..e26a430bd466 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -116,7 +116,7 @@ int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno, void xchk_ag_btcur_free(struct xchk_ag *sa); int xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, - struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); + const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); int xchk_setup_ag_btree(struct xfs_scrub *sc, struct xfs_inode *ip, bool force_log); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index cd3e4d768a18..a38a22785a1a 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -129,6 +129,12 @@ xchk_dir_actor( goto out; } + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + goto out; + } + if (!strncmp(".", name, namelen)) { /* If this is "." then check that the inum matches the dir. */ if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 224dba937492..700114f79a7d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -44,6 +44,17 @@ xchk_setup_ag_iallocbt( /* Inode btree scrubber. */ +struct xchk_iallocbt { + /* Number of inodes we see while scanning inobt. */ + unsigned long long inodes; + + /* Expected next startino, for big block filesystems. */ + xfs_agino_t next_startino; + + /* Expected end of the current inode cluster. */ + xfs_agino_t next_cluster_ino; +}; + /* * If we're checking the finobt, cross-reference with the inobt. * Otherwise we're checking the inobt; if there is an finobt, make sure @@ -82,15 +93,12 @@ xchk_iallocbt_chunk_xref( xfs_agblock_t agbno, xfs_extlen_t len) { - struct xfs_owner_info oinfo; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; xchk_xref_is_used_space(sc, agbno, len); xchk_iallocbt_chunk_xref_other(sc, irec, agino); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); - xchk_xref_is_owned_by(sc, agbno, len, &oinfo); + xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES); xchk_xref_is_not_shared(sc, agbno, len); } @@ -126,41 +134,57 @@ xchk_iallocbt_freecount( return hweight64(freemask); } -/* Check a particular inode with ir_free. */ +/* + * Check that an inode's allocation status matches ir_free in the inobt + * record. First we try querying the in-core inode state, and if the inode + * isn't loaded we examine the on-disk inode directly. + * + * Since there can be 1:M and M:1 mappings between inobt records and inode + * clusters, we pass in the inode location information as an inobt record; + * the index of an inode cluster within the inobt record (as well as the + * cluster buffer itself); and the index of the inode within the cluster. + * + * @irec is the inobt record. + * @irec_ino is the inode offset from the start of the record. + * @dip is the on-disk inode. + */ STATIC int -xchk_iallocbt_check_cluster_freemask( +xchk_iallocbt_check_cluster_ifree( struct xchk_btree *bs, - xfs_ino_t fsino, - xfs_agino_t chunkino, - xfs_agino_t clusterino, struct xfs_inobt_rec_incore *irec, - struct xfs_buf *bp) + unsigned int irec_ino, + struct xfs_dinode *dip) { - struct xfs_dinode *dip; struct xfs_mount *mp = bs->cur->bc_mp; - bool inode_is_free = false; + xfs_ino_t fsino; + xfs_agino_t agino; + bool irec_free; + bool ino_inuse; bool freemask_ok; - bool inuse; int error = 0; if (xchk_should_terminate(bs->sc, &error)) return error; - dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); + /* + * Given an inobt record and the offset of an inode from the start of + * the record, compute which fs inode we're talking about. + */ + agino = irec->ir_startino + irec_ino; + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || - (dip->di_version >= 3 && - be64_to_cpu(dip->di_ino) != fsino + clusterino)) { + (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; } - if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)) - inode_is_free = true; - error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, - fsino + clusterino, &inuse); + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino, + &ino_inuse); if (error == -ENODATA) { /* Not cached, just read the disk buffer */ - freemask_ok = inode_is_free ^ !!(dip->di_mode); + freemask_ok = irec_free ^ !!(dip->di_mode); if (!bs->sc->try_harder && !freemask_ok) return -EDEADLOCK; } else if (error < 0) { @@ -172,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask( goto out; } else { /* Inode is all there. */ - freemask_ok = inode_is_free ^ inuse; + freemask_ok = irec_free ^ ino_inuse; } if (!freemask_ok) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); @@ -180,89 +204,221 @@ out: return 0; } -/* Make sure the free mask is consistent with what the inodes think. */ +/* + * Check that the holemask and freemask of a hypothetical inode cluster match + * what's actually on disk. If sparse inodes are enabled, the cluster does + * not actually have to map to inodes if the corresponding holemask bit is set. + * + * @cluster_base is the first inode in the cluster within the @irec. + */ STATIC int -xchk_iallocbt_check_freemask( +xchk_iallocbt_check_cluster( struct xchk_btree *bs, - struct xfs_inobt_rec_incore *irec) + struct xfs_inobt_rec_incore *irec, + unsigned int cluster_base) { - struct xfs_owner_info oinfo; struct xfs_imap imap; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_dinode *dip; - struct xfs_buf *bp; - xfs_ino_t fsino; - xfs_agino_t nr_inodes; - xfs_agino_t agino; - xfs_agino_t chunkino; - xfs_agino_t clusterino; + struct xfs_buf *cluster_bp; + unsigned int nr_inodes; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t agbno; - int blks_per_cluster; - uint16_t holemask; + unsigned int cluster_index; + uint16_t cluster_mask = 0; uint16_t ir_holemask; int error = 0; - /* Make sure the freemask matches the inode records. */ - blks_per_cluster = xfs_icluster_size_fsb(mp); - nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); - - for (agino = irec->ir_startino; - agino < irec->ir_startino + XFS_INODES_PER_CHUNK; - agino += blks_per_cluster * mp->m_sb.sb_inopblock) { - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); - chunkino = agino - irec->ir_startino; - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - - /* Compute the holemask mask for this cluster. */ - for (clusterino = 0, holemask = 0; clusterino < nr_inodes; - clusterino += XFS_INODES_PER_HOLEMASK_BIT) - holemask |= XFS_INOBT_MASK((chunkino + clusterino) / - XFS_INODES_PER_HOLEMASK_BIT); - - /* The whole cluster must be a hole or not a hole. */ - ir_holemask = (irec->ir_holemask & holemask); - if (ir_holemask != holemask && ir_holemask != 0) { + nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_inodes_per_cluster); + + /* Map this inode cluster */ + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base); + + /* Compute a bitmask for this cluster that can be used for holemask. */ + for (cluster_index = 0; + cluster_index < nr_inodes; + cluster_index += XFS_INODES_PER_HOLEMASK_BIT) + cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* + * Map the first inode of this cluster to a buffer and offset. + * Be careful about inobt records that don't align with the start of + * the inode buffer when block sizes are large enough to hold multiple + * inode chunks. When this happens, cluster_base will be zero but + * ir_startino can be large enough to make im_boffset nonzero. + */ + ir_holemask = (irec->ir_holemask & cluster_mask); + imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); + imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino); + + if (imap.im_boffset != 0 && cluster_base != 0) { + ASSERT(imap.im_boffset == 0 || cluster_base == 0); + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, + imap.im_blkno, imap.im_len, cluster_base, nr_inodes, + cluster_mask, ir_holemask, + XFS_INO_TO_OFFSET(mp, irec->ir_startino + + cluster_base)); + + /* The whole cluster must be a hole or not a hole. */ + if (ir_holemask != cluster_mask && ir_holemask != 0) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) { + xchk_xref_is_not_owned_by(bs->sc, agbno, + mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + return 0; + } + + xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + + /* Grab the inode cluster buffer. */ + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, + 0, 0); + if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) + return error; + + /* Check free status of each inode within this cluster. */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + struct xfs_dinode *dip; + + if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - continue; + break; } - /* If any part of this is a hole, skip it. */ - if (ir_holemask) { - xchk_xref_is_not_owned_by(bs->sc, agbno, - blks_per_cluster, &oinfo); - continue; + dip = xfs_buf_offset(cluster_bp, imap.im_boffset); + error = xchk_iallocbt_check_cluster_ifree(bs, irec, + cluster_base + cluster_index, dip); + if (error) + break; + imap.im_boffset += mp->m_sb.sb_inodesize; + } + + xfs_trans_brelse(bs->cur->bc_tp, cluster_bp); + return error; +} + +/* + * For all the inode clusters that could map to this inobt record, make sure + * that the holemask makes sense and that the allocation status of each inode + * matches the freemask. + */ +STATIC int +xchk_iallocbt_check_clusters( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + unsigned int cluster_base; + int error = 0; + + /* + * For the common case where this inobt record maps to multiple inode + * clusters this will call _check_cluster for each cluster. + * + * For the case that multiple inobt records map to a single cluster, + * this will call _check_cluster once. + */ + for (cluster_base = 0; + cluster_base < XFS_INODES_PER_CHUNK; + cluster_base += bs->sc->mp->m_inodes_per_cluster) { + error = xchk_iallocbt_check_cluster(bs, irec, cluster_base); + if (error) + break; + } + + return error; +} + +/* + * Make sure this inode btree record is aligned properly. Because a fs block + * contains multiple inodes, we check that the inobt record is aligned to the + * correct inode, not just the correct block on disk. This results in a finer + * grained corruption check. + */ +STATIC void +xchk_iallocbt_rec_alignment( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = bs->sc->mp; + struct xchk_iallocbt *iabt = bs->private; + + /* + * finobt records have different positioning requirements than inobt + * records: each finobt record must have a corresponding inobt record. + * That is checked in the xref function, so for now we only catch the + * obvious case where the record isn't at all aligned properly. + * + * Note that if a fs block contains more than a single chunk of inodes, + * we will have finobt records only for those chunks containing free + * inodes, and therefore expect chunk alignment of finobt records. + * Otherwise, we expect that the finobt record is aligned to the + * cluster alignment as told by the superblock. + */ + if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + unsigned int imask; + + imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_cluster_align_inodes) - 1; + if (irec->ir_startino & imask) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + if (iabt->next_startino != NULLAGINO) { + /* + * We're midway through a cluster of inodes that is mapped by + * multiple inobt records. Did we get the record for the next + * irec in the sequence? + */ + if (irec->ir_startino != iabt->next_startino) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; } - xchk_xref_is_owned_by(bs->sc, agbno, blks_per_cluster, - &oinfo); - - /* Grab the inode cluster buffer. */ - imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, - agbno); - imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); - imap.im_boffset = 0; - - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, - &dip, &bp, 0, 0); - if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, - &error)) - continue; - - /* Which inodes are free? */ - for (clusterino = 0; clusterino < nr_inodes; clusterino++) { - error = xchk_iallocbt_check_cluster_freemask(bs, - fsino, chunkino, clusterino, irec, bp); - if (error) { - xfs_trans_brelse(bs->cur->bc_tp, bp); - return error; - } + iabt->next_startino += XFS_INODES_PER_CHUNK; + + /* Are we done with the cluster? */ + if (iabt->next_startino >= iabt->next_cluster_ino) { + iabt->next_startino = NULLAGINO; + iabt->next_cluster_ino = NULLAGINO; } + return; + } + + /* inobt records must be aligned to cluster and inoalignmnt size. */ + if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } - xfs_trans_brelse(bs->cur->bc_tp, bp); + if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; } - return error; + if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK) + return; + + /* + * If this is the start of an inode cluster that can be mapped by + * multiple inobt records, the next inobt record must follow exactly + * after this one. + */ + iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK; + iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster; } /* Scrub an inobt/finobt record. */ @@ -272,12 +428,11 @@ xchk_iallocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_filblks_t *inode_blocks = bs->private; + struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agino_t agino; - xfs_agblock_t agbno; xfs_extlen_t len; int holecount; int i; @@ -304,14 +459,11 @@ xchk_iallocbt_rec( goto out; } - /* Make sure this record is aligned to cluster and inoalignmnt size. */ - agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); - if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) || - (agbno & (xfs_icluster_size_fsb(mp) - 1))) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_iallocbt_rec_alignment(bs, &irec); + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; - *inode_blocks += XFS_B_TO_FSB(mp, - irec.ir_count * mp->m_sb.sb_inodesize); + iabt->inodes += irec.ir_count; /* Handle non-sparse inodes */ if (!xfs_inobt_issparse(irec.ir_holemask)) { @@ -322,7 +474,7 @@ xchk_iallocbt_rec( if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) goto out; - goto check_freemask; + goto check_clusters; } /* Check each chunk of a sparse inode cluster. */ @@ -348,8 +500,8 @@ xchk_iallocbt_rec( holecount + irec.ir_count != XFS_INODES_PER_CHUNK) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); -check_freemask: - error = xchk_iallocbt_check_freemask(bs, &irec); +check_clusters: + error = xchk_iallocbt_check_clusters(bs, &irec); if (error) goto out; @@ -366,7 +518,6 @@ xchk_iallocbt_xref_rmap_btreeblks( struct xfs_scrub *sc, int which) { - struct xfs_owner_info oinfo; xfs_filblks_t blocks; xfs_extlen_t inobt_blocks = 0; xfs_extlen_t finobt_blocks = 0; @@ -388,9 +539,8 @@ xchk_iallocbt_xref_rmap_btreeblks( return; } - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); - error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo, - &blocks); + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_INOBT, &blocks); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; if (blocks != inobt_blocks + finobt_blocks) @@ -405,21 +555,21 @@ STATIC void xchk_iallocbt_xref_rmap_inodes( struct xfs_scrub *sc, int which, - xfs_filblks_t inode_blocks) + unsigned long long inodes) { - struct xfs_owner_info oinfo; xfs_filblks_t blocks; + xfs_filblks_t inode_blocks; int error; if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) return; /* Check that we saw as many inode blocks as the rmap knows about. */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); - error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo, - &blocks); + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_INODES, &blocks); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; + inode_blocks = XFS_B_TO_FSB(sc->mp, inodes * sc->mp->m_sb.sb_inodesize); if (blocks != inode_blocks) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } @@ -431,14 +581,16 @@ xchk_iallocbt( xfs_btnum_t which) { struct xfs_btree_cur *cur; - struct xfs_owner_info oinfo; - xfs_filblks_t inode_blocks = 0; + struct xchk_iallocbt iabt = { + .inodes = 0, + .next_startino = NULLAGINO, + .next_cluster_ino = NULLAGINO, + }; int error; - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; - error = xchk_btree(sc, cur, xchk_iallocbt_rec, &oinfo, - &inode_blocks); + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, + &iabt); if (error) return error; @@ -452,7 +604,7 @@ xchk_iallocbt( * to inode chunks with free inodes. */ if (which == XFS_BTNUM_INO) - xchk_iallocbt_xref_rmap_inodes(sc, which, inode_blocks); + xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes); return error; } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index e386c9b0b4ab..e213efc194a1 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -509,7 +509,6 @@ xchk_inode_xref( xfs_ino_t ino, struct xfs_dinode *dip) { - struct xfs_owner_info oinfo; xfs_agnumber_t agno; xfs_agblock_t agbno; int error; @@ -526,8 +525,7 @@ xchk_inode_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_inode_xref_finobt(sc, ino); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); xchk_xref_is_not_shared(sc, agbno, 1); xchk_inode_xref_bmap(sc, dip); diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index e8c82b026083..708b4158eb90 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -383,7 +383,6 @@ xchk_refcountbt_rec( STATIC void xchk_refcount_xref_rmap( struct xfs_scrub *sc, - struct xfs_owner_info *oinfo, xfs_filblks_t cow_blocks) { xfs_extlen_t refcbt_blocks = 0; @@ -397,17 +396,16 @@ xchk_refcount_xref_rmap( error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks); if (!xchk_btree_process_error(sc, sc->sa.refc_cur, 0, &error)) return; - error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo, - &blocks); + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_REFC, &blocks); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; if (blocks != refcbt_blocks) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); /* Check that we saw as many cow blocks as the rmap knows about. */ - xfs_rmap_ag_owner(oinfo, XFS_RMAP_OWN_COW); - error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo, - &blocks); + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_COW, &blocks); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; if (blocks != cow_blocks) @@ -419,17 +417,15 @@ int xchk_refcountbt( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; xfs_agblock_t cow_blocks = 0; int error; - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec, - &oinfo, &cow_blocks); + &XFS_RMAP_OINFO_REFC, &cow_blocks); if (error) return error; - xchk_refcount_xref_rmap(sc, &oinfo, cow_blocks); + xchk_refcount_xref_rmap(sc, cow_blocks); return 0; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 4fc0a5ea7673..f28f4bad317b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -299,14 +299,14 @@ xrep_calc_ag_resblks( /* Allocate a block in an AG. */ int xrep_alloc_ag_block( - struct xfs_scrub *sc, - struct xfs_owner_info *oinfo, - xfs_fsblock_t *fsbno, - enum xfs_ag_resv_type resv) + struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, + xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv) { - struct xfs_alloc_arg args = {0}; - xfs_agblock_t bno; - int error; + struct xfs_alloc_arg args = {0}; + xfs_agblock_t bno; + int error; switch (resv) { case XFS_AG_RESV_AGFL: @@ -505,7 +505,6 @@ xrep_put_freelist( struct xfs_scrub *sc, xfs_agblock_t agbno) { - struct xfs_owner_info oinfo; int error; /* Make sure there's space on the freelist. */ @@ -518,9 +517,8 @@ xrep_put_freelist( * create an rmap for the block prior to merging it or else other * parts will break. */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1, - &oinfo); + &XFS_RMAP_OINFO_AG); if (error) return error; @@ -538,17 +536,17 @@ xrep_put_freelist( /* Dispose of a single block. */ STATIC int xrep_reap_block( - struct xfs_scrub *sc, - xfs_fsblock_t fsbno, - struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type resv) + struct xfs_scrub *sc, + xfs_fsblock_t fsbno, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type resv) { - struct xfs_btree_cur *cur; - struct xfs_buf *agf_bp = NULL; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - bool has_other_rmap; - int error; + struct xfs_btree_cur *cur; + struct xfs_buf *agf_bp = NULL; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + bool has_other_rmap; + int error; agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); @@ -612,15 +610,15 @@ out_free: /* Dispose of every block of every extent in the bitmap. */ int xrep_reap_extents( - struct xfs_scrub *sc, - struct xfs_bitmap *bitmap, - struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) + struct xfs_scrub *sc, + struct xfs_bitmap *bitmap, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; - xfs_fsblock_t fsbno; - int error = 0; + struct xfs_bitmap_range *bmr; + struct xfs_bitmap_range *n; + xfs_fsblock_t fsbno; + int error = 0; ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); @@ -745,7 +743,8 @@ xrep_findroot_block( /* Ensure the block magic matches the btree type we're looking for. */ btblock = XFS_BUF_TO_BLOCK(bp); - if (be32_to_cpu(btblock->bb_magic) != fab->magic) + ASSERT(fab->buf_ops->magic[1] != 0); + if (btblock->bb_magic != fab->buf_ops->magic[1]) goto out; /* @@ -770,18 +769,23 @@ xrep_findroot_block( if (!uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) goto out; + /* + * Read verifiers can reference b_ops, so we set the pointer + * here. If the verifier fails we'll reset the buffer state + * to what it was before we touched the buffer. + */ + bp->b_ops = fab->buf_ops; fab->buf_ops->verify_read(bp); if (bp->b_error) { + bp->b_ops = NULL; bp->b_error = 0; goto out; } /* * Some read verifiers will (re)set b_ops, so we must be - * careful not to blow away any such assignment. + * careful not to change b_ops after running the verifier. */ - if (!bp->b_ops) - bp->b_ops = fab->buf_ops; } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 9de321eee4ab..d990314eb08b 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -21,8 +21,9 @@ int xrep_roll_ag_trans(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); -int xrep_alloc_ag_block(struct xfs_scrub *sc, struct xfs_owner_info *oinfo, - xfs_fsblock_t *fsbno, enum xfs_ag_resv_type resv); +int xrep_alloc_ag_block(struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv); int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, struct xfs_buf **bpp, xfs_btnum_t btnum, const struct xfs_buf_ops *ops); @@ -32,7 +33,7 @@ struct xfs_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist); int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist, - struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); struct xrep_find_ag_btree { /* in: rmap owner of the btree we're looking for */ @@ -41,9 +42,6 @@ struct xrep_find_ag_btree { /* in: buffer ops */ const struct xfs_buf_ops *buf_ops; - /* in: magic number of the btree */ - uint32_t magic; - /* out: the highest btree block found and the tree height */ xfs_agblock_t root; unsigned int height; diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 5e293c129813..92a140c5b55e 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -174,24 +174,21 @@ int xchk_rmapbt( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; - - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec, - &oinfo, NULL); + &XFS_RMAP_OINFO_AG, NULL); } /* xref check that the extent is owned by a given owner */ static inline void xchk_xref_check_owner( - struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo, - bool should_have_rmap) + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + bool should_have_rmap) { - bool has_rmap; - int error; + bool has_rmap; + int error; if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) return; @@ -207,10 +204,10 @@ xchk_xref_check_owner( /* xref check that the extent is owned by a given owner */ void xchk_xref_is_owned_by( - struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo) + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) { xchk_xref_check_owner(sc, bno, len, oinfo, true); } @@ -218,10 +215,10 @@ xchk_xref_is_owned_by( /* xref check that the extent is not owned by a given owner */ void xchk_xref_is_not_owned_by( - struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo) + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) { xchk_xref_check_owner(sc, bno, len, oinfo, false); } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 665d4bbb17cc..dbe115b075f7 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space( startext = fsbno; endext = fsbno + len - 1; do_div(startext, sc->mp->m_sb.sb_rextsize); - if (do_div(endext, sc->mp->m_sb.sb_rextsize)) - endext++; - extcount = endext - startext; + do_div(endext, sc->mp->m_sb.sb_rextsize); + extcount = endext - startext + 1; xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, &is_free); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index af323b229c4b..22f754fba8e5 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -122,9 +122,9 @@ void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, - xfs_extlen_t len, struct xfs_owner_info *oinfo); + xfs_extlen_t len, const struct xfs_owner_info *oinfo); void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, - xfs_extlen_t len, struct xfs_owner_info *oinfo); + xfs_extlen_t len, const struct xfs_owner_info *oinfo); void xchk_xref_has_no_owner(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4e20f0e48232..3c83e8b3b39c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -12,6 +12,71 @@ #include <linux/tracepoint.h> #include "xfs_bit.h" +/* + * ftrace's __print_symbolic requires that all enum values be wrapped in the + * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace + * ring buffer. Somehow this was only worth mentioning in the ftrace sample + * code. + */ +TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); +TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); + +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGFL); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGI); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BNOBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_CNTBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INOBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FINOBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RMAPBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_REFCNTBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INODE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTD); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTC); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIR); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_XATTR); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SYMLINK); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PARENT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTBITMAP); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); + +#define XFS_SCRUB_TYPE_STRINGS \ + { XFS_SCRUB_TYPE_PROBE, "probe" }, \ + { XFS_SCRUB_TYPE_SB, "sb" }, \ + { XFS_SCRUB_TYPE_AGF, "agf" }, \ + { XFS_SCRUB_TYPE_AGFL, "agfl" }, \ + { XFS_SCRUB_TYPE_AGI, "agi" }, \ + { XFS_SCRUB_TYPE_BNOBT, "bnobt" }, \ + { XFS_SCRUB_TYPE_CNTBT, "cntbt" }, \ + { XFS_SCRUB_TYPE_INOBT, "inobt" }, \ + { XFS_SCRUB_TYPE_FINOBT, "finobt" }, \ + { XFS_SCRUB_TYPE_RMAPBT, "rmapbt" }, \ + { XFS_SCRUB_TYPE_REFCNTBT, "refcountbt" }, \ + { XFS_SCRUB_TYPE_INODE, "inode" }, \ + { XFS_SCRUB_TYPE_BMBTD, "bmapbtd" }, \ + { XFS_SCRUB_TYPE_BMBTA, "bmapbta" }, \ + { XFS_SCRUB_TYPE_BMBTC, "bmapbtc" }, \ + { XFS_SCRUB_TYPE_DIR, "directory" }, \ + { XFS_SCRUB_TYPE_XATTR, "xattr" }, \ + { XFS_SCRUB_TYPE_SYMLINK, "symlink" }, \ + { XFS_SCRUB_TYPE_PARENT, "parent" }, \ + { XFS_SCRUB_TYPE_RTBITMAP, "rtbitmap" }, \ + { XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \ + { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ + { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ + { XFS_SCRUB_TYPE_PQUOTA, "prjquota" } + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -36,10 +101,10 @@ DECLARE_EVENT_CLASS(xchk_class, __entry->flags = sm->sm_flags; __entry->error = error; ), - TP_printk("dev %d:%d ino 0x%llx type %u agno %u inum %llu gen %u flags 0x%x error %d", + TP_printk("dev %d:%d ino 0x%llx type %s agno %u inum %llu gen %u flags 0x%x error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, __entry->inum, __entry->gen, @@ -78,9 +143,9 @@ TRACE_EVENT(xchk_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d type %s agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, __entry->bno, __entry->error, @@ -109,11 +174,11 @@ TRACE_EVENT(xchk_file_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->offset, __entry->error, __entry->ret_ip) @@ -144,9 +209,9 @@ DECLARE_EVENT_CLASS(xchk_block_error_class, __entry->bno = bno; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d type %s agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, __entry->bno, __entry->ret_ip) @@ -176,10 +241,10 @@ DECLARE_EVENT_CLASS(xchk_ino_error_class, __entry->type = sc->sm->sm_type; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx type %u ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx type %s ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->ret_ip) ) @@ -213,11 +278,11 @@ DECLARE_EVENT_CLASS(xchk_fblock_error_class, __entry->offset = offset; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->offset, __entry->ret_ip) ); @@ -244,9 +309,9 @@ TRACE_EVENT(xchk_incomplete, __entry->type = sc->sm->sm_type; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u ret_ip %pS", + TP_printk("dev %d:%d type %s ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->ret_ip) ); @@ -278,10 +343,10 @@ TRACE_EVENT(xchk_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->ptr, __entry->agno, @@ -321,12 +386,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->ptr, __entry->agno, @@ -360,10 +425,10 @@ TRACE_EVENT(xchk_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->ptr, __entry->agno, @@ -400,12 +465,12 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->ptr, __entry->agno, @@ -439,10 +504,10 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_ptrs[level]; ), - TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d", + TP_printk("dev %d:%d type %s btree %s agno %u agbno %u level %d nlevels %d ptr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->agno, __entry->bno, __entry->level, @@ -473,13 +538,58 @@ TRACE_EVENT(xchk_xref_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u xref error %d ret_ip %pF", + TP_printk("dev %d:%d type %s xref error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->error, __entry->ret_ip) ); +TRACE_EVENT(xchk_iallocbt_check_cluster, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, xfs_daddr_t map_daddr, + unsigned short map_len, unsigned int chunk_ino, + unsigned int nr_inodes, uint16_t cluster_mask, + uint16_t holemask, unsigned int cluster_ino), + TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + cluster_mask, holemask, cluster_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(xfs_daddr_t, map_daddr) + __field(unsigned short, map_len) + __field(unsigned int, chunk_ino) + __field(unsigned int, nr_inodes) + __field(unsigned int, cluster_ino) + __field(uint16_t, cluster_mask) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->map_daddr = map_daddr; + __entry->map_len = map_len; + __entry->chunk_ino = chunk_ino; + __entry->nr_inodes = nr_inodes; + __entry->cluster_mask = cluster_mask; + __entry->holemask = holemask; + __entry->cluster_ino = cluster_ino; + ), + TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->map_daddr, + __entry->map_len, + __entry->chunk_ino, + __entry->nr_inodes, + __entry->cluster_mask, + __entry->holemask, + __entry->cluster_ino) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) @@ -598,11 +708,11 @@ TRACE_EVENT(xrep_init_btblock, __entry->agbno = agbno; __entry->btnum = btnum; ), - TP_printk("dev %d:%d agno %u agbno %u btnum %d", + TP_printk("dev %d:%d agno %u agbno %u btree %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, - __entry->btnum) + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS)) ) TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 338b9d9984e0..3619e9e8d359 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -28,7 +28,8 @@ */ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; - unsigned int io_type; + int fork; + unsigned int data_seq; unsigned int cow_seq; struct xfs_ioend *ioend; }; @@ -62,7 +63,7 @@ xfs_find_daxdev_for_inode( static void xfs_finish_page_writeback( struct inode *inode, - struct bio_vec *bvec, + struct bio_vec *bvec, int error) { struct iomap_page *iop = to_iomap_page(bvec->bv_page); @@ -98,6 +99,7 @@ xfs_destroy_ioend( for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; /* * For the last bio, bi_private points to the ioend, so we @@ -109,7 +111,7 @@ xfs_destroy_ioend( next = bio->bi_private; /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) xfs_finish_page_writeback(inode, bvec, error); bio_put(bio); } @@ -255,30 +257,20 @@ xfs_end_io( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) xfs_reflink_cancel_cow_range(ip, offset, size, true); - break; - } - goto done; } /* - * Success: commit the COW or unwritten blocks if needed. + * Success: commit the COW or unwritten blocks if needed. */ - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) error = xfs_reflink_end_cow(ip, offset, size); - break; - case XFS_IO_UNWRITTEN: - /* writeback should never update isize */ + else if (ioend->io_state == XFS_EXT_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - break; - default: + else ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); - break; - } done: if (ioend->io_append_trans) @@ -293,7 +285,8 @@ xfs_end_bio( struct xfs_ioend *ioend = bio->bi_private; struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) + if (ioend->io_fork == XFS_COW_FORK || + ioend->io_state == XFS_EXT_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -301,6 +294,75 @@ xfs_end_bio( xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } +/* + * Fast revalidation of the cached writeback mapping. Return true if the current + * mapping is valid, false otherwise. + */ +static bool +xfs_imap_valid( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + if (offset_fsb < wpc->imap.br_startoff || + offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + return false; + /* + * If this is a COW mapping, it is sufficient to check that the mapping + * covers the offset. Be careful to check this first because the caller + * can revalidate a COW mapping without updating the data seqno. + */ + if (wpc->fork == XFS_COW_FORK) + return true; + + /* + * This is not a COW mapping. Check the sequence number of the data fork + * because concurrent changes could have invalidated the extent. Check + * the COW fork because concurrent changes since the last time we + * checked (and found nothing at this offset) could have added + * overlapping blocks. + */ + if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + return false; + if (xfs_inode_has_cow_data(ip) && + wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + return false; + return true; +} + +/* + * Pass in a dellalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in wpc->imap. + * + * The current page is held locked so nothing could have removed the block + * backing offset_fsb, although it could have moved from the COW to the data + * fork by another thread. + */ +static int +xfs_convert_blocks( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs + * offset_fsb and put the result into wpc->imap. Allocate in a loop + * because it may take several attempts to allocate real blocks for a + * contiguous delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, + &wpc->imap, wpc->fork == XFS_COW_FORK ? + &wpc->cow_seq : &wpc->data_seq); + if (error) + return error; + } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + + return 0; +} + STATIC int xfs_map_blocks( struct xfs_writepage_ctx *wpc, @@ -310,26 +372,16 @@ xfs_map_blocks( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t count = i_blocksize(inode); - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; struct xfs_bmbt_irec imap; - int whichfork = XFS_DATA_FORK; struct xfs_iext_cursor icur; - bool imap_valid; + int retries = 0; int error = 0; - /* - * We have to make sure the cached mapping is within EOF to protect - * against eofblocks trimming on file release leaving us with a stale - * mapping. Otherwise, a page for a subsequent file extending buffered - * write could get picked up by this writeback cycle and written to the - * wrong blocks. - * - * Note that what we really want here is a generic mapping invalidation - * mechanism to protect us from arbitrary extent modifying contexts, not - * just eofblocks. - */ - xfs_trim_extent_eof(&wpc->imap, ip); + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; /* * COW fork blocks can overlap data fork blocks even if the blocks @@ -346,31 +398,19 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - imap_valid = offset_fsb >= wpc->imap.br_startoff && - offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; - if (imap_valid && - (!xfs_inode_has_cow_data(ip) || - wpc->io_type == XFS_IO_COW || - wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq))) + if (xfs_imap_valid(wpc, ip, offset_fsb)) return 0; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * If we don't have a valid map, now it's time to get a new one for this * offset. This will convert delayed allocations (including COW ones) * into real extents. If we return without a valid map, it means we * landed in a hole and we skip the block. */ +retry: xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_super->s_maxbytes); - - if (offset > mp->m_super->s_maxbytes - count) - count = mp->m_super->s_maxbytes - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); /* * Check if this is offset is covered by a COW extents, and if yes use @@ -382,30 +422,16 @@ xfs_map_blocks( if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - /* - * Truncate can race with writeback since writeback doesn't - * take the iolock and truncate decreases the file size before - * it starts truncating the pages between new_size and old_size. - * Therefore, we can end up in the situation where writeback - * gets a CoW fork mapping but the truncate makes the mapping - * invalid and we end up in here trying to get a new mapping. - * bail out here so that we simply never get a valid mapping - * and so we drop the write altogether. The page truncation - * will kill the contents anyway. - */ - if (offset > i_size_read(inode)) { - wpc->io_type = XFS_IO_HOLE; - return 0; - } - whichfork = XFS_COW_FORK; - wpc->io_type = XFS_IO_COW; + + wpc->fork = XFS_COW_FORK; goto allocate_blocks; } /* - * Map valid and no COW extent in the way? We're done. + * No COW extent overlap. Revalidate now that we may have updated + * ->cow_seq. If the data mapping is still valid, we're done. */ - if (imap_valid) { + if (xfs_imap_valid(wpc, ip, offset_fsb)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -417,49 +443,65 @@ xfs_map_blocks( */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ + wpc->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); + wpc->fork = XFS_DATA_FORK; + + /* landed in a hole or beyond EOF? */ if (imap.br_startoff > offset_fsb) { - /* landed in a hole or beyond EOF */ imap.br_blockcount = imap.br_startoff - offset_fsb; imap.br_startoff = offset_fsb; imap.br_startblock = HOLESTARTBLOCK; - wpc->io_type = XFS_IO_HOLE; - } else { - /* - * Truncate to the next COW extent if there is one. This is the - * only opportunity to do this because we can skip COW fork - * lookups for the subsequent blocks in the mapping; however, - * the requirement to treat the COW range separately remains. - */ - if (cow_fsb != NULLFILEOFF && - cow_fsb < imap.br_startoff + imap.br_blockcount) - imap.br_blockcount = cow_fsb - imap.br_startoff; - - if (isnullstartblock(imap.br_startblock)) { - /* got a delalloc extent */ - wpc->io_type = XFS_IO_DELALLOC; - goto allocate_blocks; - } - - if (imap.br_state == XFS_EXT_UNWRITTEN) - wpc->io_type = XFS_IO_UNWRITTEN; - else - wpc->io_type = XFS_IO_OVERWRITE; + imap.br_state = XFS_EXT_NORM; } + /* + * Truncate to the next COW extent if there is one. This is the only + * opportunity to do this because we can skip COW fork lookups for the + * subsequent blocks in the mapping; however, the requirement to treat + * the COW range separately remains. + */ + if (cow_fsb != NULLFILEOFF && + cow_fsb < imap.br_startoff + imap.br_blockcount) + imap.br_blockcount = cow_fsb - imap.br_startoff; + + /* got a delalloc extent? */ + if (imap.br_startblock != HOLESTARTBLOCK && + isnullstartblock(imap.br_startblock)) + goto allocate_blocks; + wpc->imap = imap; - trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); + trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); return 0; allocate_blocks: - error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, - &wpc->cow_seq); - if (error) + error = xfs_convert_blocks(wpc, ip, offset_fsb); + if (error) { + /* + * If we failed to find the extent in the COW fork we might have + * raced with a COW to data fork conversion or truncate. + * Restart the lookup to catch the extent in the data fork for + * the former case, but prevent additional retries to avoid + * looping forever for the latter case. + */ + if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + goto retry; + ASSERT(error != -EAGAIN); return error; - ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || - imap.br_startoff + imap.br_blockcount <= cow_fsb); - wpc->imap = imap; - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); + } + + /* + * Due to merging the return real extent might be larger than the + * original delalloc one. Trim the return extent to the next COW + * boundary again to force a re-lookup. + */ + if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && + cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) + wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + + ASSERT(wpc->imap.br_startoff <= offset_fsb); + ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); + trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); return 0; } @@ -484,7 +526,7 @@ xfs_submit_ioend( int status) { /* Convert CoW extents to regular */ - if (!status && ioend->io_type == XFS_IO_COW) { + if (!status && ioend->io_fork == XFS_COW_FORK) { /* * Yuk. This can do memory allocation, but is not a * transactional operation so everything is done in GFP_KERNEL @@ -502,7 +544,8 @@ xfs_submit_ioend( /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && + (ioend->io_fork == XFS_COW_FORK || + ioend->io_state != XFS_EXT_UNWRITTEN) && xfs_ioend_is_append(ioend) && !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); @@ -531,7 +574,8 @@ xfs_submit_ioend( static struct xfs_ioend * xfs_alloc_ioend( struct inode *inode, - unsigned int type, + int fork, + xfs_exntst_t state, xfs_off_t offset, struct block_device *bdev, sector_t sector) @@ -545,7 +589,8 @@ xfs_alloc_ioend( ioend = container_of(bio, struct xfs_ioend, io_inline_bio); INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; + ioend->io_fork = fork; + ioend->io_state = state; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; @@ -606,21 +651,23 @@ xfs_add_to_ioend( sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || + if (!wpc->ioend || + wpc->fork != wpc->ioend->io_fork || + wpc->imap.br_state != wpc->ioend->io_state || sector != bio_end_sector(wpc->ioend->io_bio) || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, - bdev, sector); + wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, + wpc->imap.br_state, offset, bdev, sector); } - if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) { + if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) { if (iop) atomic_inc(&iop->write_count); if (bio_full(wpc->ioend->io_bio)) xfs_chain_bio(wpc->ioend, wbc, bdev, sector); - __bio_add_page(wpc->ioend->io_bio, page, len, poff); + bio_add_page(wpc->ioend->io_bio, page, len, poff); } wpc->ioend->io_size += len; @@ -721,7 +768,7 @@ xfs_writepage_map( error = xfs_map_blocks(wpc, inode, file_offset); if (error) break; - if (wpc->io_type == XFS_IO_HOLE) + if (wpc->imap.br_startblock == HOLESTARTBLOCK) continue; xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, &submit_list); @@ -916,9 +963,7 @@ xfs_vm_writepage( struct page *page, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; ret = xfs_do_writepage(page, wbc, &wpc); @@ -932,9 +977,7 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); @@ -981,7 +1024,7 @@ xfs_vm_bmap( * Since we don't pass back blockdev info, we can't return bmap * information for rt files either. */ - if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) + if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; return iomap_bmap(mapping, block, &xfs_iomap_ops); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 494b4338446e..6c2615b83c5d 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -9,29 +9,12 @@ extern struct bio_set xfs_ioend_bioset; /* - * Types of I/O for bmap clustering and I/O completion tracking. - */ -enum { - XFS_IO_HOLE, /* covers region without any block allocation */ - XFS_IO_DELALLOC, /* covers delalloc region */ - XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ - XFS_IO_OVERWRITE, /* covers already allocated extent */ - XFS_IO_COW, /* covers copy-on-write extent */ -}; - -#define XFS_IO_TYPES \ - { XFS_IO_HOLE, "hole" }, \ - { XFS_IO_DELALLOC, "delalloc" }, \ - { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" } - -/* * Structure for buffered I/O completions. */ struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ - unsigned int io_type; /* delalloc / unwritten */ + int io_fork; /* inode fork written back */ + xfs_exntst_t io_state; /* extent state */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a58034049995..3d213a7394c5 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -555,6 +555,7 @@ xfs_attr_put_listent( attrlist_ent_t *aep; int arraytop; + ASSERT(!context->seen_enough); ASSERT(!(context->flags & ATTR_KERNOVAL)); ASSERT(context->count >= 0); ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5d263dfdb3bc..2db43ff4f8b5 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1042,7 +1042,7 @@ out_trans_cancel: goto out_unlock; } -static int +int xfs_flush_unmap_range( struct xfs_inode *ip, xfs_off_t offset, @@ -1126,9 +1126,9 @@ xfs_free_file_space( * page could be mmap'd and iomap_zero_range doesn't do that for us. * Writeback of the eof page will do this, albeit clumsily. */ - if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) { + if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - (offset + len) & ~PAGE_MASK, LLONG_MAX); + round_down(offset + len, PAGE_SIZE), LLONG_MAX); } return error; @@ -1162,16 +1162,13 @@ xfs_zero_file_space( * by virtue of the hole punch. */ error = xfs_free_file_space(ip, offset, len); - if (error) - goto out; + if (error || xfs_is_always_cow_inode(ip)) + return error; - error = xfs_alloc_file_space(ip, round_down(offset, blksize), + return xfs_alloc_file_space(ip, round_down(offset, blksize), round_up(offset + len, blksize) - round_down(offset, blksize), XFS_BMAPI_PREALLOC); -out: - return error; - } static int @@ -1195,13 +1192,7 @@ xfs_prepare_shift( * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1); - if (error) - return error; - error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - offset >> PAGE_SHIFT, -1); - if (error) - return error; + error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip)); /* * Clean out anything hanging around in the cow fork now that diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 87363d136bb6..7a78229cf1a7 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -80,4 +80,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_extnum_t *nextents, xfs_filblks_t *count); +int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); + #endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b21ea2ba768d..548344e25128 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -776,13 +776,24 @@ _xfs_buf_read( } /* - * If the caller passed in an ops structure and the buffer doesn't have ops - * assigned, set the ops and use them to verify the contents. If the contents - * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no - * recorded errors and is already in XBF_DONE state. + * Reverify a buffer found in cache without an attached ->b_ops. + * + * If the caller passed an ops structure and the buffer doesn't have ops + * assigned, set the ops and use it to verify the contents. If verification + * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is + * already in XBF_DONE state on entry. + * + * Under normal operations, every in-core buffer is verified on read I/O + * completion. There are two scenarios that can lead to in-core buffers without + * an assigned ->b_ops. The first is during log recovery of buffers on a V4 + * filesystem, though these buffers are purged at the end of recovery. The + * other is online repair, which intentionally reads with a NULL buffer ops to + * run several verifiers across an in-core buffer in order to establish buffer + * type. If repair can't establish that, the buffer will be left in memory + * with NULL buffer ops. */ int -xfs_buf_ensure_ops( +xfs_buf_reverify( struct xfs_buf *bp, const struct xfs_buf_ops *ops) { @@ -824,7 +835,7 @@ xfs_buf_read_map( return bp; } - xfs_buf_ensure_ops(bp, ops); + xfs_buf_reverify(bp, ops); if (flags & XBF_ASYNC) { /* @@ -1536,8 +1547,7 @@ __xfs_buf_submit( xfs_buf_ioerror(bp, -EIO); bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); - if (bp->b_flags & XBF_ASYNC) - xfs_buf_ioend(bp); + xfs_buf_ioend(bp); return -EIO; } @@ -1992,7 +2002,6 @@ xfs_buf_delwri_submit_buffers( struct list_head *wait_list) { struct xfs_buf *bp, *n; - LIST_HEAD (submit_list); int pinned = 0; struct blk_plug plug; @@ -2195,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) atomic_set(&bp->b_lru_ref, lru_ref); } + +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic( + struct xfs_buf *bp, + __be32 dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + return false; + return dmagic == bp->b_ops->magic[idx]; +} +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic16( + struct xfs_buf *bp, + __be16 dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + return false; + return dmagic == bp->b_ops->magic16[idx]; +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b9f5511ea998..d0b96e071cec 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -125,6 +125,10 @@ struct xfs_buf_map { struct xfs_buf_ops { char *name; + union { + __be32 magic[2]; /* v4 and v5 on disk magic values */ + __be16 magic16[2]; /* v4 and v5 on disk magic values */ + }; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); @@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); +bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 12d8455bfbb2..010db5f8fb00 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1233,9 +1233,23 @@ xfs_buf_iodone( } /* - * Requeue a failed buffer for writeback + * Requeue a failed buffer for writeback. * - * Return true if the buffer has been re-queued properly, false otherwise + * We clear the log item failed state here as well, but we have to be careful + * about reference counts because the only active reference counts on the buffer + * may be the failed log items. Hence if we clear the log item failed state + * before queuing the buffer for IO we can release all active references to + * the buffer and free it, leading to use after free problems in + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which + * order we process them in - the buffer is locked, and we own the buffer list + * so nothing on them is going to change while we are performing this action. + * + * Hence we can safely queue the buffer for IO before we clear the failed log + * item state, therefore always having an active reference to the buffer and + * avoiding the transient zero-reference state that leads to use-after-free. + * + * Return true if the buffer was added to the buffer list, false if it was + * already on the buffer list. */ bool xfs_buf_resubmit_failed_buffers( @@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers( struct list_head *buffer_list) { struct xfs_log_item *lip; + bool ret; + + ret = xfs_buf_delwri_queue(bp, buffer_list); /* - * Clear XFS_LI_FAILED flag from all items before resubmit - * - * XFS_LI_FAILED set/clear is protected by ail_lock, caller this + * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this * function already have it acquired */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) xfs_clear_li_failed(lip); - /* Add this buffer back to the delayed write list */ - return xfs_buf_delwri_queue(bp, buffer_list); + return ret; } diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 9866f542e77b..a1e177f66404 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_BUF_LRU_REF, XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, + XFS_RANDOM_IUNLINK_FALLBACK, }; struct xfs_errortag_attr { @@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); +XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), XFS_ERRORTAG_ATTR_LIST(force_repair), XFS_ERRORTAG_ATTR_LIST(bad_summary), + XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), NULL, }; @@ -357,7 +360,8 @@ xfs_buf_verifier_error( fa = failaddr ? failaddr : __return_address; __xfs_buf_ioerror(bp, error, fa); - xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s", + xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, + "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", fa, bp->b_ops->name, bp->b_bn, name); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 246d3e989c6c..602aa7d62b66 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 +#define XFS_PTAG_VERIFIER_ERROR 0x00000100 #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d9da66c718bb..74ddf66f4cfe 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -494,7 +494,6 @@ xfs_efi_recover( int error = 0; xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; - struct xfs_owner_info oinfo; ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); @@ -526,11 +525,11 @@ xfs_efi_recover( return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - xfs_rmap_any_owner_update(&oinfo); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len, &oinfo, false); + extp->ext_len, + &XFS_RMAP_OINFO_ANY_OWNER, false); if (error) goto abort_error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 61a5ad2600e8..1f2e2845eb76 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -507,7 +507,7 @@ xfs_file_dio_aio_write( * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); return -EREMCHG; } @@ -872,14 +872,27 @@ xfs_file_fallocate( goto out_unlock; } - if (mode & FALLOC_FL_ZERO_RANGE) + if (mode & FALLOC_FL_ZERO_RANGE) { error = xfs_zero_file_space(ip, offset, len); - else { - if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; + } else if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + + if (!xfs_is_always_cow_inode(ip)) { + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); + } + } else { + /* + * If always_cow mode we can't use preallocations and + * thus should not create them. + */ + if (xfs_is_always_cow_inode(ip)) { + error = -EOPNOTSUPP; + goto out_unlock; } + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); } @@ -919,28 +932,67 @@ out_unlock: return error; } -STATIC int -xfs_file_clone_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) -{ - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); -} -STATIC int -xfs_file_dedupe_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) +STATIC loff_t +xfs_file_remap_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + loff_t len, + unsigned int remap_flags) { - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + struct xfs_mount *mp = src->i_mount; + loff_t remapped = 0; + xfs_extlen_t cowextsize; + int ret; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Prepare and then clone file data. */ + ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, + &len, remap_flags); + if (ret < 0 || len == 0) + return ret; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + + ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, + &remapped); + if (ret) + goto out_unlock; + + /* + * Carry the cowextsize hint from src to dest if we're sharing the + * entire source file to the entire destination file, the source file + * has a cowextsize hint, and the destination file does not. + */ + cowextsize = 0; + if (pos_in == 0 && len == i_size_read(inode_in) && + (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && + pos_out == 0 && len >= i_size_read(inode_out) && + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_d.di_cowextsize; + + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, + remap_flags); + +out_unlock: + xfs_reflink_remap_unlock(file_in, file_out); + if (ret) + trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + return remapped > 0 ? remapped : ret; } STATIC int @@ -1029,10 +1081,10 @@ xfs_file_llseek( default: return generic_file_llseek(file, offset, whence); case SEEK_HOLE: - offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); break; case SEEK_DATA: - offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); break; } @@ -1164,6 +1216,7 @@ const struct file_operations xfs_file_operations = { .write_iter = xfs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, @@ -1175,8 +1228,7 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, - .clone_file_range = xfs_file_clone_range, - .dedupe_file_range = xfs_file_dedupe_range, + .remap_file_range = xfs_file_remap_range, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 093c2b8d7e20..584648582ba7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -40,7 +40,6 @@ xfs_growfs_data_private( xfs_rfsblock_t new; xfs_agnumber_t oagcount; xfs_trans_t *tp; - LIST_HEAD (buffer_list); struct aghdr_init_data id = {}; nb = in->newblocks; @@ -252,7 +251,7 @@ xfs_growfs_data( if (mp->m_sb.sb_imax_pct) { uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; do_div(icount, 100); - mp->m_maxicount = icount << mp->m_sb.sb_inopblog; + mp->m_maxicount = XFS_FSB_TO_INO(mp, icount); } else mp->m_maxicount = 0; @@ -534,6 +533,7 @@ xfs_fs_reserve_ag_blocks( int error = 0; int err2; + mp->m_finobt_nores = false; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { pag = xfs_perag_get(mp, agno); err2 = xfs_ag_resv_init(pag, NULL); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 5169e84ae382..d0d377384120 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -16,7 +16,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 255 }, + .panic_mask = { 0, 0, 256 }, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 05db9540e459..f643a9295179 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1332,7 +1332,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); + error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -1754,7 +1754,7 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - if (unlikely(mp->m_inotbt_nores)) { + if (unlikely(mp->m_finobt_nores)) { error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); @@ -1907,86 +1907,510 @@ xfs_inactive( } /* - * This is called when the inode's link count goes to 0 or we are creating a - * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be - * set to true as the link count is dropped to zero by the VFS after we've - * created the file successfully, so we have to add it to the unlinked list - * while the link count is non-zero. + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * What if we modelled the unlinked list as a collection of records capturing + * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd + * have a fast way to look up unlinked list predecessors, which avoids the + * slow list walk. That's exactly what we do here (in-core) with a per-AG + * rhashtable. + * + * Because this is a backref cache, we ignore operational failures since the + * iunlink code can fall back to the slow bucket walk. The only errors that + * should bubble out are for obviously incorrect situations. + * + * All users of the backref cache MUST hold the AGI buffer lock to serialize + * access or have otherwise provided for concurrency control. + */ + +/* Capture a "X.next_unlinked = Y" relationship. */ +struct xfs_iunlink { + struct rhash_head iu_rhash_head; + xfs_agino_t iu_agino; /* X */ + xfs_agino_t iu_next_unlinked; /* Y */ +}; + +/* Unlinked list predecessor lookup hashtable construction */ +static int +xfs_iunlink_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const xfs_agino_t *key = arg->key; + const struct xfs_iunlink *iu = obj; + + if (iu->iu_next_unlinked != *key) + return 1; + return 0; +} + +static const struct rhashtable_params xfs_iunlink_hash_params = { + .min_size = XFS_AGI_UNLINKED_BUCKETS, + .key_len = sizeof(xfs_agino_t), + .key_offset = offsetof(struct xfs_iunlink, + iu_next_unlinked), + .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = xfs_iunlink_obj_cmpfn, +}; + +/* + * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such + * relation is found. + */ +static xfs_agino_t +xfs_iunlink_lookup_backref( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + struct xfs_iunlink *iu; + + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + return iu ? iu->iu_agino : NULLAGINO; +} + +/* + * Take ownership of an iunlink cache entry and insert it into the hash table. + * If successful, the entry will be owned by the cache; if not, it is freed. + * Either way, the caller does not own @iu after this call. + */ +static int +xfs_iunlink_insert_backref( + struct xfs_perag *pag, + struct xfs_iunlink *iu) +{ + int error; + + error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + /* + * Fail loudly if there already was an entry because that's a sign of + * corruption of in-memory data. Also fail loudly if we see an error + * code we didn't anticipate from the rhashtable code. Currently we + * only anticipate ENOMEM. + */ + if (error) { + WARN(error != -ENOMEM, "iunlink cache insert error %d", error); + kmem_free(iu); + } + /* + * Absorb any runtime errors that aren't a result of corruption because + * this is a cache and we can always fall back to bucket list scanning. + */ + if (error != 0 && error != -EEXIST) + error = 0; + return error; +} + +/* Remember that @prev_agino.next_unlinked = @this_agino. */ +static int +xfs_iunlink_add_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t this_agino) +{ + struct xfs_iunlink *iu; + + if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) + return 0; + + iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); + iu->iu_agino = prev_agino; + iu->iu_next_unlinked = this_agino; + + return xfs_iunlink_insert_backref(pag, iu); +} + +/* + * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. + * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there + * wasn't any such entry then we don't bother. + */ +static int +xfs_iunlink_change_backref( + struct xfs_perag *pag, + xfs_agino_t agino, + xfs_agino_t next_unlinked) +{ + struct xfs_iunlink *iu; + int error; + + /* Look up the old entry; if there wasn't one then exit. */ + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + if (!iu) + return 0; + + /* + * Remove the entry. This shouldn't ever return an error, but if we + * couldn't remove the old entry we don't want to add it again to the + * hash table, and if the entry disappeared on us then someone's + * violated the locking rules and we need to fail loudly. Either way + * we cannot remove the inode because internal state is or would have + * been corrupt. + */ + error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + if (error) + return error; + + /* If there is no new next entry just free our item and return. */ + if (next_unlinked == NULLAGINO) { + kmem_free(iu); + return 0; + } + + /* Update the entry and re-add it to the hash table. */ + iu->iu_next_unlinked = next_unlinked; + return xfs_iunlink_insert_backref(pag, iu); +} + +/* Set up the in-core predecessor structures. */ +int +xfs_iunlink_init( + struct xfs_perag *pag) +{ + return rhashtable_init(&pag->pagi_unlinked_hash, + &xfs_iunlink_hash_params); +} + +/* Free the in-core predecessor structures. */ +static void +xfs_iunlink_free_item( + void *ptr, + void *arg) +{ + struct xfs_iunlink *iu = ptr; + bool *freed_anything = arg; + + *freed_anything = true; + kmem_free(iu); +} + +void +xfs_iunlink_destroy( + struct xfs_perag *pag) +{ + bool freed_anything = false; + + rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, + xfs_iunlink_free_item, &freed_anything); + + ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); +} + +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + old_value, new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) + return -EFSCORRUPTED; + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + +/* Set an on-disk inode's next_unlinked pointer. */ +STATIC void +xfs_iunlink_update_dinode( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_buf *ibp, + struct xfs_dinode *dip, + struct xfs_imap *imap, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + int offset; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + trace_xfs_iunlink_update_dinode(mp, agno, agino, + be32_to_cpu(dip->di_next_unlinked), next_agino); + + dip->di_next_unlinked = cpu_to_be32(next_agino); + offset = imap->im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); + xfs_inobp_check(mp, ibp); +} + +/* Set an in-core inode's unlinked pointer and return the old value. */ +STATIC int +xfs_iunlink_update_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_agnumber_t agno, + xfs_agino_t next_agino, + xfs_agino_t *old_next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + xfs_agino_t old_value; + int error; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + if (error) + return error; + + /* Make sure the old pointer isn't garbage. */ + old_value = be32_to_cpu(dip->di_next_unlinked); + if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + error = -EFSCORRUPTED; + goto out; + } + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + *old_next_agino = old_value; + if (old_value == next_agino) { + if (next_agino != NULLAGINO) + error = -EFSCORRUPTED; + goto out; + } + + /* Ok, update the new pointer. */ + xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + ibp, dip, &ip->i_imap, next_agino); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + +/* + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. * * We place the on-disk inode on a list in the AGI. It will be pulled from this * list when the inode is freed. */ STATIC int xfs_iunlink( - struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp = tp->t_mountp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agino_t agino; - short bucket_index; - int offset; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + xfs_agino_t next_agino; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ - error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - ASSERT(agino != 0); - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - ASSERT(agi->agi_unlinked[bucket_index]); - ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(mp, agno, next_agino)) + return -EFSCORRUPTED; + + if (next_agino != NULLAGINO) { + struct xfs_perag *pag; + xfs_agino_t old_agino; + + /* + * There is already another inode in the bucket, so point this + * inode to the current head of the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, + &old_agino); + if (error) + return error; + ASSERT(old_agino == NULLAGINO); - if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { /* - * There is already another inode in the bucket we need - * to add ourselves to. Add us at the front of the list. - * Here we put the head pointer into our next pointer, - * and then we fall through to point the head at us. + * agino has been unlinked, add a backref from the next inode + * back to agino. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_add_backref(pag, agino, next_agino); + xfs_perag_put(pag); if (error) return error; + } + + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); +} - ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); - dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); +/* Return the imap, dinode pointer, and buffer for an inode. */ +STATIC int +xfs_iunlink_map_ino( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = tp->t_mountp; + int error; - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); + imap->im_blkno = 0; + error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + + return 0; +} + +/* + * Walk the unlinked chain from @head_agino until we find the inode that + * points to @target_agino. Return the inode number, map, dinode pointer, + * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. + * + * @tp, @pag, @head_agino, and @target_agino are input parameters. + * @agino, @imap, @dipp, and @bpp are all output parameters. + * + * Do not call this function if @target_agino is the head of the list. + */ +STATIC int +xfs_iunlink_map_prev( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t head_agino, + xfs_agino_t target_agino, + xfs_agino_t *agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + struct xfs_perag *pag) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_agino_t next_agino; + int error; + + ASSERT(head_agino != target_agino); + *bpp = NULL; + + /* See if our backref cache can find it faster. */ + *agino = xfs_iunlink_lookup_backref(pag, target_agino); + if (*agino != NULLAGINO) { + error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); + if (error) + return error; + + if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) + return 0; + + /* + * If we get here the cache contents were corrupt, so drop the + * buffer and fall back to walking the bucket list. + */ + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + WARN_ON_ONCE(1); + } + + trace_xfs_iunlink_map_prev_fallback(mp, agno); + + /* Otherwise, walk the entire bucket until we find it. */ + next_agino = head_agino; + while (next_agino != target_agino) { + xfs_agino_t unlinked_agino; + + if (*bpp) + xfs_trans_brelse(tp, *bpp); + + *agino = next_agino; + error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, + bpp); + if (error) + return error; + + unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); + /* + * Make sure this pointer is valid and isn't an obvious + * infinite loop. + */ + if (!xfs_verify_agino(mp, agno, unlinked_agino) || + next_agino == unlinked_agino) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, + *dipp, sizeof(**dipp)); + error = -EFSCORRUPTED; + return error; + } + next_agino = unlinked_agino; } - /* - * Point the bucket head pointer at the inode being inserted. - */ - ASSERT(agino != 0); - agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); return 0; } @@ -1995,181 +2419,106 @@ xfs_iunlink( */ STATIC int xfs_iunlink_remove( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_ino_t next_ino; - xfs_mount_t *mp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_agino_t agino; - xfs_agino_t next_agino; - xfs_buf_t *last_ibp; - xfs_dinode_t *last_dip = NULL; - short bucket_index; - int offset, last_offset = 0; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + struct xfs_buf *last_ibp; + struct xfs_dinode *last_dip = NULL; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino; + xfs_agino_t head_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; - mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + trace_xfs_iunlink_remove(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ + /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - if (!xfs_verify_agino(mp, agno, agino)) - return -EFSCORRUPTED; - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - if (!xfs_verify_agino(mp, agno, - be32_to_cpu(agi->agi_unlinked[bucket_index]))) { + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(mp, agno, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; } - if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { - /* - * We're at the head of the list. Get the inode's on-disk - * buffer to see if there is anyone after us on the list. - * Only modify our next pointer if it is not already NULLAGINO. - * This saves us the overhead of dealing with the buffer when - * there is no need to change it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the bucket head pointer at the next inode. - */ - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - } else { - /* - * We need to search the list for the inode being freed. - */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - last_ibp = NULL; - while (next_agino != agino) { - struct xfs_imap imap; + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); + if (error) + return error; - if (last_ibp) - xfs_trans_brelse(tp, last_ibp); + /* + * If there was a backref pointing from the next inode back to this + * one, remove it because we've removed this inode from the list. + * + * Later, if this inode was in the middle of the list we'll update + * this inode's backref to point from the next inode. + */ + if (next_agino != NULLAGINO) { + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_change_backref(pag, next_agino, + NULLAGINO); + if (error) + goto out; + } - imap.im_blkno = 0; - next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + if (head_agino == agino) { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); + if (error) + goto out; + } else { + struct xfs_imap imap; + xfs_agino_t prev_agino; - error = xfs_imap(mp, tp, next_ino, &imap, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } + if (!pag) + pag = xfs_perag_get(mp, agno); - error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, - &last_ibp, 0, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } + /* We need to search the list for the inode being freed. */ + error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, + &prev_agino, &imap, &last_dip, &last_ibp, + pag); + if (error) + goto out; - last_offset = imap.im_boffset; - next_agino = be32_to_cpu(last_dip->di_next_unlinked); - if (!xfs_verify_agino(mp, agno, next_agino)) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - last_dip, sizeof(*last_dip)); - return -EFSCORRUPTED; - } - } + /* Point the previous inode on the list to the next inode. */ + xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, + last_dip, &imap, next_agino); /* - * Now last_ibp points to the buffer previous to us on the - * unlinked list. Pull us from the list. + * Now we deal with the backref for this inode. If this inode + * pointed at a real inode, change the backref that pointed to + * us to point to our old next. If this inode was the end of + * the list, delete the backref that pointed to us. Note that + * change_backref takes care of deleting the backref if + * next_agino is NULLAGINO. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the previous inode on the list to the next inode. - */ - last_dip->di_next_unlinked = cpu_to_be32(next_agino); - ASSERT(next_agino != 0); - offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, last_dip); - - xfs_trans_inode_buf(tp, last_ibp); - xfs_trans_log_buf(tp, last_ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, last_ibp); + error = xfs_iunlink_change_backref(pag, agino, next_agino); + if (error) + goto out; } - return 0; + +out: + if (pag) + xfs_perag_put(pag); + return error; } /* @@ -2184,8 +2533,6 @@ xfs_ifree_cluster( struct xfs_icluster *xic) { xfs_mount_t *mp = free_ip->i_mount; - int blks_per_cluster; - int inodes_per_cluster; int nbufs; int i, j; int ioffset; @@ -2199,11 +2546,9 @@ xfs_ifree_cluster( inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - blks_per_cluster = xfs_icluster_size_fsb(mp); - inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; - nbufs = mp->m_ialloc_blks / blks_per_cluster; + nbufs = mp->m_ialloc_blks / mp->m_blocks_per_cluster; - for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + for (j = 0; j < nbufs; j++, inum += mp->m_inodes_per_cluster) { /* * The allocation bitmap tells us which inodes of the chunk were * physically allocated. Skip the cluster if an inode falls into @@ -2211,7 +2556,7 @@ xfs_ifree_cluster( */ ioffset = inum - xic->first_ino; if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { - ASSERT(ioffset % inodes_per_cluster == 0); + ASSERT(ioffset % mp->m_inodes_per_cluster == 0); continue; } @@ -2227,7 +2572,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, + mp->m_bsize * mp->m_blocks_per_cluster, XBF_UNMAPPED); if (!bp) @@ -2242,7 +2587,7 @@ xfs_ifree_cluster( * want it to fail. We can acheive this by adding a write * verifier to the buffer. */ - bp->b_ops = &xfs_inode_buf_ops; + bp->b_ops = &xfs_inode_buf_ops; /* * Walk the inodes already attached to the buffer and mark them @@ -2274,7 +2619,7 @@ xfs_ifree_cluster( * transaction stale above, which means there is no point in * even trying to lock them. */ - for (i = 0; i < inodes_per_cluster; i++) { + for (i = 0; i < mp->m_inodes_per_cluster; i++) { retry: rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, @@ -2837,11 +3182,9 @@ xfs_rename_alloc_whiteout( /* * Prepare the tmpfile inode as if it were created through the VFS. - * Otherwise, the link increment paths will complain about nlink 0->1. - * Drop the link count as done by d_tmpfile(), complete the inode setup - * and flag it as linkable. + * Complete the inode setup and flag it as linkable. nlink is already + * zero, so we can skip the drop_nlink. */ - drop_nlink(VFS_I(tmpfile)); xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index be2014520155..e62074a5257c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone; bool xfs_inode_verify_forks(struct xfs_inode *ip); +int xfs_iunlink_init(struct xfs_perag *pag); +void xfs_iunlink_destroy(struct xfs_perag *pag); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6e2c08f30f60..6ecdbb3af7de 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1608,7 +1608,7 @@ xfs_ioc_getbmap( error = 0; out_free_buf: kmem_free(buf); - return 0; + return error; } struct getfsmap_info { diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index fba115f4103a..5001dca361e9 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -241,6 +241,32 @@ xfs_compat_ioc_bulkstat( int done; int error; + /* + * Output structure handling functions. Depending on the command, + * either the xfs_bstat and xfs_inogrp structures are written out + * to userpace memory via bulkreq.ubuffer. Normally the compat + * functions and structure size are the correct ones to use ... + */ + inumbers_fmt_pf inumbers_func = xfs_inumbers_fmt_compat; + bulkstat_one_pf bs_one_func = xfs_bulkstat_one_compat; + size_t bs_one_size = sizeof(struct compat_xfs_bstat); + +#ifdef CONFIG_X86_X32 + if (in_x32_syscall()) { + /* + * ... but on x32 the input xfs_fsop_bulkreq has pointers + * which must be handled in the "compat" (32-bit) way, while + * the xfs_bstat and xfs_inogrp structures follow native 64- + * bit layout convention. So adjust accordingly, otherwise + * the data written out in compat layout will not match what + * x32 userspace expects. + */ + inumbers_func = xfs_inumbers_fmt; + bs_one_func = xfs_bulkstat_one; + bs_one_size = sizeof(struct xfs_bstat); + } +#endif + /* done = 1 if there are more stats to get and if bulkstat */ /* should be called again (unused here, but used in dmapi) */ @@ -272,15 +298,15 @@ xfs_compat_ioc_bulkstat( if (cmd == XFS_IOC_FSINUMBERS_32) { error = xfs_inumbers(mp, &inlast, &count, - bulkreq.ubuffer, xfs_inumbers_fmt_compat); + bulkreq.ubuffer, inumbers_func); } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) { int res; - error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, - sizeof(compat_xfs_bstat_t), NULL, &res); + error = bs_one_func(mp, inlast, bulkreq.ubuffer, + bs_one_size, NULL, &res); } else if (cmd == XFS_IOC_FSBULKSTAT_32) { error = xfs_bulkstat(mp, &inlast, &count, - xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), + bs_one_func, bs_one_size, bulkreq.ubuffer, &done); } else error = -EINVAL; @@ -336,6 +362,7 @@ xfs_compat_attrlist_by_handle( { int error; attrlist_cursor_kern_t *cursor; + compat_xfs_fsop_attrlist_handlereq_t __user *p = arg; compat_xfs_fsop_attrlist_handlereq_t al_hreq; struct dentry *dentry; char *kbuf; @@ -370,6 +397,11 @@ xfs_compat_attrlist_by_handle( if (error) goto out_kfree; + if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { + error = -EFAULT; + goto out_kfree; + } + if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) error = -EFAULT; @@ -547,8 +579,12 @@ xfs_file_compat_ioctl( case FS_IOC_GETFSMAP: case XFS_IOC_SCRUB_METADATA: return xfs_file_ioctl(filp, cmd, p); -#ifndef BROKEN_X86_ALIGNMENT - /* These are handled fine if no alignment issues */ +#if !defined(BROKEN_X86_ALIGNMENT) || defined(CONFIG_X86_X32) + /* + * These are handled fine if no alignment issues. To support x32 + * which uses native 64-bit alignment we must emit these cases in + * addition to the ia-32 compat set below. + */ case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: @@ -561,8 +597,16 @@ xfs_file_compat_ioctl( case XFS_IOC_FSGROWFSDATA: case XFS_IOC_FSGROWFSRT: case XFS_IOC_ZERO_RANGE: +#ifdef CONFIG_X86_X32 + /* + * x32 special: this gets a different cmd number from the ia-32 compat + * case below; the associated data will match native 64-bit alignment. + */ + case XFS_IOC_SWAPEXT: +#endif return xfs_file_ioctl(filp, cmd, p); -#else +#endif +#if defined(BROKEN_X86_ALIGNMENT) case XFS_IOC_ALLOCSP_32: case XFS_IOC_FREESP_32: case XFS_IOC_ALLOCSP64_32: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 27c93b5f029d..63d323916bba 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -35,18 +35,40 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -void +static int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return -EFSCORRUPTED; +} + +int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool shared) { struct xfs_mount *mp = ip->i_mount; + if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + if (imap->br_startblock == HOLESTARTBLOCK) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { + } else if (imap->br_startblock == DELAYSTARTBLOCK || + isnullstartblock(imap->br_startblock)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { @@ -60,6 +82,13 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + + if (xfs_ipincount(ip) && + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + if (shared) + iomap->flags |= IOMAP_F_SHARED; + return 0; } static void @@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb( return 0; } -STATIC int -xfs_alert_fsblock_zero( - xfs_inode_t *ip, - xfs_bmbt_irec_t *imap) -{ - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)imap->br_startblock, - (unsigned long long)imap->br_startoff, - (unsigned long long)imap->br_blockcount, - imap->br_state); - return -EFSCORRUPTED; -} - int xfs_iomap_write_direct( xfs_inode_t *ip, @@ -383,12 +395,13 @@ xfs_quota_calc_throttle( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_inode *ip, + int whichfork, loff_t offset, loff_t count, struct xfs_iext_cursor *icur) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmbt_irec prev; int shift = 0; @@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); xfs_fileoff_t end_fsb; - int error = 0, eof = 0; - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; + bool eof = false, cow_eof = false, shared = false; + int whichfork = XFS_DATA_FORK; + int error = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay( XFS_STATS_INC(mp, xs_blk_mapw); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto out_unlock; @@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay( end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); + /* + * Search the data fork fork first to look up our source mapping. We + * always need the data fork map, as we have to return it to the + * iomap code so that the higher level write code can read data in to + * perform read-modify-write cycles for unaligned writes. + */ + eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); if (eof) - got.br_startoff = end_fsb; /* fake hole until the end */ + imap.br_startoff = end_fsb; /* fake hole until the end */ + + /* We never need to allocate blocks for zeroing a hole. */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); + goto out_unlock; + } - if (got.br_startoff <= offset_fsb) { + /* + * Search the COW fork extent list even if we did not find a data fork + * extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. + */ + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + whichfork = XFS_COW_FORK; + goto done; + } + } + + if (imap.br_startoff <= offset_fsb) { /* * For reflink files we may need a delalloc reservation when * overwriting shared extents. This includes zeroing of * existing extents that contain data. */ - if (xfs_is_reflink_inode(ip) && - ((flags & IOMAP_WRITE) || - got.br_state != XFS_EXT_UNWRITTEN)) { - xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); - error = xfs_reflink_reserve_cow(ip, &got); - if (error) - goto out_unlock; + if (!xfs_is_cow_inode(ip) || + ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; } - trace_xfs_iomap_found(ip, offset, count, 0, &got); - goto done; - } + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); - if (flags & IOMAP_ZERO) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); - goto out_unlock; + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_inode_need_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; + } + + /* + * Fork all the shared blocks from our write offset until the + * end of the extent. + */ + whichfork = XFS_COW_FORK; + end_fsb = imap.br_startoff + imap.br_blockcount; + } else { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat + * symmetric with the work writeback does. This is a completely + * arbitrary number pulled out of thin air. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + + if (xfs_is_always_cow_inode(ip)) + whichfork = XFS_COW_FORK; } error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages - * to keep the chunks of work done where somewhat symmetric with the - * work writeback does. This is a completely arbitrary number pulled - * out of thin air as a best guess for initial testing. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, - &icur); + prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset, + count, &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay( } retry: - error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, &got, &icur, - eof); + error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, + whichfork == XFS_DATA_FORK ? &imap : &cmap, + whichfork == XFS_DATA_FORK ? &icur : &ccur, + whichfork == XFS_DATA_FORK ? eof : cow_eof); switch (error) { case 0: break; @@ -647,186 +711,22 @@ retry: * them out if the write happens to fail. */ iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); + trace_xfs_iomap_alloc(ip, offset, count, whichfork, + whichfork == XFS_DATA_FORK ? &imap : &cmap); done: - if (isnullstartblock(got.br_startblock)) - got.br_startblock = DELAYSTARTBLOCK; - - if (!got.br_startblock) { - error = xfs_alert_fsblock_zero(ip, &got); - if (error) + if (whichfork == XFS_COW_FORK) { + if (imap.br_startoff > offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); goto out_unlock; - } - - xfs_bmbt_to_iomap(ip, iomap, &got); - -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; -} - -/* - * Pass in a delayed allocate extent, convert it to real extents; - * return to the caller the extent we create which maps on top of - * the originating callers request. - * - * Called without a lock on the inode. - * - * We no longer bother to look at the incoming map - all we have to - * guarantee is that whatever we allocate fills the required range. - */ -int -xfs_iomap_write_allocate( - xfs_inode_t *ip, - int whichfork, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - unsigned int *cow_seq) -{ - xfs_mount_t *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_fileoff_t offset_fsb, last_block; - xfs_fileoff_t end_fsb, map_start_fsb; - xfs_filblks_t count_fsb; - xfs_trans_t *tp; - int nimaps; - int error = 0; - int flags = XFS_BMAPI_DELALLOC; - int nres; - - if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; - - /* - * Make sure that the dquots are there. - */ - error = xfs_qm_dqattach(ip); - if (error) - return error; - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = imap->br_blockcount; - map_start_fsb = imap->br_startoff; - - XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); - - while (count_fsb != 0) { - /* - * Set up a transaction with which to allocate the - * backing store for the file. Do allocations in a - * loop until we get some space in the range we are - * interested in. The other space that might be allocated - * is in the delayed allocation extent on which we sit - * but before our buffer starts. - */ - nimaps = 0; - while (nimaps == 0) { - nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - /* - * We have already reserved space for the extent and any - * indirect blocks when creating the delalloc extent, - * there is no need to reserve space in this transaction - * again. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - /* - * it is possible that the extents have changed since - * we did the read call as we dropped the ilock for a - * while. We have to be careful about truncates or hole - * punchs here - we are not allowed to allocate - * non-delalloc blocks here. - * - * The only protection against truncation is the pages - * for the range we are being asked to convert are - * locked and hence a truncate will block on them - * first. - * - * As a result, if we go beyond the range we really - * need and hit an delalloc extent boundary followed by - * a hole while we have excess blocks in the map, we - * will fill the hole incorrectly and overrun the - * transaction reservation. - * - * Using a single map prevents this as we are forced to - * check each map we look for overlap with the desired - * range and abort as soon as we find it. Also, given - * that we only return a single map, having one beyond - * what we can return is probably a bit silly. - * - * We also need to check that we don't go beyond EOF; - * this is a truncate optimisation as a truncate sets - * the new file size before block on the pages we - * currently have locked under writeback. Because they - * are about to be tossed, we don't need to write them - * back.... - */ - nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(ip, &last_block, - XFS_DATA_FORK); - if (error) - goto trans_cancel; - - last_block = XFS_FILEOFF_MAX(last_block, end_fsb); - if ((map_start_fsb + count_fsb) > last_block) { - count_fsb = last_block - map_start_fsb; - if (count_fsb == 0) { - error = -EAGAIN; - goto trans_cancel; - } - } - - /* - * From this point onwards we overwrite the imap - * pointer that the caller gave to us. - */ - error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, flags, nres, imap, - &nimaps); - if (error) - goto trans_cancel; - - error = xfs_trans_commit(tp); - if (error) - goto error0; - - if (whichfork == XFS_COW_FORK) - *cow_seq = READ_ONCE(ifp->if_seq); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - /* - * See if we were able to allocate an extent that - * covers at least part of the callers request - */ - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_alert_fsblock_zero(ip, imap); - - if ((offset_fsb >= imap->br_startoff) && - (offset_fsb < (imap->br_startoff + - imap->br_blockcount))) { - XFS_STATS_INC(mp, xs_xstrat_quick); - return 0; } - - /* - * So far we have not mapped the requested part of the - * file, just surrounding data, try again. - */ - count_fsb -= imap->br_blockcount; - map_start_fsb = imap->br_startoff + imap->br_blockcount; + /* ensure we only report blocks we have a reservation for */ + xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); + shared = true; } - -trans_cancel: - xfs_trans_cancel(tp); -error0: + error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -975,7 +875,7 @@ xfs_ilock_for_iomap( * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && is_write) { + if (xfs_is_cow_inode(ip) && is_write) { /* * FIXME: It could still overwrite on unshared extents and not * need allocation. @@ -1009,7 +909,7 @@ relock: * check, so if we got ILOCK_SHARED for a write and but we're now a * reflink inode we have to switch to ILOCK_EXCL and relock. */ - if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) { xfs_iunlock(ip, mode); mode = XFS_ILOCK_EXCL; goto relock; @@ -1081,23 +981,33 @@ xfs_file_iomap_begin( * Break shared extents if necessary. Checks for non-blocking IO have * been done up front, so we don't need to do them here. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { + struct xfs_bmbt_irec cmap; + bool directio = (flags & IOMAP_DIRECT); + /* if zeroing doesn't need COW allocation, then we are done. */ if ((flags & IOMAP_ZERO) && !needs_cow_for_zeroing(&imap, nimaps)) goto out_found; - if (flags & IOMAP_DIRECT) { - /* may drop and re-acquire the ilock */ - error = xfs_reflink_allocate_cow(ip, &imap, &shared, - &lockmode); - if (error) - goto out_unlock; - } else { - error = xfs_reflink_reserve_cow(ip, &imap); - if (error) - goto out_unlock; - } + /* may drop and re-acquire the ilock */ + cmap = imap; + error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode, + directio); + if (error) + goto out_unlock; + + /* + * For buffered writes we need to report the address of the + * previous block (if there was any) so that the higher level + * write code can perform read-modify-write operations; we + * won't need the CoW fork mapping until writeback. For direct + * I/O, which must be block aligned, we need to report the + * newly allocated address. If the data fork has a hole, copy + * the COW fork mapping to avoid allocating to the data fork. + */ + if (directio || imap.br_startblock == HOLESTARTBLOCK) + imap = cmap; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; @@ -1139,23 +1049,15 @@ xfs_file_iomap_begin( return error; iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); out_finish: - if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields - & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; - - xfs_bmbt_to_iomap(ip, iomap, &imap); - - if (shared) - iomap->flags |= IOMAP_F_SHARED; - return 0; + return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); out_found: ASSERT(nimaps); xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); goto out_finish; out_unlock: @@ -1241,6 +1143,92 @@ const struct iomap_ops xfs_iomap_ops = { }; static int +xfs_seek_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + int error = 0; + unsigned lockmode; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lockmode = xfs_ilock_data_map_shared(ip); + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + } + + if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { + /* + * If we found a data extent we are done. + */ + if (imap.br_startoff <= offset_fsb) + goto done; + data_fsb = imap.br_startoff; + } else { + /* + * Fake a hole until the end of the file. + */ + data_fsb = min(XFS_B_TO_FSB(mp, offset + length), + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + } + + /* + * If a COW fork extent covers the hole, report it - capped to the next + * data fork extent: + */ + if (xfs_inode_has_cow_data(ip) && + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cow_fsb = cmap.br_startoff; + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { + if (data_fsb < cow_fsb + cmap.br_blockcount) + end_fsb = min(end_fsb, data_fsb); + xfs_trim_extent(&cmap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + /* + * This is a COW extent, so we must probe the page cache + * because there could be dirty page cache being backed + * by this extent. + */ + iomap->type = IOMAP_UNWRITTEN; + goto out_unlock; + } + + /* + * Else report a hole, capped to the next found data or COW extent. + */ + if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb) + imap.br_blockcount = cow_fsb - offset_fsb; + else + imap.br_blockcount = data_fsb - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; +done: + xfs_trim_extent(&imap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + +const struct iomap_ops xfs_seek_iomap_ops = { + .iomap_begin = xfs_seek_iomap_begin, +}; + +static int xfs_xattr_iomap_begin( struct inode *inode, loff_t offset, @@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin( out_unlock: xfs_iunlock(ip, lockmode); - if (!error) { - ASSERT(nimaps); - xfs_bmbt_to_iomap(ip, iomap, &imap); - } - - return error; + if (error) + return error; + ASSERT(nimaps); + return xfs_bmbt_to_iomap(ip, iomap, &imap, false); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c6170548831b..5c2f6aa6d78f 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,12 +13,10 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, - struct xfs_bmbt_irec *, unsigned int *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); -void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *); +int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *, bool shared); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); static inline xfs_filblks_t @@ -42,6 +40,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f48ffd7a8d3e..74047bd0c1ae 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -191,9 +191,18 @@ xfs_generic_create( xfs_setup_iops(ip); - if (tmpfile) + if (tmpfile) { + /* + * The VFS requires that any inode fed to d_tmpfile must have + * nlink == 1 so that it can decrement the nlink in d_tmpfile. + * However, we created the temp file with nlink == 0 because + * we're not allowed to put an inode with nlink > 0 on the + * unlinked list. Therefore we have to set nlink to 1 so that + * d_tmpfile can immediately set it back to zero. + */ + set_nlink(inode, 1); d_tmpfile(dentry, inode); - else + } else d_instantiate(dentry, inode); xfs_finish_inode_setup(ip); @@ -522,6 +531,10 @@ xfs_vn_getattr( } } + /* + * Note: If you add another clause to set an attribute flag, please + * update attributes_mask below. + */ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) @@ -529,6 +542,10 @@ xfs_vn_getattr( if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_IMMUTABLE | + STATX_ATTR_APPEND | + STATX_ATTR_NODUMP); + switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index e9508ba01ed1..942e4aa5e729 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -167,20 +167,18 @@ xfs_bulkstat_ichunk_ra( { xfs_agblock_t agbno; struct blk_plug plug; - int blks_per_cluster; - int inodes_per_cluster; int i; /* inode chunk index */ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino); - blks_per_cluster = xfs_icluster_size_fsb(mp); - inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; blk_start_plug(&plug); for (i = 0; i < XFS_INODES_PER_CHUNK; - i += inodes_per_cluster, agbno += blks_per_cluster) { - if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) { - xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster, - &xfs_inode_buf_ops); + i += mp->m_inodes_per_cluster, agbno += mp->m_blocks_per_cluster) { + if (xfs_inobt_maskn(i, mp->m_inodes_per_cluster) & + ~irec->ir_free) { + xfs_btree_reada_bufs(mp, agno, agbno, + mp->m_blocks_per_cluster, + &xfs_inode_buf_ops); } } blk_finish_plug(&plug); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1fc9e9042e0e..3371d1ff27c4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type( case XFS_BLFT_BTREE_BUF: switch (magic32) { case XFS_ABTB_CRC_MAGIC: - case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: + bp->b_ops = &xfs_bnobt_buf_ops; + break; + case XFS_ABTC_CRC_MAGIC: case XFS_ABTC_MAGIC: - bp->b_ops = &xfs_allocbt_buf_ops; + bp->b_ops = &xfs_cntbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: - case XFS_FIBT_CRC_MAGIC: case XFS_IBT_MAGIC: - case XFS_FIBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_finobt_buf_ops; + break; case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; @@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); @@ -3850,7 +3854,6 @@ xlog_recover_do_icreate_pass2( unsigned int count; unsigned int isize; xfs_agblock_t length; - int blks_per_cluster; int bb_per_cluster; int cancel_count; int nbufs; @@ -3918,14 +3921,13 @@ xlog_recover_do_icreate_pass2( * buffers for cancellation so we don't overwrite anything written after * a cancellation. */ - blks_per_cluster = xfs_icluster_size_fsb(mp); - bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster); - nbufs = length / blks_per_cluster; + bb_per_cluster = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); + nbufs = length / mp->m_blocks_per_cluster; for (i = 0, cancel_count = 0; i < nbufs; i++) { xfs_daddr_t daddr; daddr = XFS_AGB_TO_DADDR(mp, agno, - agbno + i * blks_per_cluster); + agbno + i * mp->m_blocks_per_cluster); if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) cancel_count++; } diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 576c375ce12a..6b736ea58d35 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -107,5 +107,5 @@ assfail(char *expr, char *file, int line) void xfs_hex_dump(void *p, int length) { - print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); + print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 02d15098dbee..fd63b0b1307c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -149,6 +149,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -227,6 +228,9 @@ xfs_initialize_perag( /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; + error = xfs_iunlink_init(pag); + if (error) + goto out_hash_destroy; } index = xfs_set_inode_alloc(mp, agcount); @@ -249,6 +253,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_iunlink_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } @@ -798,6 +803,10 @@ xfs_mountfs( if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) mp->m_inode_cluster_size = new_size; } + mp->m_blocks_per_cluster = xfs_icluster_size_fsb(mp); + mp->m_inodes_per_cluster = XFS_FSB_TO_INO(mp, mp->m_blocks_per_cluster); + mp->m_cluster_align = xfs_ialloc_cluster_alignment(mp); + mp->m_cluster_align_inodes = XFS_FSB_TO_INO(mp, mp->m_cluster_align); /* * If enabled, sparse inode chunk alignment is expected to match the diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7964513c3128..110f927cf943 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -89,6 +89,13 @@ typedef struct xfs_mount { int m_logbsize; /* size of each log buffer */ uint m_rsumlevels; /* rt summary levels */ uint m_rsumsize; /* size of rt summary, bytes */ + /* + * Optional cache of rt summary level per bitmap block with the + * invariant that m_rsum_cache[bbno] <= the minimum i for which + * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip + * inode lock. + */ + uint8_t *m_rsum_cache; struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ struct xfs_inode *m_rsumip; /* pointer to summary inode */ struct xfs_inode *m_rootip; /* pointer to root directory */ @@ -101,6 +108,10 @@ typedef struct xfs_mount { uint8_t m_agno_log; /* log #ag's */ uint8_t m_agino_log; /* #bits for agino in inum */ uint m_inode_cluster_size;/* min inode buf size */ + unsigned int m_inodes_per_cluster; + unsigned int m_blocks_per_cluster; + unsigned int m_cluster_align; + unsigned int m_cluster_align_inodes; uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ @@ -127,7 +138,7 @@ typedef struct xfs_mount { struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint64_t m_flags; /* global mount flags */ - bool m_inotbt_nores; /* no per-AG finobt resv. */ + bool m_finobt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode @@ -183,6 +194,7 @@ typedef struct xfs_mount { */ uint32_t m_generation; + bool m_always_cow; bool m_fail_unmount; #ifdef DEBUG /* @@ -385,6 +397,13 @@ typedef struct xfs_perag { /* reference count */ uint8_t pagf_refcount_level; + + /* + * Unlinked inode information. This incore information reflects + * data stored in the AGI, so callers must hold the AGI buffer lock + * or have some other means to control concurrency. + */ + struct rhashtable pagi_unlinked_hash; } xfs_perag_t; static inline struct xfs_ag_resv * diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index d3e04d20d8d4..c8ba98fae30a 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + + /* + * The v5 superblock format extended several v4 header structures with + * additional data. While new fields are only accessible on v5 + * superblocks, it's important that the v5 structures place original v4 + * fields/headers in the correct location on-disk. For example, we must + * be able to find magic values at the same location in certain blocks + * regardless of superblock version. + * + * The following checks ensure that various v5 data structures place the + * subset of v4 metadata associated with the same type of block at the + * start of the on-disk block. If there is no data structure definition + * for certain types of v4 blocks, traverse down to the first field of + * common metadata (e.g., magic value) and make sure it is at offset + * zero. + */ + XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index f44c3599527d..bde2c9f56a46 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -185,7 +185,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_bmbt_to_iomap(ip, iomap, &imap); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 73a1d77ec187..3091e4bc04ef 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -40,7 +40,7 @@ xfs_fill_statvfs_from_dquot( statp->f_files = limit; statp->f_ffree = (statp->f_files > dqp->q_res_icount) ? - (statp->f_ffree - dqp->q_res_icount) : 0; + (statp->f_files - dqp->q_res_icount) : 0; } } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 8eaeec9d58ed..680ae7662a78 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } @@ -234,92 +234,59 @@ xfs_reflink_trim_around_shared( } } -/* - * Trim the passed in imap to the next shared/unshared extent boundary, and - * if imap->br_startoff points to a shared extent reserve space for it in the - * COW fork. - * - * Note that imap will always contain the block numbers for the existing blocks - * in the data fork, as the upper layers need them for read-modify-write - * operations. - */ -int -xfs_reflink_reserve_cow( +bool +xfs_inode_need_cow( struct xfs_inode *ip, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool *shared) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got; - int error = 0; - bool eof = false; - struct xfs_iext_cursor icur; - bool shared; - - /* - * Search the COW fork extent list first. This serves two purposes: - * first this implement the speculative preallocation using cowextisze, - * so that we also unshared block adjacent to shared blocks instead - * of just the shared blocks themselves. Second the lookup in the - * extent list is generally faster than going out to the shared extent - * tree. - */ - - if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got)) - eof = true; - if (!eof && got.br_startoff <= imap->br_startoff) { - trace_xfs_reflink_cow_found(ip, imap); - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + /* We can't update any real extents in always COW mode. */ + if (xfs_is_always_cow_inode(ip) && + !isnullstartblock(imap->br_startblock)) { + *shared = true; return 0; } /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, imap, &shared); - if (error) - return error; - - /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) - return 0; - - /* - * Fork all the shared blocks from our write offset until the end of - * the extent. - */ - error = xfs_qm_dqattach_locked(ip, false); - if (error) - return error; - - error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - imap->br_blockcount, 0, &got, &icur, eof); - if (error == -ENOSPC || error == -EDQUOT) - trace_xfs_reflink_cow_enospc(ip, imap); - if (error) - return error; - - trace_xfs_reflink_cow_alloc(ip, &got); - return 0; + return xfs_reflink_trim_around_shared(ip, imap, shared); } -/* Convert part of an unwritten CoW extent to a real one. */ -STATIC int -xfs_reflink_convert_cow_extent( - struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, - xfs_fileoff_t offset_fsb, - xfs_filblks_t count_fsb) +static int +xfs_reflink_convert_cow_locked( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb) { - int nimaps = 1; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_btree_cur *dummy_cur = NULL; + int dummy_logflags; + int error = 0; - if (imap->br_state == XFS_EXT_NORM) + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; - xfs_trim_extent(imap, offset_fsb, count_fsb); - trace_xfs_reflink_convert_cow(ip, imap); - if (imap->br_blockcount == 0) - return 0; - return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap, - &nimaps); + do { + if (got.br_startoff >= offset_fsb + count_fsb) + break; + if (got.br_state == XFS_EXT_NORM) + continue; + if (WARN_ON_ONCE(isnullstartblock(got.br_startblock))) + return -EIO; + + xfs_trim_extent(&got, offset_fsb, count_fsb); + if (!got.br_blockcount) + continue; + + got.br_state = XFS_EXT_NORM; + error = xfs_bmap_add_extent_unwritten_real(NULL, ip, + XFS_COW_FORK, &icur, &dummy_cur, &got, + &dummy_logflags); + if (error) + return error; + } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got)); + + return error; } /* Convert all of the unwritten CoW extents in a file's range to real ones. */ @@ -333,15 +300,12 @@ xfs_reflink_convert_cow( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_filblks_t count_fsb = end_fsb - offset_fsb; - struct xfs_bmbt_irec imap; - int nimaps = 1, error = 0; + int error; ASSERT(count != 0); xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT | - XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -374,7 +338,7 @@ xfs_find_trim_cow_extent( if (got.br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, got.br_startoff - imap->br_startoff); - return xfs_reflink_trim_around_shared(ip, imap, shared); + return xfs_inode_need_cow(ip, imap, shared); } *shared = true; @@ -396,7 +360,8 @@ xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, - uint *lockmode) + uint *lockmode, + bool convert_now) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; @@ -408,7 +373,10 @@ xfs_reflink_allocate_cow( xfs_extlen_t resblks = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } error = xfs_find_trim_cow_extent(ip, imap, shared, &found); if (error || !*shared) @@ -470,7 +438,16 @@ xfs_reflink_allocate_cow( if (nimaps == 0) return -ENOSPC; convert: - return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); + xfs_trim_extent(imap, offset_fsb, count_fsb); + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!convert_now || imap->br_state == XFS_EXT_NORM) + return 0; + trace_xfs_reflink_convert_cow(ip, imap); + return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, @@ -585,7 +562,7 @@ xfs_reflink_cancel_cow_range( int error; trace_xfs_reflink_cancel_cow_range(ip, offset, count); - ASSERT(xfs_is_reflink_inode(ip)); + ASSERT(ip->i_cowfp); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (count == NULLFILEOFF) @@ -622,54 +599,47 @@ out: } /* - * Remap parts of a file's data fork after a successful CoW. + * Remap part of the CoW fork into the data fork. + * + * We aim to remap the range starting at @offset_fsb and ending at @end_fsb + * into the data fork; this function will remap what it can (at the end of the + * range) and update @end_fsb appropriately. Each remap gets its own + * transaction because we can end up merging and splitting bmbt blocks for + * every remap operation and we'd like to keep the block reservation + * requirements as low as possible. */ -int -xfs_reflink_end_cow( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t count) +STATIC int +xfs_reflink_end_cow_extent( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t *end_fsb) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, del; - struct xfs_trans *tp; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t end_fsb; - int error; - unsigned int resblks; - xfs_filblks_t rlen; - struct xfs_iext_cursor icur; - - trace_xfs_reflink_end_cow(ip, offset, count); + struct xfs_bmbt_irec got, del; + struct xfs_iext_cursor icur; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_filblks_t rlen; + unsigned int resblks; + int error; /* No COW extents? That's easy! */ - if (ifp->if_bytes == 0) + if (ifp->if_bytes == 0) { + *end_fsb = offset_fsb; return 0; + } - offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); - end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); + if (error) + return error; /* - * Start a rolling transaction to switch the mappings. We're - * unlikely ever to have to remap 16T worth of single-block - * extents, so just cap the worst case extent count to 2^32-1. - * Stick a warning in just in case, and avoid 64-bit division. + * Lock the inode. We have to ijoin without automatic unlock because + * the lead transaction is the refcountbt record deletion; the data + * fork update follows as a deferred log item. */ - BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX); - if (end_fsb - offset_fsb > UINT_MAX) { - error = -EFSCORRUPTED; - xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE); - ASSERT(0); - goto out; - } - resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount, - (unsigned int)(end_fsb - offset_fsb), - XFS_DATA_FORK); - error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); - if (error) - goto out; - xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -678,80 +648,131 @@ xfs_reflink_end_cow( * left by the time I/O completes for the loser of the race. In that * case we are done. */ - if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) + if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) || + got.br_startoff + got.br_blockcount <= offset_fsb) { + *end_fsb = offset_fsb; goto out_cancel; + } - /* Walk backwards until we're out of the I/O range... */ - while (got.br_startoff + got.br_blockcount > offset_fsb) { - del = got; - xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); - - /* Extent delete may have bumped ext forward */ - if (!del.br_blockcount) - goto prev_extent; + /* + * Structure copy @got into @del, then trim @del to the range that we + * were asked to remap. We preserve @got for the eventual CoW fork + * deletion; from now on @del represents the mapping that we're + * actually remapping. + */ + del = got; + xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb); - /* - * Only remap real extent that contain data. With AIO - * speculatively preallocations can leak into the range we - * are called upon, and we need to skip them. - */ - if (!xfs_bmap_is_real_extent(&got)) - goto prev_extent; + ASSERT(del.br_blockcount > 0); - /* Unmap the old blocks in the data fork. */ - ASSERT(tp->t_firstblock == NULLFSBLOCK); - rlen = del.br_blockcount; - error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1); - if (error) - goto out_cancel; + /* + * Only remap real extents that contain data. With AIO, speculative + * preallocations can leak into the range we are called upon, and we + * need to skip them. + */ + if (!xfs_bmap_is_real_extent(&got)) { + *end_fsb = del.br_startoff; + goto out_cancel; + } - /* Trim the extent to whatever got unmapped. */ - if (rlen) { - xfs_trim_extent(&del, del.br_startoff + rlen, - del.br_blockcount - rlen); - } - trace_xfs_reflink_cow_remap(ip, &del); + /* Unmap the old blocks in the data fork. */ + rlen = del.br_blockcount; + error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1); + if (error) + goto out_cancel; - /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(tp, del.br_startblock, - del.br_blockcount); - if (error) - goto out_cancel; + /* Trim the extent to whatever got unmapped. */ + xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen); + trace_xfs_reflink_cow_remap(ip, &del); - /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp, ip, &del); - if (error) - goto out_cancel; + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(tp, del.br_startblock, + del.br_blockcount); + if (error) + goto out_cancel; - /* Charge this new data fork mapping to the on-disk quota. */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, - (long)del.br_blockcount); + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(tp, ip, &del); + if (error) + goto out_cancel; - /* Remove the mapping from the CoW fork. */ - xfs_bmap_del_extent_cow(ip, &icur, &got, &del); + /* Charge this new data fork mapping to the on-disk quota. */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, + (long)del.br_blockcount); - error = xfs_defer_finish(&tp); - if (error) - goto out_cancel; - if (!xfs_iext_get_extent(ifp, &icur, &got)) - break; - continue; -prev_extent: - if (!xfs_iext_prev_extent(ifp, &icur, &got)) - break; - } + /* Remove the mapping from the CoW fork. */ + xfs_bmap_del_extent_cow(ip, &icur, &got, &del); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) - goto out; + return error; + + /* Update the caller about how much progress we made. */ + *end_fsb = del.br_startoff; return 0; out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); -out: - trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + return error; +} + +/* + * Remap parts of a file's data fork after a successful CoW. + */ +int +xfs_reflink_end_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + + /* + * Walk backwards until we're out of the I/O range. The loop function + * repeatedly cycles the ILOCK to allocate one transaction per remapped + * extent. + * + * If we're being called by writeback then the the pages will still + * have PageWriteback set, which prevents races with reflink remapping + * and truncate. Reflink remapping prevents races with writeback by + * taking the iolock and mmaplock before flushing the pages and + * remapping, which means there won't be any further writeback or page + * cache dirtying until the reflink completes. + * + * We should never have two threads issuing writeback for the same file + * region. There are also have post-eof checks in the writeback + * preparation code so that we don't bother writing out pages that are + * about to be truncated. + * + * If we're being called as part of directio write completion, the dio + * count is still elevated, which reflink and truncate will wait for. + * Reflink remapping takes the iolock and mmaplock and waits for + * pending dio to finish, which should prevent any directio until the + * remap completes. Multiple concurrent directio writes to the same + * region are handled by end_cow processing only occurring for the + * threads which succeed; the outcome of multiple overlapping direct + * writes is not well defined anyway. + * + * It's possible that a buffered write and a direct write could collide + * here (the buffered write stumbles in after the dio flushes and + * invalidates the page cache and immediately queues writeback), but we + * have never supported this 100%. If either disk write succeeds the + * blocks will be remapped. + */ + while (end_fsb > offset_fsb && !error) + error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb); + + if (error) + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); return error; } @@ -913,18 +934,18 @@ out_error: /* * Update destination inode size & cowextsize hint, if necessary. */ -STATIC int +int xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, - bool is_dedupe) + unsigned int remap_flags) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; - if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); @@ -945,10 +966,6 @@ xfs_reflink_update_dest( dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; } - if (!is_dedupe) { - xfs_trans_ichgtime(tp, dest, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); error = xfs_trans_commit(tp); @@ -1112,19 +1129,28 @@ out: /* * Iteratively remap one file's extents (and holes) to another's. */ -STATIC int +int xfs_reflink_remap_blocks( struct xfs_inode *src, - xfs_fileoff_t srcoff, + loff_t pos_in, struct xfs_inode *dest, - xfs_fileoff_t destoff, - xfs_filblks_t len, - xfs_off_t new_isize) + loff_t pos_out, + loff_t remap_len, + loff_t *remapped) { struct xfs_bmbt_irec imap; + xfs_fileoff_t srcoff; + xfs_fileoff_t destoff; + xfs_filblks_t len; + xfs_filblks_t range_len; + xfs_filblks_t remapped_len = 0; + xfs_off_t new_isize = pos_out + remap_len; int nimaps; int error = 0; - xfs_filblks_t range_len; + + destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); + srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); + len = XFS_B_TO_FSB(src->i_mount, remap_len); /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ while (len) { @@ -1139,10 +1165,10 @@ xfs_reflink_remap_blocks( error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); xfs_iunlock(src, lock_mode); if (error) - goto err; + break; ASSERT(nimaps == 1); - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, &imap); /* Translate imap into the destination file. */ @@ -1153,23 +1179,24 @@ xfs_reflink_remap_blocks( error = xfs_reflink_remap_extent(dest, &imap, destoff, new_isize); if (error) - goto err; + break; if (fatal_signal_pending(current)) { error = -EINTR; - goto err; + break; } /* Advance drange/srange */ srcoff += range_len; destoff += range_len; len -= range_len; + remapped_len += range_len; } - return 0; - -err: - trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + if (error) + trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + *remapped = min_t(loff_t, remap_len, + XFS_FSB_TO_B(src->i_mount, remapped_len)); return error; } @@ -1218,7 +1245,7 @@ retry: } /* Unlock both inodes after they've been prepped for a range clone. */ -STATIC void +void xfs_reflink_remap_unlock( struct file *file_in, struct file *file_out) @@ -1286,21 +1313,20 @@ xfs_reflink_zero_posteof( * stale data in the destination file. Hence we reject these clone attempts with * -EINVAL in this case. */ -STATIC int +int xfs_reflink_remap_prep( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *len, - bool is_dedupe) + loff_t *len, + unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); bool same_inode = (inode_in == inode_out); - u64 blkmask = i_blocksize(inode_in) - 1; ssize_t ret; /* Lock both files against IO */ @@ -1323,29 +1349,11 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, - len, is_dedupe); - if (ret <= 0) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); + if (ret < 0 || *len == 0) goto out_unlock; - /* - * If the dedupe data matches, chop off the partial EOF block - * from the source file so we don't try to dedupe the partial - * EOF block. - */ - if (is_dedupe) { - *len &= ~blkmask; - } else if (*len & blkmask) { - /* - * The user is attempting to share a partial EOF block, - * if it's inside the destination EOF then reject it. - */ - if (pos_out + *len < i_size_read(inode_out)) { - ret = -EINVAL; - goto out_unlock; - } - } - /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) @@ -1364,102 +1372,23 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; - /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + *len) - 1); - - /* If we're altering the file contents... */ - if (!is_dedupe) { - /* - * ...update the timestamps (which will grab the ilock again - * from xfs_fs_dirty_inode, so we have to call it before we - * take the ilock). - */ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - goto out_unlock; - } - - /* - * ...clear the security bits if the process is not being run - * by root. This keeps people from modifying setuid and setgid - * binaries. - */ - ret = file_remove_privs(file_out); - if (ret) - goto out_unlock; + /* + * If pos_out > EOF, we may have dirtied blocks between EOF and + * pos_out. In that case, we need to extend the flush and unmap to cover + * from EOF to the end of the copy length. + */ + if (pos_out > XFS_ISIZE(dest)) { + loff_t flen = *len + (pos_out - XFS_ISIZE(dest)); + ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen); + } else { + ret = xfs_flush_unmap_range(dest, pos_out, *len); } - - return 1; -out_unlock: - xfs_reflink_remap_unlock(file_in, file_out); - return ret; -} - -/* - * Link a range of blocks from one file to another. - */ -int -xfs_reflink_remap_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - bool is_dedupe) -{ - struct inode *inode_in = file_inode(file_in); - struct xfs_inode *src = XFS_I(inode_in); - struct inode *inode_out = file_inode(file_out); - struct xfs_inode *dest = XFS_I(inode_out); - struct xfs_mount *mp = src->i_mount; - xfs_fileoff_t sfsbno, dfsbno; - xfs_filblks_t fsblen; - xfs_extlen_t cowextsize; - ssize_t ret; - - if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return -EOPNOTSUPP; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - /* Prepare and then clone file data. */ - ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, - &len, is_dedupe); - if (ret <= 0) - return ret; - - trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - - dfsbno = XFS_B_TO_FSBT(mp, pos_out); - sfsbno = XFS_B_TO_FSBT(mp, pos_in); - fsblen = XFS_B_TO_FSB(mp, len); - ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, - pos_out + len); if (ret) goto out_unlock; - /* - * Carry the cowextsize hint from src to dest if we're sharing the - * entire source file to the entire destination file, the source file - * has a cowextsize hint, and the destination file does not. - */ - cowextsize = 0; - if (pos_in == 0 && len == i_size_read(inode_in) && - (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && - pos_out == 0 && len >= i_size_read(inode_out) && - !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) - cowextsize = src->i_d.di_cowextsize; - - ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, - is_dedupe); - + return 1; out_unlock: xfs_reflink_remap_unlock(file_in, file_out); - if (ret) - trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; } diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 7f47202b5639..28a43b7f581d 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -6,16 +6,28 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip) +{ + return ip->i_mount->m_always_cow && + xfs_sb_version_hasreflink(&ip->i_mount->m_sb); +} + +static inline bool xfs_is_cow_inode(struct xfs_inode *ip) +{ + return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); +} + extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + bool *shared); -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); + struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, + bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); @@ -27,13 +39,24 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); -extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); +extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t len, + unsigned int remap_flags); extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *has_shared); extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, struct xfs_trans **tpp); extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); +extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t *len, + unsigned int remap_flags); +extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, + struct xfs_inode *dest, loff_t pos_out, loff_t remap_len, + loff_t *remapped); +extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, + xfs_extlen_t cowextsize, unsigned int remap_flags); +extern void xfs_reflink_remap_unlock(struct file *file_in, + struct file *file_out); #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 926ed314ffba..ac0fcdad0c4e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -64,8 +64,12 @@ xfs_rtany_summary( int log; /* loop counter, log2 of ext. size */ xfs_suminfo_t sum; /* summary data */ + /* There are no extents at levels < m_rsum_cache[bbno]. */ + if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno]) + low = mp->m_rsum_cache[bbno]; + /* - * Loop over logs of extent sizes. Order is irrelevant. + * Loop over logs of extent sizes. */ for (log = low; log <= high; log++) { /* @@ -80,13 +84,17 @@ xfs_rtany_summary( */ if (sum) { *stat = 1; - return 0; + goto out; } } /* * Found nothing, return failure. */ *stat = 0; +out: + /* There were no extents at levels < log. */ + if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; return 0; } @@ -853,6 +861,21 @@ out_trans_cancel: return error; } +static void +xfs_alloc_rsum_cache( + xfs_mount_t *mp, /* file system mount structure */ + xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */ +{ + /* + * The rsum cache is initialized to all zeroes, which is trivially a + * lower bound on the minimum level with any free extents. We can + * continue without the cache if it couldn't be allocated. + */ + mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); + if (!mp->m_rsum_cache) + xfs_warn(mp, "could not allocate realtime summary cache"); +} + /* * Visible (exported) functions. */ @@ -881,6 +904,7 @@ xfs_growfs_rt( xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ xfs_fsblock_t sumbno; /* summary block number */ + uint8_t *rsum_cache; /* old summary cache */ sbp = &mp->m_sb; /* @@ -937,6 +961,11 @@ xfs_growfs_rt( error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); if (error) return error; + + rsum_cache = mp->m_rsum_cache; + if (nrbmblocks != sbp->sb_rbmblocks) + xfs_alloc_rsum_cache(mp, nrbmblocks); + /* * Allocate a new (fake) mount/sb. */ @@ -1062,6 +1091,20 @@ error_cancel: */ kmem_free(nmp); + /* + * If we had to allocate a new rsum_cache, we either need to free the + * old one (if we succeeded) or free the new one and restore the old one + * (if there was an error). + */ + if (rsum_cache != mp->m_rsum_cache) { + if (error) { + kmem_free(mp->m_rsum_cache); + mp->m_rsum_cache = rsum_cache; + } else { + kmem_free(rsum_cache); + } + } + return error; } @@ -1187,8 +1230,8 @@ xfs_rtmount_init( } /* - * Get the bitmap and summary inodes into the mount structure - * at mount time. + * Get the bitmap and summary inodes and the summary cache into the mount + * structure at mount time. */ int /* error */ xfs_rtmount_inodes( @@ -1198,19 +1241,18 @@ xfs_rtmount_inodes( xfs_sb_t *sbp; sbp = &mp->m_sb; - if (sbp->sb_rbmino == NULLFSINO) - return 0; error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); if (error) return error; ASSERT(mp->m_rbmip != NULL); - ASSERT(sbp->sb_rsumino != NULLFSINO); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); if (error) { xfs_irele(mp->m_rbmip); return error; } ASSERT(mp->m_rsumip != NULL); + xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); return 0; } @@ -1218,6 +1260,7 @@ void xfs_rtunmount_inodes( struct xfs_mount *mp) { + kmem_free(mp->m_rsum_cache); if (mp->m_rbmip) xfs_irele(mp->m_rbmip); if (mp->m_rsumip) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d3e6cd063688..f093ea244849 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -38,6 +38,7 @@ #include "xfs_refcount_item.h" #include "xfs_bmap_item.h" #include "xfs_reflink.h" +#include "xfs_defer.h" #include <linux/namei.h> #include <linux/dax.h> @@ -607,7 +608,7 @@ xfs_set_inode_alloc( } /* Get the last possible inode in the filesystem */ - agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + agino = XFS_AGB_TO_AGINO(mp, sbp->sb_agblocks - 1); ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); /* @@ -1149,7 +1150,7 @@ xfs_fs_statfs( statp->f_bfree = fdblocks - mp->m_alloc_set_aside; statp->f_bavail = statp->f_bfree; - fakeinos = statp->f_bfree << sbp->sb_inopblog; + fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) statp->f_files = min_t(typeof(statp->f_files), @@ -1593,6 +1594,13 @@ xfs_mount_alloc( INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; + /* + * We don't create the finobt per-ag space reservation until after log + * recovery, so we must set this to true so that an ifree transaction + * started during log recovery will not depend on space reservations + * for finobt expansion. + */ + mp->m_finobt_nores = true; return mp; } @@ -1728,11 +1736,18 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) { - xfs_alert(mp, + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (mp->m_sb.sb_rblocks) { + xfs_alert(mp, "reflink not compatible with realtime device!"); - error = -EINVAL; - goto out_filestream_unmount; + error = -EINVAL; + goto out_filestream_unmount; + } + + if (xfs_globals.always_cow) { + xfs_info(mp, "using DEBUG-only always_cow mode."); + mp->m_always_cow = true; + } } if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { @@ -2085,11 +2100,6 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); - xfs_extent_free_init_defer_op(); - xfs_rmap_update_init_defer_op(); - xfs_refcount_update_init_defer_op(); - xfs_bmap_update_init_defer_op(); - xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index a3e98c64b6e3..b2c1177c717f 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -192,6 +192,7 @@ xfs_symlink( pathlen = strlen(target_path); if (pathlen >= XFS_SYMLINK_MAXLEN) /* total string too long */ return -ENAMETOOLONG; + ASSERT(pathlen > 0); udqp = gdqp = NULL; prid = xfs_get_initial_prid(dp); @@ -378,6 +379,12 @@ out_release_inode: /* * Free a symlink that has blocks associated with it. + * + * Note: zero length symlinks are not allowed to exist. When we set the size to + * zero, also change it to a regular file so that it does not get written to + * disk as a zero length symlink. The inode is on the unlinked list already, so + * userspace cannot find this inode anymore, so this change is not user visible + * but allows us to catch corrupt zero-length symlinks in the verifiers. */ STATIC int xfs_inactive_symlink_rmt( @@ -412,13 +419,14 @@ xfs_inactive_symlink_rmt( xfs_trans_ijoin(tp, ip, 0); /* - * Lock the inode, fix the size, and join it to the transaction. - * Hold it so in the normal path, we still have it locked for - * the second transaction. In the error paths we need it + * Lock the inode, fix the size, turn it into a regular file and join it + * to the transaction. Hold it so in the normal path, we still have it + * locked for the second transaction. In the error paths we need it * held so the cancel won't rele it, see below. */ size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; + VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -494,17 +502,10 @@ xfs_inactive_symlink( return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Zero length symlinks _can_ exist. - */ pathlen = (int)ip->i_d.di_size; - if (!pathlen) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; - } + ASSERT(pathlen); - if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) { + if (pathlen <= 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", __func__, (unsigned long long)ip->i_ino, pathlen); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -512,12 +513,12 @@ xfs_inactive_symlink( return -EFSCORRUPTED; } + /* + * Inline fork state gets removed by xfs_difree() so we have nothing to + * do here in that case. + */ if (ip->i_df.if_flags & XFS_IFINLINE) { - if (ip->i_df.if_bytes > 0) - xfs_idata_realloc(ip, -(ip->i_df.if_bytes), - XFS_DATA_FORK); xfs_iunlock(ip, XFS_ILOCK_EXCL); - ASSERT(ip->i_df.if_bytes == 0); return 0; } diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 168488130a19..ad7f9be13087 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,7 @@ struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ + bool always_cow; /* use COW fork for all overwrites */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index cd6a994a7250..cabda13f3c64 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -183,10 +183,34 @@ mount_delay_show( } XFS_SYSFS_ATTR_RW(mount_delay); +static ssize_t +always_cow_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.always_cow); + if (ret < 0) + return ret; + return count; +} + +static ssize_t +always_cow_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow); +} +XFS_SYSFS_ATTR_RW(always_cow); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), + ATTR_LIST(always_cow), NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3043e5ed6495..47fb07d86efd 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; + if (bp->b_bn == XFS_BUF_DADDR_NULL) + __entry->bno = bp->b_maps[0].bm_bn; + else + __entry->bno = bp->b_bn; __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); @@ -637,6 +640,16 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); +/* + * ftrace's __print_symbolic requires that all enum values be wrapped in the + * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace + * ring buffer. Somehow this was only worth mentioning in the ftrace sample + * code. + */ +TRACE_DEFINE_ENUM(PE_SIZE_PTE); +TRACE_DEFINE_ENUM(PE_SIZE_PMD); +TRACE_DEFINE_ENUM(PE_SIZE_PUD); + TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), @@ -1207,15 +1220,15 @@ DEFINE_READPAGE_EVENT(xfs_vm_readpages); DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int type, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, type, irec), + int whichfork, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, whichfork, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, type) + __field(int, whichfork) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -1226,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->type = type; + __entry->whichfork = whichfork; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count, - __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->whichfork == XFS_COW_FORK ? "cow" : "data", __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount) ) -#define DEFINE_IOMAP_EVENT(name) \ +#define DEFINE_IMAP_EVENT(name) \ DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int type, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, type, irec)) -DEFINE_IOMAP_EVENT(xfs_map_blocks_found); -DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_found); + int whichfork, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, whichfork, irec)) +DEFINE_IMAP_EVENT(xfs_map_blocks_found); +DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1882,11 +1895,11 @@ TRACE_EVENT(xfs_dir2_leafn_moveents, { 0, "target" }, \ { 1, "temp" } -#define XFS_INODE_FORMAT_STR \ - { 0, "invalid" }, \ - { 1, "local" }, \ - { 2, "extent" }, \ - { 3, "btree" } +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_DEV); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID); DECLARE_EVENT_CLASS(xfs_swap_extent_class, TP_PROTO(struct xfs_inode *ip, int which), @@ -2175,6 +2188,14 @@ DEFINE_DISCARD_EVENT(xfs_discard_exclude); DEFINE_DISCARD_EVENT(xfs_discard_busy); /* btree cursor events */ +TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); +TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); + DECLARE_EVENT_CLASS(xfs_btree_cur_class, TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), TP_ARGS(cur, level, bp), @@ -2194,9 +2215,9 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, __entry->ptr = cur->bc_ptrs[level]; __entry->daddr = bp ? bp->b_bn : -1; ), - TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx", + TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->btnum, + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->nlevels, __entry->ptr, @@ -2273,7 +2294,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type->type; + __entry->type = dfp->dfp_type; __entry->intent = dfp->dfp_intent; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; @@ -2402,7 +2423,7 @@ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); DECLARE_EVENT_CLASS(xfs_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, - struct xfs_owner_info *oinfo), + const struct xfs_owner_info *oinfo), TP_ARGS(mp, agno, agbno, len, unwritten, oinfo), TP_STRUCT__entry( __field(dev_t, dev) @@ -2437,7 +2458,7 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, DEFINE_EVENT(xfs_rmap_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \ - struct xfs_owner_info *oinfo), \ + const struct xfs_owner_info *oinfo), \ TP_ARGS(mp, agno, agbno, len, unwritten, oinfo)) /* simple AG-based error/%ip tracepoint class */ @@ -2607,10 +2628,9 @@ DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); #define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name) /* ag btree lookup tracepoint class */ -#define XFS_AG_BTREE_CMP_FORMAT_STR \ - { XFS_LOOKUP_EQ, "eq" }, \ - { XFS_LOOKUP_LE, "le" }, \ - { XFS_LOOKUP_GE, "ge" } +TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi); +TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi); +TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi); DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_lookup_t dir), @@ -3052,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); TRACE_EVENT(xfs_reflink_remap_blocks_loop, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, @@ -3176,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); -DEFINE_RW_EVENT(xfs_reflink_reserve_cow); - DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); @@ -3345,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll); DEFINE_TRANS_EVENT(xfs_trans_add_item); DEFINE_TRANS_EVENT(xfs_trans_free_items); +TRACE_EVENT(xfs_iunlink_update_bucket, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, bucket, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bucket = bucket; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_ptr, + __entry->new_ptr) +); + +TRACE_EVENT(xfs_iunlink_update_dinode, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, agino, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->old_ptr, + __entry->new_ptr) +); + +DECLARE_EVENT_CLASS(xfs_ag_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno %u agino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->agino) +) + +#define DEFINE_AGINODE_EVENT(name) \ +DEFINE_EVENT(xfs_ag_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_AGINODE_EVENT(xfs_iunlink); +DEFINE_AGINODE_EVENT(xfs_iunlink_remove); +DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a0c5dbda18aa..c6e1c5704a8c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -223,13 +223,13 @@ void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); bool xfs_trans_buf_is_dirty(struct xfs_buf *bp); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); -void xfs_extent_free_init_defer_op(void); struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *, struct xfs_efi_log_item *, uint); int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, - xfs_extlen_t, struct xfs_owner_info *, + xfs_extlen_t, + const struct xfs_owner_info *, bool); int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **); @@ -248,7 +248,6 @@ extern kmem_zone_t *xfs_trans_zone; /* rmap updates */ enum xfs_rmap_intent_type; -void xfs_rmap_update_init_defer_op(void); struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp, struct xfs_rui_log_item *ruip); int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, @@ -260,7 +259,6 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, /* refcount updates */ enum xfs_refcount_intent_type; -void xfs_refcount_update_init_defer_op(void); struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp, struct xfs_cui_log_item *cuip); int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, @@ -272,7 +270,6 @@ int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, /* mapping updates */ enum xfs_bmap_intent_type; -void xfs_bmap_update_init_defer_op(void); struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp, struct xfs_bui_log_item *buip); int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 741c558b2179..e1c7d55b32c3 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -220,8 +220,7 @@ xfs_bmap_update_cancel_item( kmem_free(bmap); } -static const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .type = XFS_DEFER_OPS_TYPE_BMAP, +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { .max_items = XFS_BUI_MAX_FAST_EXTENTS, .diff_items = xfs_bmap_update_diff_items, .create_intent = xfs_bmap_update_create_intent, @@ -231,10 +230,3 @@ static const struct xfs_defer_op_type xfs_bmap_update_defer_type = { .finish_item = xfs_bmap_update_finish_item, .cancel_item = xfs_bmap_update_cancel_item, }; - -/* Register the deferred op type. */ -void -xfs_bmap_update_init_defer_op(void) -{ - xfs_defer_init_op_type(&xfs_bmap_update_defer_type); -} diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 629f1479c9d2..7d65ebf1e847 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -277,7 +277,7 @@ xfs_trans_read_buf_map( * release this buffer when it kills the tranaction. */ ASSERT(bp->b_ops != NULL); - error = xfs_buf_ensure_ops(bp, ops); + error = xfs_buf_reverify(bp, ops); if (error) { xfs_buf_ioerror_alert(bp, __func__); diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 855c0b651fd4..8ee7a3f8bb20 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -52,19 +52,20 @@ xfs_trans_get_efd(struct xfs_trans *tp, */ int xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len, - struct xfs_owner_info *oinfo, - bool skip_discard) + struct xfs_trans *tp, + struct xfs_efd_log_item *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len, + const struct xfs_owner_info *oinfo, + bool skip_discard) { - struct xfs_mount *mp = tp->t_mountp; - uint next_extent; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, start_block); - struct xfs_extent *extp; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, + start_block); + int error; trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); @@ -206,8 +207,7 @@ xfs_extent_free_cancel_item( kmem_free(free); } -static const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .type = XFS_DEFER_OPS_TYPE_FREE, +const struct xfs_defer_op_type xfs_extent_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, @@ -274,8 +274,7 @@ xfs_agfl_free_finish_item( /* sub-type with special handling for AGFL deferred frees */ -static const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .type = XFS_DEFER_OPS_TYPE_AGFL_FREE, +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, @@ -285,11 +284,3 @@ static const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .finish_item = xfs_agfl_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, }; - -/* Register the deferred op type. */ -void -xfs_extent_free_init_defer_op(void) -{ - xfs_defer_init_op_type(&xfs_extent_free_defer_type); - xfs_defer_init_op_type(&xfs_agfl_free_defer_type); -} diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c index 523c55663954..8d734728dd1b 100644 --- a/fs/xfs/xfs_trans_refcount.c +++ b/fs/xfs/xfs_trans_refcount.c @@ -227,8 +227,7 @@ xfs_refcount_update_cancel_item( kmem_free(refc); } -static const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .type = XFS_DEFER_OPS_TYPE_REFCOUNT, +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .max_items = XFS_CUI_MAX_FAST_EXTENTS, .diff_items = xfs_refcount_update_diff_items, .create_intent = xfs_refcount_update_create_intent, @@ -239,10 +238,3 @@ static const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .finish_cleanup = xfs_refcount_update_finish_cleanup, .cancel_item = xfs_refcount_update_cancel_item, }; - -/* Register the deferred op type. */ -void -xfs_refcount_update_init_defer_op(void) -{ - xfs_defer_init_op_type(&xfs_refcount_update_defer_type); -} diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 05b00e40251f..5c7936b1be13 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -244,8 +244,7 @@ xfs_rmap_update_cancel_item( kmem_free(rmap); } -static const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .type = XFS_DEFER_OPS_TYPE_RMAP, +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .max_items = XFS_RUI_MAX_FAST_EXTENTS, .diff_items = xfs_rmap_update_diff_items, .create_intent = xfs_rmap_update_create_intent, @@ -256,10 +255,3 @@ static const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .finish_cleanup = xfs_rmap_update_finish_cleanup, .cancel_item = xfs_rmap_update_cancel_item, }; - -/* Register the deferred op type. */ -void -xfs_rmap_update_init_defer_op(void) -{ - xfs_defer_init_op_type(&xfs_rmap_update_defer_type); -} diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 63ee1d5bf1d7..9a63016009a1 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -129,6 +129,9 @@ __xfs_xattr_put_listent( char *offset; int arraytop; + if (context->count < 0 || context->seen_enough) + return; + if (!context->alist) goto compute_size; |