Diffstat (limited to 'fs/xfs/libxfs')
42 files changed, 2387 insertions, 1504 deletions
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 1e4ee042d52f..3e920cf1b454 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -173,7 +173,6 @@ __xfs_free_perag(
 	struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
 
 	ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
-	ASSERT(atomic_read(&pag->pag_ref) == 0);
 	kmem_free(pag);
 }
 
@@ -192,7 +191,7 @@ xfs_free_perag(
 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
-		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
 
 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
 		xfs_iunlink_destroy(pag);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b52ed339727f..d3f2886fdc08 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2511,7 +2511,7 @@ __xfs_free_extent_later(
 	ASSERT(bno != NULLFSBLOCK);
 	ASSERT(len > 0);
-	ASSERT(len <= MAXEXTLEN);
+	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
 	ASSERT(!isnullstartblock(bno));
 	agno = XFS_FSB_TO_AGNO(mp, bno);
 	agbno = XFS_FSB_TO_AGBNO(mp, bno);
@@ -2777,7 +2777,7 @@ xfs_alloc_get_freelist(
 	xfs_agblock_t		bno;
 	__be32			*agfl_bno;
 	int			error;
-	int			logflags;
+	uint32_t		logflags;
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_perag	*pag;
 
@@ -2830,9 +2830,9 @@ xfs_alloc_get_freelist(
  */
 void
 xfs_alloc_log_agf(
-	xfs_trans_t	*tp,	/* transaction pointer */
-	struct xfs_buf	*bp,	/* buffer for a.g. freelist header */
-	int		fields)	/* mask of fields to be logged (XFS_AGF_...) */
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	uint32_t		fields)
 {
 	int	first;		/* first byte offset */
 	int	last;		/* last byte offset */
@@ -2902,7 +2902,7 @@ xfs_alloc_put_freelist(
 	struct xfs_perag	*pag;
 	__be32			*blockp;
 	int			error;
-	int			logflags;
+	uint32_t		logflags;
 	__be32			*agfl_bno;
 	int			startoff;
 
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d4c057b764f9..84ca09b2223f 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -121,7 +121,7 @@
 void	xfs_alloc_log_agf(
 	struct xfs_trans *tp,	/* transaction pointer */
 	struct xfs_buf	*bp,	/* buffer for a.g. freelist header */
-	int		fields);/* mask of fields to be logged (XFS_AGF_...) */
+	uint32_t	fields);/* mask of fields to be logged (XFS_AGF_...) */
 /*
  * Interface for inode allocation to force the pag data to be initialized.
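Several of the hunks above change logging field masks (the logflags locals and the fields argument to xfs_alloc_log_agf()) from int to uint32_t. As a general C-language note, not a quote of the patch's own rationale: keeping a bit mask in a fixed-width unsigned type avoids sign-extension surprises once the high bit is used. A minimal userspace sketch of that hazard follows; the EXAMPLE_FIELD_* names are hypothetical and are not XFS flags.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical field-mask bits; bit 31 is the interesting case. */
#define EXAMPLE_FIELD_LO	(1U << 0)
#define EXAMPLE_FIELD_HI	(1U << 31)

int main(void)
{
	uint32_t fields = EXAMPLE_FIELD_LO | EXAMPLE_FIELD_HI;

	/* An unsigned 32-bit mask widens to 64 bits with zero extension. */
	uint64_t wide_unsigned = fields;

	/*
	 * The same bits stored in a plain int become a negative value
	 * (implementation-defined conversion), and widening that value
	 * sign-extends bit 31 into the upper half.
	 */
	int signed_fields = (int)fields;
	uint64_t wide_signed = (uint64_t)(int64_t)signed_fields;

	printf("uint32_t mask widened: 0x%016llx\n",
	       (unsigned long long)wide_unsigned);
	printf("int mask widened:      0x%016llx\n",
	       (unsigned long long)wide_signed);
	return 0;
}

With an unsigned mask the widened value keeps only bits 0 and 31 set; with the signed copy the upper 32 bits come back all ones, which is the kind of implicit-conversion trap a fixed-width unsigned flags type sidesteps.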
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 23523b802539..836ab1b8ed7b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -24,6 +24,10 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" +#include "xfs_attr_item.h" +#include "xfs_xattr.h" + +struct kmem_cache *xfs_attr_intent_cache; /* * xfs_attr.c @@ -53,26 +57,22 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac); -STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, - struct xfs_da_state **state); -STATIC int xfs_attr_fillstate(xfs_da_state_t *state); -STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp); -STATIC int xfs_attr_node_removename(struct xfs_da_args *args, - struct xfs_da_state *state); +static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_lookup(struct xfs_da_args *args, + struct xfs_da_state *state); int xfs_inode_hasattr( struct xfs_inode *ip) { - if (!XFS_IFORK_Q(ip) || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0)) + if (!XFS_IFORK_Q(ip)) + return 0; + if (!ip->i_afp) + return 0; + if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0) return 0; return 1; } @@ -97,6 +97,123 @@ xfs_attr_is_leaf( return imap.br_startoff == 0 && imap.br_blockcount == 1; } +/* + * XXX (dchinner): name path state saving and refilling is an optimisation to + * avoid needing to look up name entries after rolling transactions removing + * remote xattr blocks between the name entry lookup and name entry removal. + * This optimisation got sidelined when combining the set and remove state + * machines, but the code has been left in place because it is worthwhile to + * restore the optimisation once the combined state machine paths have settled. + * + * This comment is a public service announcement to remind Future Dave that he + * still needs to restore this code to working order. + */ +#if 0 +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. + */ +static int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + trace_xfs_attr_fillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". 
+ */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = xfs_buf_daddr(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return 0; +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +static int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + trace_xfs_attr_refillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read_mapped(state->args->trans, + state->args->dp, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + return 0; +} +#else +static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; } +#endif + /*======================================================================== * Overall external interface routines. *========================================================================*/ @@ -166,7 +283,7 @@ xfs_attr_get( /* * Calculate how many blocks we need for the new attribute, */ -STATIC int +int xfs_attr_calc_size( struct xfs_da_args *args, int *local) @@ -199,6 +316,33 @@ xfs_attr_calc_size( return nblks; } +/* Initialize transaction reservation for attr operations */ +void +xfs_init_attr_trans( + struct xfs_da_args *args, + struct xfs_trans_res *tres, + unsigned int *total) +{ + struct xfs_mount *mp = args->dp->i_mount; + + if (args->value) { + tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; + *total = args->total; + } else { + *tres = M_RES(mp)->tr_attrrm; + *total = XFS_ATTRRM_SPACE_RES(mp); + } +} + +/* + * Add an attr to a shortform fork. If there is no space, + * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC. + * to use. 
+ */ STATIC int xfs_attr_try_sf_addname( struct xfs_inode *dp, @@ -230,411 +374,487 @@ xfs_attr_try_sf_addname( return error; } -/* - * Check to see if the attr should be upgraded from non-existent or shortform to - * single-leaf-block attribute list. - */ -static inline bool -xfs_attr_is_shortform( - struct xfs_inode *ip) +static int +xfs_attr_sf_addname( + struct xfs_attr_intent *attr) { - return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0); + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + int error = 0; + + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) { + ASSERT(!error || error == -EEXIST); + attr->xattri_dela_state = XFS_DAS_DONE; + goto out; + } + + /* + * It won't fit in the shortform, transform to a leaf block. GROT: + * another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp); + if (error) + return error; + + /* + * Prevent the leaf buffer from being unlocked so that a concurrent AIL + * push cannot grab the half-baked leaf buffer and run into problems + * with the write verifier. + */ + xfs_trans_bhold(args->trans, attr->xattri_leaf_bp); + attr->xattri_dela_state = XFS_DAS_LEAF_ADD; +out: + trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp); + return error; } /* - * Checks to see if a delayed attribute transaction should be rolled. If so, - * transaction is finished or rolled as needed. + * Handle the state change on completion of a multi-state attr operation. + * + * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first + * modification in a attr replace operation and we still have to do the second + * state, indicated by @replace_state. + * + * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on + * completion of the second half of the attr replace operation we correctly + * signal that it is done. */ -STATIC int -xfs_attr_trans_roll( - struct xfs_delattr_context *dac) +static enum xfs_delattr_state +xfs_attr_complete_op( + struct xfs_attr_intent *attr, + enum xfs_delattr_state replace_state) { - struct xfs_da_args *args = dac->da_args; - int error; + struct xfs_da_args *args = attr->xattri_da_args; + bool do_replace = args->op_flags & XFS_DA_OP_REPLACE; + + args->op_flags &= ~XFS_DA_OP_REPLACE; + if (do_replace) { + args->attr_filter &= ~XFS_ATTR_INCOMPLETE; + return replace_state; + } + return XFS_DAS_DONE; +} + +static int +xfs_attr_leaf_addname( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error; + + ASSERT(xfs_attr_is_leaf(args->dp)); + + /* + * Use the leaf buffer we may already hold locked as a result of + * a sf-to-leaf conversion. The held buffer is no longer valid + * after this call, regardless of the result. + */ + error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp); + attr->xattri_leaf_bp = NULL; + + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; - if (dac->flags & XFS_DAC_DEFER_FINISH) { /* - * The caller wants us to finish all the deferred ops so that we - * avoid pinning the log tail with a large number of deferred - * ops. + * We're not in leaf format anymore, so roll the transaction and + * retry the add to the newly allocated node block. 
*/ - dac->flags &= ~XFS_DAC_DEFER_FINISH; - error = xfs_defer_finish(&args->trans); - } else - error = xfs_trans_roll_inode(&args->trans, args->dp); + attr->xattri_dela_state = XFS_DAS_NODE_ADD; + goto out; + } + if (error) + return error; + /* + * We need to commit and roll if we need to allocate remote xattr blocks + * or perform more xattr manipulations. Otherwise there is nothing more + * to do and we can return success. + */ + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_LEAF_REPLACE); +out: + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); return error; } /* - * Set the attribute specified in @args. + * Add an entry to a node format attr tree. + * + * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell + * the difference between leaf + remote attr blocks and a node format tree, + * so we may still end up having to convert from leaf to node format here. */ -int -xfs_attr_set_args( - struct xfs_da_args *args) +static int +xfs_attr_node_addname( + struct xfs_attr_intent *attr) { - struct xfs_buf *leaf_bp = NULL; - int error = 0; - struct xfs_delattr_context dac = { - .da_args = args, - }; + struct xfs_da_args *args = attr->xattri_da_args; + int error; - do { - error = xfs_attr_set_iter(&dac, &leaf_bp); - if (error != -EAGAIN) - break; + ASSERT(!attr->xattri_leaf_bp); + + error = xfs_attr_node_addname_find_attr(attr); + if (error) + return error; - error = xfs_attr_trans_roll(&dac); - if (error) { - if (leaf_bp) - xfs_trans_brelse(args->trans, leaf_bp); + error = xfs_attr_node_try_addname(attr); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) return error; - } - } while (true); + /* + * No state change, we really are in node form now + * but we need the transaction rolled to continue. + */ + goto out; + } + if (error) + return error; + if (args->rmtblkno) + attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT; + else + attr->xattri_dela_state = xfs_attr_complete_op(attr, + XFS_DAS_NODE_REPLACE); +out: + trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp); return error; } -STATIC int -xfs_attr_sf_addname( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) +static int +xfs_attr_rmtval_alloc( + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_inode *dp = args->dp; + struct xfs_da_args *args = attr->xattri_da_args; int error = 0; /* - * Try to add the attr to the attribute list in the inode. + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. */ - error = xfs_attr_try_sf_addname(dp, args); + if (attr->xattri_blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(attr); + if (error) + return error; + /* Roll the transaction only if there is more to allocate. */ + if (attr->xattri_blkcnt > 0) + goto out; + } - /* Should only be 0, -EEXIST or -ENOSPC */ - if (error != -ENOSPC) + error = xfs_attr_rmtval_set_value(args); + if (error) return error; + attr->xattri_dela_state = xfs_attr_complete_op(attr, + ++attr->xattri_dela_state); /* - * It won't fit in the shortform, transform to a leaf block. GROT: - * another possible req'mt for a double-split btree op. 
+ * If we are not doing a rename, we've finished the operation but still + * have to clear the incomplete flag protecting the new attr from + * exposing partially initialised state if we crash during creation. */ - error = xfs_attr_shortform_to_leaf(args, leaf_bp); - if (error) - return error; + if (attr->xattri_dela_state == XFS_DAS_DONE) + error = xfs_attr3_leaf_clearflag(args); +out: + trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp); + return error; +} + +/* + * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers + * for later deletion of the entry. + */ +static int +xfs_attr_leaf_mark_incomplete( + struct xfs_da_args *args, + struct xfs_da_state *state) +{ + int error; /* - * Prevent the leaf buffer from being unlocked so that a concurrent AIL - * push cannot grab the half-baked leaf buffer and run into problems - * with the write verifier. + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. */ - xfs_trans_bhold(args->trans, *leaf_bp); + error = xfs_attr_fillstate(state); + if (error) + return error; /* - * We're still in XFS_DAS_UNINIT state here. We've converted - * the attr fork to leaf format and will restart with the leaf - * add. + * Mark the attribute as INCOMPLETE */ - trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp); - dac->flags |= XFS_DAC_DEFER_FINISH; - return -EAGAIN; + return xfs_attr3_leaf_setflag(args); +} + +/* Ensure the da state of an xattr deferred work item is ready to go. */ +static inline void +xfs_attr_item_init_da_state( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + + if (!attr->xattri_da_state) + attr->xattri_da_state = xfs_da_state_alloc(args); + else + xfs_da_state_reset(attr->xattri_da_state, args); } /* - * Set the attribute specified in @args. - * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - * returned. + * Initial setup for xfs_attr_node_removename. Make sure the attr is there and + * the blocks are valid. Attr keys with remote blocks will be marked + * incomplete. */ -int -xfs_attr_set_iter( - struct xfs_delattr_context *dac, - struct xfs_buf **leaf_bp) +static +int xfs_attr_node_removename_setup( + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_inode *dp = args->dp; - struct xfs_buf *bp = NULL; - int forkoff, error = 0; - - /* State machine switch */ - switch (dac->dela_state) { - case XFS_DAS_UNINIT: - /* - * If the fork is shortform, attempt to add the attr. If there - * is no space, this converts to leaf format and returns - * -EAGAIN with the leaf buffer held across the roll. The caller - * will deal with a transaction roll error, but otherwise - * release the hold once we return with a clean transaction. - */ - if (xfs_attr_is_shortform(dp)) - return xfs_attr_sf_addname(dac, leaf_bp); - if (*leaf_bp != NULL) { - xfs_trans_bhold_release(args->trans, *leaf_bp); - *leaf_bp = NULL; - } + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state; + int error; - if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_try_add(args, *leaf_bp); - if (error == -ENOSPC) { - error = xfs_attr3_leaf_to_node(args); - if (error) - return error; - - /* - * Finish any deferred work items and roll the - * transaction once more. 
The goal here is to - * call node_addname with the inode and - * transaction in the same state (inode locked - * and joined, transaction clean) no matter how - * we got to this step. - * - * At this point, we are still in - * XFS_DAS_UNINIT, but when we come back, we'll - * be a node, so we'll fall down into the node - * handling code below - */ - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } else if (error) { - return error; - } + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); + if (error != -EEXIST) + goto out; + error = 0; - dac->dela_state = XFS_DAS_FOUND_LBLK; - } else { - error = xfs_attr_node_addname_find_attr(dac); - if (error) - return error; + state = attr->xattri_da_state; + ASSERT(state->path.blk[state->path.active - 1].bp != NULL); + ASSERT(state->path.blk[state->path.active - 1].magic == + XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_node_addname(dac); - if (error) - return error; + error = xfs_attr_leaf_mark_incomplete(args, state); + if (error) + goto out; + if (args->rmtblkno > 0) + error = xfs_attr_rmtval_invalidate(args); +out: + if (error) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } - dac->dela_state = XFS_DAS_FOUND_NBLK; - } - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - case XFS_DAS_FOUND_LBLK: - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ + return error; +} - /* Open coded xfs_attr_rmtval_set without trans handling */ - if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) { - dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT; - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); - if (error) - return error; - } - } +/* + * Remove the original attr we have just replaced. This is dependent on the + * original lookup and insert placing the old attr in args->blkno/args->index + * and the new attr in args->blkno2/args->index2. + */ +static int +xfs_attr_leaf_remove_attr( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + int forkoff; + int error; - /* - * Repeat allocating remote blocks for the attr value until - * blkcnt drops to zero. - */ - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); - if (error) - return error; - trace_xfs_attr_set_iter_return(dac->dela_state, - args->dp); - return -EAGAIN; - } + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + &bp); + if (error) + return error; - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; + xfs_attr3_leaf_remove(bp, args); - /* - * If this is not a rename, clear the incomplete flag and we're - * done. - */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - return error; - } + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ - /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. 
- * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - /* - * Commit the flag value change and start the next trans in - * series. - */ - dac->dela_state = XFS_DAS_FLIP_LFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - case XFS_DAS_FLIP_LFLAG: - /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). - */ - xfs_attr_restore_rmt_blk(args); - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; + return error; +} - fallthrough; - case XFS_DAS_RM_LBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_LBLK; - if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - if (error) - return error; +/* + * Shrink an attribute from leaf to shortform. Used by the node format remove + * path when the node format collapses to a single block and so we have to check + * if it can be collapsed further. + */ +static int +xfs_attr_leaf_shrink( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp; + int forkoff; + int error; - dac->dela_state = XFS_DAS_RD_LEAF; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - } + if (!xfs_attr_is_leaf(dp)) + return 0; - fallthrough; - case XFS_DAS_RD_LEAF: - /* - * This is the last step for leaf format. Read the block with - * the old attr, remove the old attr, check for shortform - * conversion and return. - */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); - if (error) - return error; + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; - xfs_attr3_leaf_remove(bp, args); + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) { + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + } else { + xfs_trans_brelse(args->trans, bp); + } - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ + return error; +} - return error; +/* + * Run the attribute operation specified in @attr. + * + * This routine is meant to function as a delayed operation and will set the + * state to XFS_DAS_DONE when the operation is complete. Calling functions will + * need to handle this, and recall the function until either an error or + * XFS_DAS_DONE is detected. + */ +int +xfs_attr_set_iter( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + int error = 0; - case XFS_DAS_FOUND_NBLK: - /* - * Find space for remote blocks and fall into the allocation - * state. 
- */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_find_space(dac); - if (error) - return error; + /* State machine switch */ +next_state: + switch (attr->xattri_dela_state) { + case XFS_DAS_UNINIT: + ASSERT(0); + return -EFSCORRUPTED; + case XFS_DAS_SF_ADD: + return xfs_attr_sf_addname(attr); + case XFS_DAS_LEAF_ADD: + return xfs_attr_leaf_addname(attr); + case XFS_DAS_NODE_ADD: + return xfs_attr_node_addname(attr); + + case XFS_DAS_SF_REMOVE: + error = xfs_attr_sf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_LEAF_REMOVE: + error = xfs_attr_leaf_removename(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + case XFS_DAS_NODE_REMOVE: + error = xfs_attr_node_removename_setup(attr); + if (error == -ENOATTR && + (args->op_flags & XFS_DA_OP_RECOVERY)) { + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + error = 0; + break; } + if (error) + return error; + attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT; + if (args->rmtblkno == 0) + attr->xattri_dela_state++; + break; + case XFS_DAS_LEAF_SET_RMT: + case XFS_DAS_NODE_SET_RMT: + error = xfs_attr_rmtval_find_space(attr); + if (error) + return error; + attr->xattri_dela_state++; fallthrough; - case XFS_DAS_ALLOC_NODE: - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - dac->dela_state = XFS_DAS_ALLOC_NODE; - if (args->rmtblkno > 0) { - if (dac->blkcnt > 0) { - error = xfs_attr_rmtval_set_blk(dac); - if (error) - return error; - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } - - error = xfs_attr_rmtval_set_value(args); - if (error) - return error; - } - /* - * If this was not a rename, clear the incomplete flag and we're - * done. - */ - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - goto out; - } + case XFS_DAS_LEAF_ALLOC_RMT: + case XFS_DAS_NODE_ALLOC_RMT: + error = xfs_attr_rmtval_alloc(attr); + if (error) + return error; + if (attr->xattri_dela_state == XFS_DAS_DONE) + break; + goto next_state; + case XFS_DAS_LEAF_REPLACE: + case XFS_DAS_NODE_REPLACE: /* - * If this is an atomic rename operation, we must "flip" the - * incomplete flags on the "new" and "old" attribute/value pairs - * so that one disappears and one appears atomically. Then we - * must remove the "old" attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the - * "old" attr and clear the incomplete flag on the "new" attr. + * We must "flip" the incomplete flags on the "new" and "old" + * attribute/value pairs so that one disappears and one appears + * atomically. */ error = xfs_attr3_leaf_flipflags(args); if (error) - goto out; + return error; /* - * Commit the flag value change and start the next trans in - * series + * We must commit the flag value change now to make it atomic + * and then we can start the next trans in series at REMOVE_OLD. 
*/ - dac->dela_state = XFS_DAS_FLIP_NFLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + attr->xattri_dela_state++; + break; - case XFS_DAS_FLIP_NFLAG: + case XFS_DAS_LEAF_REMOVE_OLD: + case XFS_DAS_NODE_REMOVE_OLD: /* - * Dismantle the "old" attribute/value pair by removing a - * "remote" value (if it exists). + * If we have a remote attr, start the process of removing it + * by invalidating any cached buffers. + * + * If we don't have a remote attr, we skip the remote block + * removal state altogether with a second state increment. */ xfs_attr_restore_rmt_blk(args); - - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - fallthrough; - case XFS_DAS_RM_NBLK: - /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ - dac->dela_state = XFS_DAS_RM_NBLK; if (args->rmtblkno) { - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) - trace_xfs_attr_set_iter_return( - dac->dela_state, args->dp); - + error = xfs_attr_rmtval_invalidate(args); if (error) return error; + } else { + attr->xattri_dela_state++; + } - dac->dela_state = XFS_DAS_CLR_FLAG; - trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); - return -EAGAIN; + attr->xattri_dela_state++; + goto next_state; + + case XFS_DAS_LEAF_REMOVE_RMT: + case XFS_DAS_NODE_REMOVE_RMT: + error = xfs_attr_rmtval_remove(attr); + if (error == -EAGAIN) { + error = 0; + break; } + if (error) + return error; - fallthrough; - case XFS_DAS_CLR_FLAG: /* - * The last state for node format. Look up the old attr and - * remove it. + * We've finished removing the remote attr blocks, so commit the + * transaction and move on to removing the attr name from the + * leaf/node block. Removing the attr might require a full + * transaction reservation for btree block freeing, so we + * can't do that in the same transaction where we removed the + * remote attr blocks. */ - error = xfs_attr_node_addname_clear_incomplete(dac); + attr->xattri_dela_state++; + break; + + case XFS_DAS_LEAF_REMOVE_ATTR: + error = xfs_attr_leaf_remove_attr(attr); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); + break; + + case XFS_DAS_NODE_REMOVE_ATTR: + error = xfs_attr_node_remove_attr(attr); + if (!error) + error = xfs_attr_leaf_shrink(args); + attr->xattri_dela_state = xfs_attr_complete_op(attr, + xfs_attr_init_add_state(args)); break; default: ASSERT(0); break; } -out: + + trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp); return error; } @@ -648,6 +868,7 @@ xfs_attr_lookup( { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; + struct xfs_da_state *state; int error; if (!xfs_inode_hasattr(dp)) @@ -665,33 +886,85 @@ xfs_attr_lookup( return error; } - return xfs_attr_node_hasname(args, NULL); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); + xfs_da_state_free(state); + return error; +} + +static int +xfs_attr_intent_init( + struct xfs_da_args *args, + unsigned int op_flags, /* op flag (set or remove) */ + struct xfs_attr_intent **attr) /* new xfs_attr_intent */ +{ + + struct xfs_attr_intent *new; + + new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); + new->xattri_op_flags = op_flags; + new->xattri_da_args = args; + + *attr = new; + return 0; } -/* - * Remove the attribute specified in @args. 
- */ -int -xfs_attr_remove_args( +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_add( struct xfs_da_args *args) { - int error; - struct xfs_delattr_context dac = { - .da_args = args, - }; + struct xfs_attr_intent *new; + int error = 0; - do { - error = xfs_attr_remove_iter(&dac); - if (error != -EAGAIN) - break; + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); + if (error) + return error; - error = xfs_attr_trans_roll(&dac); - if (error) - return error; + new->xattri_dela_state = xfs_attr_init_add_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - } while (true); + return 0; +} - return error; +/* Sets an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_replace( + struct xfs_da_args *args) +{ + struct xfs_attr_intent *new; + int error = 0; + + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); + if (error) + return error; + + new->xattri_dela_state = xfs_attr_init_replace_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); + + return 0; +} + +/* Removes an attribute for an inode as a deferred operation */ +static int +xfs_attr_defer_remove( + struct xfs_da_args *args) +{ + + struct xfs_attr_intent *new; + int error; + + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); + if (error) + return error; + + new->xattri_dela_state = xfs_attr_init_remove_state(args); + xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); + + return 0; } /* @@ -730,8 +1003,6 @@ xfs_attr_set( if (args->value) { XFS_STATS_INC(mp, xs_attr_set); - - args->op_flags |= XFS_DA_OP_ADDNAME; args->total = xfs_attr_calc_size(args, &local); /* @@ -748,20 +1019,10 @@ xfs_attr_set( return error; } - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - total = args->total; - if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); } else { XFS_STATS_INC(mp, xs_attr_remove); - - tres = M_RES(mp)->tr_attrrm; - total = XFS_ATTRRM_SPACE_RES(mp); rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } @@ -769,6 +1030,7 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ + xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; @@ -776,33 +1038,43 @@ xfs_attr_set( if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(args->trans, dp, + XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); if (error) goto out_trans_cancel; } error = xfs_attr_lookup(args); - if (args->value) { - if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) - goto out_trans_cancel; - if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_trans_cancel; - if (error != -ENOATTR && error != -EEXIST) + switch (error) { + case -EEXIST: + /* if no value, we are performing a remove operation */ + if (!args->value) { + error = xfs_attr_defer_remove(args); + break; + } + /* Pure create fails if the attr already exists */ + if (args->attr_flags 
& XATTR_CREATE) goto out_trans_cancel; - error = xfs_attr_set_args(args); - if (error) - goto out_trans_cancel; - /* shortform attribute has already been committed */ - if (!args->trans) - goto out_unlock; - } else { - if (error != -EEXIST) + error = xfs_attr_defer_replace(args); + break; + case -ENOATTR: + /* Can't remove what isn't there. */ + if (!args->value) goto out_trans_cancel; - error = xfs_attr_remove_args(args); - if (error) + /* Pure replace fails if no existing attr to replace. */ + if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; + + error = xfs_attr_defer_add(args); + break; + default: + goto out_trans_cancel; } + if (error) + goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the @@ -845,28 +1117,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) * Add a name to the shortform attribute list structure * This is the external routine. */ -STATIC int -xfs_attr_shortform_addname(xfs_da_args_t *args) +static int +xfs_attr_shortform_addname( + struct xfs_da_args *args) { - int newsize, forkoff, retval; + int newsize, forkoff; + int error; trace_xfs_attr_sf_addname(args); - retval = xfs_attr_shortform_lookup(args); - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - return retval; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) - return retval; - retval = xfs_attr_sf_removename(args); - if (retval) - return retval; + error = xfs_attr_shortform_lookup(args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + return error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) + return error; + + error = xfs_attr_sf_removename(args); + if (error) + return error; + /* - * Since we have removed the old attr, clear ATTR_REPLACE so - * that the leaf format add routine won't trip over the attr - * not being around. + * Since we have removed the old attr, clear XFS_DA_OP_REPLACE + * so that the new attr doesn't fit in shortform format, the + * leaf format add routine won't trip over the attr not being + * around. */ - args->attr_flags &= ~XATTR_REPLACE; + args->op_flags &= ~XFS_DA_OP_REPLACE; + break; + case 0: + break; + default: + return error; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || @@ -889,8 +1174,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * External routines when attribute list is one block *========================================================================*/ -/* Store info about a remote block */ -STATIC void +/* Save the current remote block info and clear the current pointers. */ +static void xfs_attr_save_rmt_blk( struct xfs_da_args *args) { @@ -899,10 +1184,13 @@ xfs_attr_save_rmt_blk( args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; args->rmtvaluelen2 = args->rmtvaluelen; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* Set stored info about a remote block */ -STATIC void +static void xfs_attr_restore_rmt_blk( struct xfs_da_args *args) { @@ -928,45 +1216,54 @@ xfs_attr_leaf_try_add( struct xfs_da_args *args, struct xfs_buf *bp) { - int retval; + int error; /* - * Look up the given attribute in the leaf block. Figure out if - * the given flags produce an error or call for an atomic rename. + * If the caller provided a buffer to us, it is locked and held in + * the transaction because it just did a shortform to leaf conversion. + * Hence we don't need to read it again. Otherwise read in the leaf + * buffer. 
*/ - retval = xfs_attr_leaf_hasname(args, &bp); - if (retval != -ENOATTR && retval != -EEXIST) - return retval; - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out_brelse; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + if (bp) { + xfs_trans_bhold_release(args->trans, bp); + } else { + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); + if (error) + return error; + } + + /* + * Look up the xattr name to set the insertion point for the new xattr. + */ + error = xfs_attr3_leaf_lookup_int(bp, args); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto out_brelse; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto out_brelse; trace_xfs_attr_leaf_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ - xfs_attr_save_rmt_blk(args); - /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto out_brelse; } - /* - * Add the attribute to the leaf block - */ return xfs_attr3_leaf_add(bp, args); out_brelse: xfs_trans_brelse(args->trans, bp); - return retval; + return error; } /* @@ -1012,9 +1309,10 @@ xfs_attr_leaf_removename( dp = args->dp; error = xfs_attr_leaf_hasname(args, &bp); - if (error == -ENOATTR) { xfs_trans_brelse(args->trans, bp); + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; return error; } else if (error != -EEXIST) return error; @@ -1062,32 +1360,20 @@ xfs_attr_leaf_get(xfs_da_args_t *args) return error; } -/* - * Return EEXIST if attr is found, or ENOATTR if not - * statep: If not null is set to point at the found state. Caller will - * be responsible for freeing the state in this case. - */ +/* Return EEXIST if attr is found, or ENOATTR if not. */ STATIC int -xfs_attr_node_hasname( +xfs_attr_node_lookup( struct xfs_da_args *args, - struct xfs_da_state **statep) + struct xfs_da_state *state) { - struct xfs_da_state *state; int retval, error; - state = xfs_da_state_alloc(args); - if (statep != NULL) - *statep = state; - /* * Search to see if name exists, and get back a pointer to it. */ error = xfs_da3_node_lookup_int(state, &retval); if (error) - retval = error; - - if (!statep) - xfs_da_state_free(state); + return error; return retval; } @@ -1098,46 +1384,48 @@ xfs_attr_node_hasname( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - int retval; + struct xfs_da_args *args = attr->xattri_da_args; + int error; /* * Search to see if name already exists, and get back a pointer * to where it should go. 
*/ - retval = xfs_attr_node_hasname(args, &dac->da_state); - if (retval != -ENOATTR && retval != -EEXIST) - goto error; - - if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto error; - if (retval == -EEXIST) { - if (args->attr_flags & XATTR_CREATE) + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); + switch (error) { + case -ENOATTR: + if (args->op_flags & XFS_DA_OP_REPLACE) + goto error; + break; + case -EEXIST: + if (!(args->op_flags & XFS_DA_OP_REPLACE)) goto error; - trace_xfs_attr_node_replace(args); - - /* save the attribute state for later removal*/ - args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ - xfs_attr_save_rmt_blk(args); + trace_xfs_attr_node_replace(args); /* - * clear the remote attr state now that it is saved so that the - * values reflect the state of the attribute we are about to + * Save the existing remote attr state so that the current + * values reflect the state of the new attribute we are about to * add, not the attribute we just found and will remove later. */ - args->rmtblkno = 0; - args->rmtblkcnt = 0; - args->rmtvaluelen = 0; + xfs_attr_save_rmt_blk(args); + break; + case 0: + break; + default: + goto error; } return 0; error: - if (dac->da_state) - xfs_da_state_free(dac->da_state); - return retval; + if (attr->xattri_da_state) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } + return error; } /* @@ -1146,21 +1434,13 @@ error: * This will involve walking down the Btree, and may involve splitting * leaf nodes and even splitting intermediate nodes up to and including * the root node (a special case of an intermediate node). - * - * "Remote" attribute values confuse the issue and atomic rename operations - * add a whole extra layer of confusion on top of that. - * - * This routine is meant to function as a delayed operation, and may return - * -EAGAIN when the transaction needs to be rolled. Calling functions will need - * to handle this, and recall the function until a successful error code is - *returned. */ -STATIC int -xfs_attr_node_addname( - struct xfs_delattr_context *dac) +static int +xfs_attr_node_try_addname( + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state = attr->xattri_da_state; struct xfs_da_state_blk *blk; int error; @@ -1175,25 +1455,9 @@ xfs_attr_node_addname( /* * Its really a single leaf node, but it had * out-of-line values so it looked like it *might* - * have been a b-tree. - */ - xfs_da_state_free(state); - state = NULL; - error = xfs_attr3_leaf_to_node(args); - if (error) - goto out; - - /* - * Now that we have converted the leaf to a node, we can - * roll the transaction, and try xfs_attr3_leaf_add - * again on re-entry. No need to set dela_state to do - * this. dela_state is still unset by this function at - * this point. + * have been a b-tree. Let the caller deal with this. */ - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_node_addname_return( - dac->dela_state, args->dp); - return -EAGAIN; + goto out; } /* @@ -1205,7 +1469,6 @@ xfs_attr_node_addname( error = xfs_da3_split(state); if (error) goto out; - dac->flags |= XFS_DAC_DEFER_FINISH; } else { /* * Addition succeeded, update Btree hashvals. 
@@ -1214,148 +1477,12 @@ xfs_attr_node_addname( } out: - if (state) - xfs_da_state_free(state); - return error; -} - - -STATIC int -xfs_attr_node_addname_clear_incomplete( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = NULL; - int retval = 0; - int error = 0; - - /* - * Re-find the "old" attribute entry after any split ops. The INCOMPLETE - * flag means that we will find the "old" attr, not the "new" one. - */ - args->attr_filter |= XFS_ATTR_INCOMPLETE; - state = xfs_da_state_alloc(args); - state->inleaf = 0; - error = xfs_da3_node_lookup_int(state, &retval); - if (error) - goto out; - - error = xfs_attr_node_removename(args, state); - - /* - * Check to see if the tree needs to be collapsed. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - } - retval = error = 0; - -out: - if (state) - xfs_da_state_free(state); - if (error) - return error; - return retval; -} - -/* - * Shrink an attribute from leaf to shortform - */ -STATIC int -xfs_attr_node_shrink( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - struct xfs_inode *dp = args->dp; - int error, forkoff; - struct xfs_buf *bp; - - /* - * Have to get rid of the copy of this dabuf in the state. - */ - ASSERT(state->path.active == 1); - ASSERT(state->path.blk[0].bp); - state->path.blk[0].bp = NULL; - - error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); - if (error) - return error; - - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) { - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - } else - xfs_trans_brelse(args->trans, bp); - - return error; -} - -/* - * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers - * for later deletion of the entry. - */ -STATIC int -xfs_attr_leaf_mark_incomplete( - struct xfs_da_args *args, - struct xfs_da_state *state) -{ - int error; - - /* - * Fill in disk block numbers in the state structure - * so that we can get the buffers back after we commit - * several transactions in the following calls. - */ - error = xfs_attr_fillstate(state); - if (error) - return error; - - /* - * Mark the attribute as INCOMPLETE - */ - return xfs_attr3_leaf_setflag(args); -} - -/* - * Initial setup for xfs_attr_node_removename. Make sure the attr is there and - * the blocks are valid. Attr keys with remote blocks will be marked - * incomplete. - */ -STATIC -int xfs_attr_node_removename_setup( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state **state = &dac->da_state; - int error; - - error = xfs_attr_node_hasname(args, state); - if (error != -EEXIST) - goto out; - error = 0; - - ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); - ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == - XFS_ATTR_LEAF_MAGIC); - - if (args->rmtblkno > 0) { - error = xfs_attr_leaf_mark_incomplete(args, *state); - if (error) - goto out; - - error = xfs_attr_rmtval_invalidate(args); - } -out: - if (error) - xfs_da_state_free(*state); - + xfs_da_state_free(state); + attr->xattri_da_state = NULL; return error; } -STATIC int +static int xfs_attr_node_removename( struct xfs_da_args *args, struct xfs_da_state *state) @@ -1374,246 +1501,42 @@ xfs_attr_node_removename( return retval; } -/* - * Remove the attribute specified in @args. 
- * - * This will involve walking down the Btree, and may involve joining - * leaf nodes and even joining intermediate nodes up to and including - * the root node (a special case of an intermediate node). - * - * This routine is meant to function as either an in-line or delayed operation, - * and may return -EAGAIN when the transaction needs to be rolled. Calling - * functions will need to handle this, and call the function until a - * successful error code is returned. - */ -int -xfs_attr_remove_iter( - struct xfs_delattr_context *dac) -{ - struct xfs_da_args *args = dac->da_args; - struct xfs_da_state *state = dac->da_state; - int retval, error = 0; - struct xfs_inode *dp = args->dp; - - trace_xfs_attr_node_removename(args); - - switch (dac->dela_state) { - case XFS_DAS_UNINIT: - if (!xfs_inode_hasattr(dp)) - return -ENOATTR; - - /* - * Shortform or leaf formats don't require transaction rolls and - * thus state transitions. Call the right helper and return. - */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_removename(args); - - if (xfs_attr_is_leaf(dp)) - return xfs_attr_leaf_removename(args); - - /* - * Node format may require transaction rolls. Set up the - * state context and fall into the state machine. - */ - if (!dac->da_state) { - error = xfs_attr_node_removename_setup(dac); - if (error) - return error; - state = dac->da_state; - } - - fallthrough; - case XFS_DAS_RMTBLK: - dac->dela_state = XFS_DAS_RMTBLK; - - /* - * If there is an out-of-line value, de-allocate the blocks. - * This is done before we remove the attribute so that we don't - * overflow the maximum size of a transaction and/or hit a - * deadlock. - */ - if (args->rmtblkno > 0) { - /* - * May return -EAGAIN. Roll and repeat until all remote - * blocks are removed. - */ - error = xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) { - trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); - return error; - } else if (error) { - goto out; - } - - /* - * Refill the state structure with buffers (the prior - * calls released our buffers) and close out this - * transaction before proceeding. - */ - ASSERT(args->rmtblkno == 0); - error = xfs_attr_refillstate(state); - if (error) - goto out; - dac->dela_state = XFS_DAS_RM_NAME; - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_NAME: - /* - * If we came here fresh from a transaction roll, reattach all - * the buffers to the current transaction. - */ - if (dac->dela_state == XFS_DAS_RM_NAME) { - error = xfs_attr_refillstate(state); - if (error) - goto out; - } - - retval = xfs_attr_node_removename(args, state); - - /* - * Check to see if the tree needs to be collapsed. If so, roll - * the transacton and fall into the shrink state. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - - dac->flags |= XFS_DAC_DEFER_FINISH; - dac->dela_state = XFS_DAS_RM_SHRINK; - trace_xfs_attr_remove_iter_return( - dac->dela_state, args->dp); - return -EAGAIN; - } - - fallthrough; - case XFS_DAS_RM_SHRINK: - /* - * If the result is small enough, push it all into the inode. - * This is our final state so it's safe to return a dirty - * transaction. 
- */ - if (xfs_attr_is_leaf(dp)) - error = xfs_attr_node_shrink(args, state); - ASSERT(error != -EAGAIN); - break; - default: - ASSERT(0); - error = -EINVAL; - goto out; - } -out: - if (state) - xfs_da_state_free(state); - return error; -} - -/* - * Fill in the disk block numbers in the state structure for the buffers - * that are attached to the state structure. - * This is done so that we can quickly reattach ourselves to those buffers - * after some set of transaction commits have released these buffers. - */ -STATIC int -xfs_attr_fillstate(xfs_da_state_t *state) +static int +xfs_attr_node_remove_attr( + struct xfs_attr_intent *attr) { - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level; - - trace_xfs_attr_fillstate(state->args); - - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_da_state *state = xfs_da_state_alloc(args); + int retval = 0; + int error = 0; /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". + * The attr we are removing has already been marked incomplete, so + * we need to set the filter appropriately to re-find the "old" + * attribute entry after any split ops. */ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->bp) { - blk->disk_blkno = xfs_buf_daddr(blk->bp); - blk->bp = NULL; - } else { - blk->disk_blkno = 0; - } - } - - return 0; -} - -/* - * Reattach the buffers to the state structure based on the disk block - * numbers stored in the state structure. - * This is done after some set of transaction commits have released those - * buffers from our grip. - */ -STATIC int -xfs_attr_refillstate(xfs_da_state_t *state) -{ - xfs_da_state_path_t *path; - xfs_da_state_blk_t *blk; - int level, error; - - trace_xfs_attr_refillstate(state->args); + args->attr_filter |= XFS_ATTR_INCOMPLETE; + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + goto out; - /* - * Roll down the "path" in the state structure, storing the on-disk - * block number for those buffers in the "path". - */ - path = &state->path; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } - } + error = xfs_attr_node_removename(args, state); /* - * Roll down the "altpath" in the state structure, storing the on-disk - * block number for those buffers in the "altpath". + * Check to see if the tree needs to be collapsed. 
*/ - path = &state->altpath; - ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); - for (blk = path->blk, level = 0; level < path->active; blk++, level++) { - if (blk->disk_blkno) { - error = xfs_da3_node_read_mapped(state->args->trans, - state->args->dp, blk->disk_blkno, - &blk->bp, XFS_ATTR_FORK); - if (error) - return error; - } else { - blk->bp = NULL; - } + if (retval && (state->path.active > 1)) { + error = xfs_da3_join(state); + if (error) + goto out; } + retval = error = 0; - return 0; +out: + xfs_da_state_free(state); + if (error) + return error; + return retval; } /* @@ -1639,7 +1562,8 @@ xfs_attr_node_get( /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_attr_node_hasname(args, &state); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); if (error != -EEXIST) goto out_release; @@ -1658,8 +1582,7 @@ out_release: state->path.blk[i].bp = NULL; } - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); return error; } @@ -1679,3 +1602,20 @@ xfs_attr_namecheck( /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } + +int __init +xfs_attr_intent_init_cache(void) +{ + xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent", + sizeof(struct xfs_attr_intent), + 0, 0, NULL); + + return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attr_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attr_intent_cache); + xfs_attr_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 5e71f719bdd5..e329da3e7afa 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -28,6 +28,16 @@ struct xfs_attr_list_context; */ #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ +static inline bool xfs_has_larp(struct xfs_mount *mp) +{ +#ifdef DEBUG + /* Logged xattrs require a V5 super for log_incompat */ + return xfs_has_crc(mp) && xfs_globals.larp; +#else + return false; +#endif +} + /* * Kernel-internal version of the attrlist cursor. */ @@ -425,7 +435,7 @@ struct xfs_attr_list_context { */ /* - * Enum values for xfs_delattr_context.da_state + * Enum values for xfs_attr_intent.xattri_da_state * * These values are used by delayed attribute operations to keep track of where * they were before they returned -EAGAIN. A return code of -EAGAIN signals the @@ -434,46 +444,107 @@ struct xfs_attr_list_context { * to where it was and resume executing where it left off. */ enum xfs_delattr_state { - XFS_DAS_UNINIT = 0, /* No state has been set yet */ - XFS_DAS_RMTBLK, /* Removing remote blks */ - XFS_DAS_RM_NAME, /* Remove attr name */ - XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ - XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ - XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ - XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ - XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ - XFS_DAS_RD_LEAF, /* Read in the new leaf */ - XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */ - XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ - XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ - XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ + XFS_DAS_UNINIT = 0, /* No state has been set yet */ + + /* + * Initial sequence states. The replace setup code relies on the + * ADD and REMOVE states for a specific format to be sequential so + * that we can transform the initial operation to be performed + * according to the xfs_has_larp() state easily. 
+ */ + XFS_DAS_SF_ADD, /* Initial sf add state */ + XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */ + + XFS_DAS_LEAF_ADD, /* Initial leaf add state */ + XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */ + + XFS_DAS_NODE_ADD, /* Initial node add state */ + XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */ + + /* Leaf state set/replace/remove sequence */ + XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */ + XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */ + XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */ + XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */ + + /* Node state sequence, must match leaf state above */ + XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */ + XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */ + XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */ + XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */ + XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */ + XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */ + + XFS_DAS_DONE, /* finished operation */ }; -/* - * Defines for xfs_delattr_context.flags - */ -#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */ -#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/ +#define XFS_DAS_STRINGS \ + { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \ + { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \ + { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \ + { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \ + { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \ + { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \ + { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \ + { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \ + { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \ + { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \ + { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \ + { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \ + { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \ + { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \ + { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \ + { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \ + { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \ + { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \ + { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ + { XFS_DAS_DONE, "XFS_DAS_DONE" } + +struct xfs_attri_log_nameval; /* * Context used for keeping track of delayed attribute operations */ -struct xfs_delattr_context { - struct xfs_da_args *da_args; - - /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - int blkcnt; +struct xfs_attr_intent { + /* + * used to log this item to an intent containing a list of attrs to + * commit later + */ + struct list_head xattri_list; /* Used in xfs_attr_node_removename to roll through removing blocks */ - struct xfs_da_state *da_state; + struct xfs_da_state *xattri_da_state; + + struct xfs_da_args *xattri_da_args; + + /* + * Shared buffer containing the attr name and value so that the logging + * code can share large memory buffers between log items. 
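/*
 * Illustrative sketch (not part of the patch): how a caller drives a resumable
 * state machine like the one behind xattri_dela_state. The worker records the
 * state it stopped in and returns -EAGAIN; the caller rolls its transaction
 * and calls back in, and the worker resumes from the saved state. The states,
 * do_step() and roll_transaction() below are simplified stand-ins.
 */
#include <errno.h>

enum step_state { STEP_INIT = 0, STEP_ALLOC, STEP_WRITE, STEP_DONE };

struct step_ctx {
	enum step_state	state;		/* where we resume after -EAGAIN */
};

static int do_step(struct step_ctx *ctx)
{
	switch (ctx->state) {
	case STEP_INIT:
		ctx->state = STEP_ALLOC;
		return -EAGAIN;		/* need a fresh transaction */
	case STEP_ALLOC:
		ctx->state = STEP_WRITE;
		return -EAGAIN;
	case STEP_WRITE:
		ctx->state = STEP_DONE;
		return 0;
	default:
		return 0;
	}
}

static int roll_transaction(void) { return 0; }	/* stub */

static int run_to_completion(struct step_ctx *ctx)
{
	int error;

	while ((error = do_step(ctx)) == -EAGAIN) {
		error = roll_transaction();
		if (error)
			return error;
	}
	return error;
}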
+ */ + struct xfs_attri_log_nameval *xattri_nameval; + + /* + * Used by xfs_attr_set to hold a leaf buffer across a transaction roll + */ + struct xfs_buf *xattri_leaf_bp; /* Used to keep track of current state of delayed operation */ - unsigned int flags; - enum xfs_delattr_state dela_state; + enum xfs_delattr_state xattri_dela_state; + + /* + * Attr operation being performed - XFS_ATTRI_OP_FLAGS_* + */ + unsigned int xattri_op_flags; + + /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ + xfs_dablk_t xattri_lblkno; + int xattri_blkcnt; + struct xfs_bmbt_irec xattri_map; }; + /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -489,11 +560,77 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); -int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove_args(struct xfs_da_args *args); -int xfs_attr_remove_iter(struct xfs_delattr_context *dac); +int xfs_attr_set_iter(struct xfs_attr_intent *attr); +int xfs_attr_remove_iter(struct xfs_attr_intent *attr); bool xfs_attr_namecheck(const void *name, size_t length); -void xfs_delattr_context_init(struct xfs_delattr_context *dac, - struct xfs_da_args *args); +int xfs_attr_calc_size(struct xfs_da_args *args, int *local); +void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, + unsigned int *total); + +/* + * Check to see if the attr should be upgraded from non-existent or shortform to + * single-leaf-block attribute list. + */ +static inline bool +xfs_attr_is_shortform( + struct xfs_inode *ip) +{ + return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0); +} + +static inline enum xfs_delattr_state +xfs_attr_init_add_state(struct xfs_da_args *args) +{ + /* + * When called from the completion of a attr remove to determine the + * next state, the attribute fork may be null. This can occur only occur + * on a pure remove, but we grab the next state before we check if a + * replace operation is being performed. If we are called from any other + * context, i_afp is guaranteed to exist. Hence if the attr fork is + * null, we were called from a pure remove operation and so we are done. + */ + if (!args->dp->i_afp) + return XFS_DAS_DONE; + + args->op_flags |= XFS_DA_OP_ADDNAME; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_ADD; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_ADD; + return XFS_DAS_NODE_ADD; +} + +static inline enum xfs_delattr_state +xfs_attr_init_remove_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_REMOVE; + if (xfs_attr_is_shortform(args->dp)) + return XFS_DAS_SF_REMOVE; + if (xfs_attr_is_leaf(args->dp)) + return XFS_DAS_LEAF_REMOVE; + return XFS_DAS_NODE_REMOVE; +} + +/* + * If we are logging the attributes, then we have to start with removal of the + * old attribute so that there is always consistent state that we can recover + * from if the system goes down part way through. We always log the new attr + * value, so even when we remove the attr first we still have the information in + * the log to finish the replace operation atomically. 
+ */ +static inline enum xfs_delattr_state +xfs_attr_init_replace_state(struct xfs_da_args *args) +{ + args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE; + if (xfs_has_larp(args->dp->i_mount)) + return xfs_attr_init_remove_state(args); + return xfs_attr_init_add_state(args); +} + +extern struct kmem_cache *xfs_attr_intent_cache; +int __init xfs_attr_intent_init_cache(void); +void xfs_attr_intent_destroy_cache(void); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 014daa8c542d..15a990409463 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -28,6 +28,7 @@ #include "xfs_dir2.h" #include "xfs_log.h" #include "xfs_ag.h" +#include "xfs_errortag.h" /* @@ -310,6 +311,15 @@ xfs_attr3_leaf_verify( return fa; /* + * Empty leaf blocks should never occur; they imply the existence of a + * software bug that needs fixing. xfs_repair also flags them as a + * corruption that needs fixing, so we should never let these go to + * disk. + */ + if (ichdr.count == 0) + return __this_address; + + /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -445,6 +455,14 @@ xfs_attr3_leaf_read( * Namespace helper routines *========================================================================*/ +/* + * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE + * flag on disk - if there's an incomplete attr then recovery needs to tear it + * down. If there's no incomplete attr, then recovery needs to tear that attr + * down to replace it with the attr that has been logged. In this case, the + * INCOMPLETE flag will not be set in attr->attr_filter, but rather + * XFS_DA_OP_RECOVERY will be set in args->op_flags. + */ static bool xfs_attr_match( struct xfs_da_args *args, @@ -452,14 +470,18 @@ xfs_attr_match( unsigned char *name, int flags) { + if (args->namelen != namelen) return false; if (memcmp(args->name, name, namelen) != 0) return false; - /* - * If we are looking for incomplete entries, show only those, else only - * show complete entries. - */ + + /* Recovery ignores the INCOMPLETE flag. */ + if ((args->op_flags & XFS_DA_OP_RECOVERY) && + args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return true; + + /* All remaining matches need to be filtered by INCOMPLETE state. */ if (args->attr_filter != (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) return false; @@ -798,6 +820,14 @@ xfs_attr_sf_removename( sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); + + /* + * If we are recovering an operation, finding nothing to + * remove is not an error - it just means there was nothing + * to clean up. 
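/*
 * Illustrative sketch (not part of the patch): how the initial state for a
 * set, remove or replace operation is chosen, mirroring the
 * xfs_attr_init_*_state() helpers above in self-contained form. With logged
 * attribute replay (larp) enabled, a replace starts on the remove side so the
 * old attr is torn down first; otherwise it starts on the add side. The
 * fork-format predicates are stubbed into a small struct.
 */
#include <stdbool.h>

enum attr_state {
	STATE_DONE = 0,
	STATE_SF_ADD, STATE_SF_REMOVE,
	STATE_LEAF_ADD, STATE_LEAF_REMOVE,
	STATE_NODE_ADD, STATE_NODE_REMOVE,
};

struct fork_info {
	bool	exists;		/* attr fork present at all */
	bool	shortform;	/* inline (shortform) attr format */
	bool	leaf;		/* single leaf block */
};

static enum attr_state init_add_state(const struct fork_info *f)
{
	if (!f->exists)
		return STATE_DONE;	/* pure remove already finished */
	if (f->shortform)
		return STATE_SF_ADD;
	return f->leaf ? STATE_LEAF_ADD : STATE_NODE_ADD;
}

static enum attr_state init_remove_state(const struct fork_info *f)
{
	if (f->shortform)
		return STATE_SF_REMOVE;
	return f->leaf ? STATE_LEAF_REMOVE : STATE_NODE_REMOVE;
}

static enum attr_state init_replace_state(const struct fork_info *f, bool larp)
{
	return larp ? init_remove_state(f) : init_add_state(f);
}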
+ */ + if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) + return 0; if (error != -EEXIST) return error; size = xfs_attr_sf_entsize(sfe); @@ -818,7 +848,7 @@ xfs_attr_sf_removename( totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && - !(args->op_flags & XFS_DA_OP_ADDNAME)) { + !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); @@ -1127,9 +1157,17 @@ xfs_attr3_leaf_to_shortform( goto out; if (forkoff == -1) { - ASSERT(xfs_has_attr2(dp->i_mount)); - ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_remove(dp, args->trans); + /* + * Don't remove the attr fork if this operation is the first + * part of a attr replace operations. We're going to add a new + * attr immediately, so we need to keep the attr fork around in + * this case. + */ + if (!(args->op_flags & XFS_DA_OP_REPLACE)) { + ASSERT(xfs_has_attr2(dp->i_mount)); + ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_remove(dp, args->trans); + } goto out; } @@ -1189,6 +1227,11 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + error = -EIO; + goto out; + } + error = xfs_da_grow_inode(args, &blkno); if (error) goto out; @@ -1486,8 +1529,9 @@ xfs_attr3_leaf_add_work( entry->flags = args->attr_filter; if (tmp) entry->flags |= XFS_ATTR_LOCAL; - if (args->op_flags & XFS_DA_OP_RENAME) { - entry->flags |= XFS_ATTR_INCOMPLETE; + if (args->op_flags & XFS_DA_OP_REPLACE) { + if (!xfs_has_larp(mp)) + entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { args->index2++; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 83b95be9ded8..7298c148f848 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -568,14 +568,14 @@ xfs_attr_rmtval_stale( */ int xfs_attr_rmtval_find_space( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_da_args *args = attr->xattri_da_args; + struct xfs_bmbt_irec *map = &attr->xattri_map; int error; - dac->lblkno = 0; - dac->blkcnt = 0; + attr->xattri_lblkno = 0; + attr->xattri_blkcnt = 0; args->rmtblkcnt = 0; args->rmtblkno = 0; memset(map, 0, sizeof(struct xfs_bmbt_irec)); @@ -584,8 +584,8 @@ xfs_attr_rmtval_find_space( if (error) return error; - dac->blkcnt = args->rmtblkcnt; - dac->lblkno = args->rmtblkno; + attr->xattri_blkcnt = args->rmtblkcnt; + attr->xattri_lblkno = args->rmtblkno; return 0; } @@ -598,17 +598,18 @@ xfs_attr_rmtval_find_space( */ int xfs_attr_rmtval_set_blk( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; - struct xfs_bmbt_irec *map = &dac->map; + struct xfs_bmbt_irec *map = &attr->xattri_map; int nmap; int error; nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno, - dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total, + error = xfs_bmapi_write(args->trans, dp, + (xfs_fileoff_t)attr->xattri_lblkno, + attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total, map, &nmap); if (error) return error; @@ -618,8 +619,8 @@ xfs_attr_rmtval_set_blk( (map->br_startblock != HOLESTARTBLOCK)); /* roll attribute extent map 
forwards */ - dac->lblkno += map->br_blockcount; - dac->blkcnt -= map->br_blockcount; + attr->xattri_lblkno += map->br_blockcount; + attr->xattri_blkcnt -= map->br_blockcount; return 0; } @@ -673,9 +674,9 @@ xfs_attr_rmtval_invalidate( */ int xfs_attr_rmtval_remove( - struct xfs_delattr_context *dac) + struct xfs_attr_intent *attr) { - struct xfs_da_args *args = dac->da_args; + struct xfs_da_args *args = attr->xattri_da_args; int error, done; /* @@ -695,8 +696,8 @@ xfs_attr_rmtval_remove( * the parent */ if (!done) { - dac->flags |= XFS_DAC_DEFER_FINISH; - trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp); + trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state, + args->dp); return -EAGAIN; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index d72eff30ca18..d097ec6c4dc3 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); -int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac); -int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr); +int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 74198dd82b03..6833110d1bd4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels( xfs_mount_t *mp, /* file system mount structure */ int whichfork) /* data or attr fork */ { + uint64_t maxblocks; /* max blocks at this level */ + xfs_extnum_t maxleafents; /* max leaf entries possible */ int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ int maxrootrecs; /* max records in root block */ int minleafrecs; /* min records in leaf block */ int minnoderecs; /* min records in node block */ int sz; /* root block size */ /* - * The maximum number of extents in a file, hence the maximum number of - * leaf entries, is controlled by the size of the on-disk extent count, - * either a signed 32-bit number for the data fork, or a signed 16-bit - * number for the attr fork. + * The maximum number of extents in a fork, hence the maximum number of + * leaf entries, is controlled by the size of the on-disk extent count. * * Note that we can no longer assume that if we are in ATTR1 that the * fork offset of all the inodes will be @@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels( * ATTR2 we have to assume the worst case scenario of a minimum size * available. 
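/*
 * Illustrative sketch (not part of the patch): the chunked remote-value cursor
 * that xattri_lblkno/xattri_blkcnt implement. Each pass maps as much as the
 * allocator returns, advances the cursor by the mapped length, and reports
 * -EAGAIN while work remains so the caller can roll the transaction between
 * chunks. alloc_extent() is a hypothetical stand-in for xfs_bmapi_write().
 */
#include <errno.h>
#include <stdint.h>

struct rmt_cursor {
	uint64_t	lblkno;		/* next logical block to map */
	uint64_t	blkcnt;		/* blocks still to map */
};

/* Pretend the allocator maps at most 16 blocks per call. */
static uint64_t alloc_extent(uint64_t lblkno, uint64_t count)
{
	(void)lblkno;
	return count < 16 ? count : 16;
}

static int rmt_alloc_one_chunk(struct rmt_cursor *cur)
{
	uint64_t mapped;

	if (!cur->blkcnt)
		return 0;			/* all space allocated */

	mapped = alloc_extent(cur->lblkno, cur->blkcnt);
	cur->lblkno += mapped;
	cur->blkcnt -= mapped;

	return cur->blkcnt ? -EAGAIN : 0;	/* roll and come back */
}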
*/ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; + maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp), + whichfork); + if (whichfork == XFS_DATA_FORK) sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); - } else { - maxleafents = MAXAEXTNUM; + else sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); - } + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + maxblocks = howmany_64(maxleafents, minleafrecs); for (level = 1; maxblocks > 1; level++) { if (maxblocks <= maxrootrecs) maxblocks = 1; else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + maxblocks = howmany_64(maxblocks, minnoderecs); } mp->m_bm_maxlevels[whichfork] = level; ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk()); @@ -468,7 +466,7 @@ error0: if (bp_release) xfs_trans_brelse(NULL, bp); error_norelse: - xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + xfs_warn(mp, "%s: BAD after btree leaves for %llu extents", __func__, i); xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -485,7 +483,7 @@ STATIC void xfs_bmap_validate_ret( xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_bmbt_irec_t *mval, int nmap, int ret_nmap) @@ -1399,7 +1397,7 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t da_new; /* new count del alloc blocks used */ xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ @@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -1950,7 +1948,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec old; @@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; /* @@ -2018,13 +2016,13 @@ 
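/*
 * Illustrative sketch (not part of the patch): the level calculation performed
 * by xfs_bmap_compute_maxlevels() above, written as a stand-alone function.
 * Start from the worst-case number of leaf blocks (round-up division of the
 * maximum extent count by the minimum leaf records) and keep dividing by the
 * minimum node fanout until everything fits under the root. HOWMANY_64 here
 * is a plain macro standing in for the kernel's howmany_64().
 */
#include <stdint.h>

#define HOWMANY_64(x, y)	(((x) + ((y) - 1)) / (y))

static int compute_max_levels(uint64_t maxleafents, unsigned int minleafrecs,
			      unsigned int minnoderecs, unsigned int maxrootrecs)
{
	uint64_t	maxblocks = HOWMANY_64(maxleafents, minleafrecs);
	int		level;

	for (level = 1; maxblocks > 1; level++) {
		if (maxblocks <= maxrootrecs)
			maxblocks = 1;
		else
			maxblocks = HOWMANY_64(maxblocks, minnoderecs);
	}
	return level;
}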
xfs_bmap_add_extent_unwritten_real( new_endoff == RIGHT.br_startoff && new->br_startblock + new->br_blockcount == RIGHT.br_startblock && new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN && ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING)) != (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING) || LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)) + <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; /* @@ -2479,7 +2477,7 @@ xfs_bmap_add_extent_hole_delay( xfs_filblks_t newlen=0; /* new indirect size */ xfs_filblks_t oldlen=0; /* old indirect size */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t temp; /* temp for indirect calculations */ ifp = XFS_IFORK_PTR(ip, whichfork); @@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay( */ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN))) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) state |= BMAP_RIGHT_CONTIG; /* @@ -2616,7 +2614,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp, - int flags) + uint32_t flags) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; @@ -2626,7 +2624,7 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); @@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real( left.br_startoff + left.br_blockcount == new->br_startoff && left.br_startblock + left.br_blockcount == new->br_startblock && left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) state |= BMAP_LEFT_CONTIG; if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && new->br_startoff + new->br_blockcount == right.br_startoff && new->br_startblock + new->br_blockcount == right.br_startblock && new->br_state == right.br_state && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && (!(state & BMAP_LEFT_CONTIG) || left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN)) + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) state |= BMAP_RIGHT_CONTIG; error = 0; @@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align( /* * For large extent hint sizes, the aligned extent might be larger than - * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls - * the length back under MAXEXTLEN. 
The outer allocation loops handle - * short allocation just fine, so it is safe to do this. We only want to - * do it when we are forced to, though, because it means more allocation - * operations are required. + * XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so + * that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer + * allocation loops handle short allocation just fine, so it is safe to + * do this. We only want to do it when we are forced to, though, because + * it means more allocation operations are required. */ - while (align_alen > MAXEXTLEN) + while (align_alen > XFS_MAX_BMBT_EXTLEN) align_alen -= extsz; - ASSERT(align_alen <= MAXEXTLEN); + ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN); /* * If the previous block overlaps with this proposed allocation @@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align( return -EINVAL; } else { ASSERT(orig_off >= align_off); - /* see MAXEXTLEN handling above */ + /* see XFS_BMBT_MAX_EXTLEN handling above */ ASSERT(orig_end <= align_off + align_alen || - align_alen + extsz > MAXEXTLEN); + align_alen + extsz > XFS_MAX_BMBT_EXTLEN); } #ifdef DEBUG @@ -3766,7 +3764,7 @@ xfs_bmapi_trim_map( xfs_fileoff_t obno, xfs_fileoff_t end, int n, - int flags) + uint32_t flags) { if ((flags & XFS_BMAPI_ENTIRE) || got->br_startoff + got->br_blockcount <= obno) { @@ -3811,7 +3809,7 @@ xfs_bmapi_update_map( xfs_fileoff_t obno, xfs_fileoff_t end, int *n, - int flags) + uint32_t flags) { xfs_bmbt_irec_t *mval = *map; @@ -3864,7 +3862,7 @@ xfs_bmapi_read( xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); @@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc( * Cap the alloc length. Keep track of prealloc so we know whether to * tag the inode before we return. */ - alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); if (prealloc && alen >= len) @@ -4104,7 +4102,7 @@ xfs_bmapi_allocate( if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev)) bma->prev.br_startoff = NULLFILEOFF; } else { - bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN); if (!bma->eof) bma->length = XFS_FILBLKS_MIN(bma->length, bma->got.br_startoff - bma->offset); @@ -4184,7 +4182,7 @@ xfs_bmapi_convert_unwritten( struct xfs_bmalloca *bma, struct xfs_bmbt_irec *mval, xfs_filblks_t len, - int flags) + uint32_t flags) { int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); @@ -4312,7 +4310,7 @@ xfs_bmapi_write( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting file offs. mapped */ xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ + uint32_t flags, /* XFS_BMAPI_... */ xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap) /* i/o: mval size/count */ @@ -4424,8 +4422,8 @@ xfs_bmapi_write( * xfs_extlen_t and therefore 32 bits. Hence we have to * check for 32-bit overflows and handle them here. 
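/*
 * Illustrative sketch (not part of the patch): trimming an extsize-aligned
 * allocation back under the maximum extent length while keeping it a whole
 * number of hint units, as xfs_bmap_extsize_align() does above. This assumes
 * align_alen is already a multiple of a non-zero extsz; the limit parameter
 * stands in for XFS_MAX_BMBT_EXTLEN.
 */
#include <stdint.h>
#include <assert.h>

static uint64_t trim_aligned_len(uint64_t align_alen, uint64_t extsz,
				 uint64_t max_extlen)
{
	assert(extsz > 0);

	/* Reducing by whole extsz units keeps the alignment intact. */
	while (align_alen > max_extlen)
		align_alen -= extsz;

	assert(align_alen <= max_extlen);
	return align_alen;
}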
*/ - if (len > (xfs_filblks_t)MAXEXTLEN) - bma.length = MAXEXTLEN; + if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN) + bma.length = XFS_MAX_BMBT_EXTLEN; else bma.length = len; @@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); error = xfs_iext_count_may_overflow(ip, whichfork, XFS_IEXT_ADD_NOSPLIT_CNT); + if (error == -EFBIG) + error = xfs_iext_count_upgrade(tp, ip, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, 0); - if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || bma.got.br_startoff > offset_fsb) { /* @@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc( bma.ip = ip; bma.wasdel = true; bma.offset = bma.got.br_startoff; - bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, + XFS_MAX_BMBT_EXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); /* @@ -4629,7 +4630,7 @@ xfs_bmapi_remap( xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags) + uint32_t flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -4641,7 +4642,7 @@ xfs_bmapi_remap( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(len > 0); - ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); + ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); @@ -4801,7 +4802,7 @@ xfs_bmap_del_extent_delay( int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; xfs_filblks_t got_indlen, new_indlen, stolen; - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); int error = 0; bool isrt; @@ -4926,7 +4927,7 @@ xfs_bmap_del_extent_cow( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec new; xfs_fileoff_t del_endoff, got_endoff; - int state = BMAP_COWFORK; + uint32_t state = BMAP_COWFORK; XFS_STATS_INC(mp, xs_del_exlist); @@ -4999,7 +5000,7 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ int whichfork, /* data or attr fork */ - int bflags) /* bmapi flags */ + uint32_t bflags) /* bmapi flags */ { xfs_fsblock_t del_endblock=0; /* first block past del */ xfs_fileoff_t del_endoff; /* first offset past del */ @@ -5015,7 +5016,7 @@ xfs_bmap_del_extent_real( xfs_bmbt_irec_t new; /* new record to be inserted */ /* REFERENCED */ uint qfield; /* quota field to update */ - int state = xfs_bmap_fork_to_state(whichfork); + uint32_t state = xfs_bmap_fork_to_state(whichfork); struct xfs_bmbt_irec old; mp = ip->i_mount; @@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real( * Deleting the middle of the extent. */ - /* - * For directories, -ENOSPC is returned since a directory entry - * remove operation must not fail due to low extent count - * availability. -ENOSPC will be handled by higher layers of XFS - * by letting the corresponding empty Data/Free blocks to linger - * until a future remove operation. Dabtree blocks would be - * swapped with the last block in the leaf space and then the - * new last block will be unmapped. - * - * The above logic also applies to the source directory entry of - * a rename operation. 
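/*
 * Illustrative sketch (not part of the patch): the check-then-upgrade pattern
 * used in xfs_bmapi_convert_delalloc() above - if the extent count check
 * fails with -EFBIG, try to upgrade the inode to large extent counters and
 * only fail if the upgrade itself is not possible. Both helpers here are
 * hypothetical stand-ins with a toy threshold.
 */
#include <errno.h>
#include <stdbool.h>

static int count_may_overflow(int nr_to_add, bool large)
{
	return (!large && nr_to_add > 10) ? -EFBIG : 0;	/* toy limit */
}

static int upgrade_counters(bool *large, bool upgrade_allowed)
{
	if (!upgrade_allowed)
		return -EFBIG;
	*large = true;
	return 0;
}

static int check_or_upgrade(int nr_to_add, bool *large, bool upgrade_allowed)
{
	int error = count_may_overflow(nr_to_add, *large);

	if (error == -EFBIG)
		error = upgrade_counters(large, upgrade_allowed);
	return error;
}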
- */ - error = xfs_iext_count_may_overflow(ip, whichfork, 1); - if (error) { - ASSERT(S_ISDIR(VFS_I(ip)->i_mode) && - whichfork == XFS_DATA_FORK); - error = -ENOSPC; - goto done; - } - old = got; got.br_blockcount = del->br_startoff - got.br_startoff; @@ -5281,7 +5262,7 @@ __xfs_bunmapi( struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t start, /* first file offset deleted */ xfs_filblks_t *rlen, /* i/o: amount remaining */ - int flags, /* misc flags */ + uint32_t flags, /* misc flags */ xfs_extnum_t nexts) /* number of extents max */ { struct xfs_btree_cur *cur; /* bmap btree cursor */ @@ -5299,7 +5280,6 @@ __xfs_bunmapi( int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ - xfs_fileoff_t max_len; xfs_fileoff_t end; struct xfs_iext_cursor icur; bool done = false; @@ -5318,16 +5298,6 @@ __xfs_bunmapi( ASSERT(len > 0); ASSERT(nexts >= 0); - /* - * Guesstimate how many blocks we can unmap without running the risk of - * blowing out the transaction with a mix of EFIs and reflink - * adjustments. - */ - if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) - max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); - else - max_len = len; - error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; @@ -5366,7 +5336,7 @@ __xfs_bunmapi( extno = 0; while (end != (xfs_fileoff_t)-1 && end >= start && - (nexts == 0 || extno < nexts) && max_len > 0) { + (nexts == 0 || extno < nexts)) { /* * Is the found extent after a hole in which end lives? * Just back up to the previous extent, if so. @@ -5400,14 +5370,6 @@ __xfs_bunmapi( if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; - /* How much can we safely unmap? */ - if (max_len < del.br_blockcount) { - del.br_startoff += del.br_blockcount - max_len; - if (!wasdel) - del.br_startblock += del.br_blockcount - max_len; - del.br_blockcount = max_len; - } - if (!isrt) goto delete; @@ -5543,7 +5505,6 @@ delete: if (error) goto error0; - max_len -= del.br_blockcount; end = del.br_startoff - 1; nodelete: /* @@ -5609,7 +5570,7 @@ xfs_bunmapi( struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, - int flags, + uint32_t flags, xfs_extnum_t nexts, int *done) { @@ -5641,7 +5602,7 @@ xfs_bmse_can_merge( if ((left->br_startoff + left->br_blockcount != startoff) || (left->br_startblock + left->br_blockcount != got->br_startblock) || (left->br_state != got->br_state) || - (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN)) return false; return true; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 03d9aaf87413..16db95b11589 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -39,7 +39,7 @@ struct xfs_bmalloca { bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ int datatype;/* data type being allocated */ - int flags; + uint32_t flags; }; #define XFS_BMAP_MAX_NMAP 4 @@ -47,17 +47,17 @@ struct xfs_bmalloca { /* * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ -#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ -#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ -#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ +#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */ +#define 
XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */ +#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ -#define XFS_BMAPI_CONVERT 0x040 +#define XFS_BMAPI_CONVERT (1u << 5) /* * allocate zeroed extents - this requires all newly allocated user data extents @@ -65,7 +65,7 @@ struct xfs_bmalloca { * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found * during the allocation range to zeroed written extents. */ -#define XFS_BMAPI_ZERO 0x080 +#define XFS_BMAPI_ZERO (1u << 6) /* * Map the inode offset to the block given in ap->firstblock. Primarily @@ -75,16 +75,16 @@ struct xfs_bmalloca { * For bunmapi, this flag unmaps the range without adjusting quota, reducing * refcount, or freeing the blocks. */ -#define XFS_BMAPI_REMAP 0x100 +#define XFS_BMAPI_REMAP (1u << 7) /* Map something in the CoW fork. */ -#define XFS_BMAPI_COWFORK 0x200 +#define XFS_BMAPI_COWFORK (1u << 8) /* Skip online discard of freed extents */ -#define XFS_BMAPI_NODISCARD 0x1000 +#define XFS_BMAPI_NODISCARD (1u << 9) /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ -#define XFS_BMAPI_NORMAP 0x2000 +#define XFS_BMAPI_NORMAP (1u << 10) #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w) (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); } -static inline int xfs_bmapi_whichfork(int bmapi_flags) +static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { if (bmapi_flags & XFS_BMAPI_COWFORK) return XFS_COW_FORK; @@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) /* * Flags for xfs_bmap_add_extent*. 
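/*
 * Illustrative sketch (not part of the patch): the flag idiom this series
 * converges on - define each flag as an unsigned shift of 1u and carry the
 * set in a uint32_t, so bit 31 stays usable without relying on signed-shift
 * or sign-extension behaviour when the value is widened or compared. The
 * flag names below are made up for the example.
 */
#include <stdint.h>
#include <stdbool.h>

#define FLAG_ENTIRE	(1u << 0)
#define FLAG_METADATA	(1u << 1)
#define FLAG_ATTRFORK	(1u << 2)
#define FLAG_TOPBIT	(1u << 31)	/* well defined for unsigned 1u */

static bool flags_valid(uint32_t flags, uint32_t allowed)
{
	return (flags & ~allowed) == 0;
}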
*/ -#define BMAP_LEFT_CONTIG (1 << 0) -#define BMAP_RIGHT_CONTIG (1 << 1) -#define BMAP_LEFT_FILLING (1 << 2) -#define BMAP_RIGHT_FILLING (1 << 3) -#define BMAP_LEFT_DELAY (1 << 4) -#define BMAP_RIGHT_DELAY (1 << 5) -#define BMAP_LEFT_VALID (1 << 6) -#define BMAP_RIGHT_VALID (1 << 7) -#define BMAP_ATTRFORK (1 << 8) -#define BMAP_COWFORK (1 << 9) +#define BMAP_LEFT_CONTIG (1u << 0) +#define BMAP_RIGHT_CONTIG (1u << 1) +#define BMAP_LEFT_FILLING (1u << 2) +#define BMAP_RIGHT_FILLING (1u << 3) +#define BMAP_LEFT_DELAY (1u << 4) +#define BMAP_RIGHT_DELAY (1u << 5) +#define BMAP_LEFT_VALID (1u << 6) +#define BMAP_RIGHT_VALID (1u << 7) +#define BMAP_ATTRFORK (1u << 8) +#define BMAP_COWFORK (1u << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ @@ -183,15 +183,15 @@ int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, - int *nmap, int flags); + int *nmap, uint32_t flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, @@ -243,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); -static inline int xfs_bmap_fork_to_state(int whichfork) +static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { switch (whichfork) { case XFS_ATTR_FORK: @@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - int flags); + uint32_t flags); extern struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 453309fc85f2..2b77d45c215f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -597,7 +597,11 @@ xfs_bmbt_maxrecs( return xfs_bmbt_block_maxrecs(blocklen, leaf); } -/* Compute the max possible height for block mapping btrees. */ +/* + * Calculate the maximum possible height of the btree that the on-disk format + * supports. This is used for sizing structures large enough to support every + * possible configuration of a filesystem that might get mounted. + */ unsigned int xfs_bmbt_maxlevels_ondisk(void) { @@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void) minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2; /* One extra level for the inode root. 
*/ - return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1; + return xfs_btree_compute_maxlevels(minrecs, + XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1; } /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c1500b238520..2eecc49fc1b2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -52,6 +52,71 @@ xfs_btree_magic( } /* + * These sibling pointer checks are optimised for null sibling pointers. This + * happens a lot, and we don't need to byte swap at runtime if the sibling + * pointer is NULL. + * + * These are explicitly marked at inline because the cost of calling them as + * functions instead of inlining them is about 36 bytes extra code per call site + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these + * two sibling check functions reduces the compiled code size by over 300 + * bytes. + */ +static inline xfs_failaddr_t +xfs_btree_check_lblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_fsblock_t fsb, + __be64 dsibling) +{ + xfs_fsblock_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) + return NULL; + + sibling = be64_to_cpu(dsibling); + if (sibling == fsb) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_lptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; + } + + return NULL; +} + +static inline xfs_failaddr_t +xfs_btree_check_sblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + __be32 dsibling) +{ + xfs_agblock_t sibling; + + if (dsibling == cpu_to_be32(NULLAGBLOCK)) + return NULL; + + sibling = be32_to_cpu(dsibling); + if (sibling == agbno) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_sptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_agbno(mp, agno, sibling)) + return __this_address; + } + return NULL; +} + +/* * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. */ @@ -65,6 +130,8 @@ __xfs_btree_check_lblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_fsblock_t fsb = NULLFSBLOCK; if (crc) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -83,16 +150,16 @@ __xfs_btree_check_lblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /* Check a long btree block header. 
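/*
 * Illustrative sketch (not part of the patch): the shape of the shared
 * sibling checks added above. A null sibling ends the chain (the kernel
 * compares against an already-byteswapped constant to avoid a runtime swab,
 * which this simplified version omits), a sibling pointing back at the block
 * itself is corruption, and anything else must be a valid block address.
 * The types and blkno_is_valid() helper are stand-ins.
 */
#include <stdint.h>
#include <stdbool.h>

#define NULL_BLOCK	((uint64_t)-1)

static bool blkno_is_valid(uint64_t blkno) { return blkno != 0; }	/* stub */

/* Returns NULL when the sibling is fine, else a description of the fault. */
static const char *check_sibling(uint64_t self_blkno, uint64_t sibling)
{
	if (sibling == NULL_BLOCK)
		return NULL;			/* end of the chain */
	if (sibling == self_blkno)
		return "sibling points back at this block";
	if (!blkno_is_valid(sibling))
		return "sibling is not a valid block address";
	return NULL;
}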
*/ @@ -130,6 +197,9 @@ __xfs_btree_check_sblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); + xfs_failaddr_t fa; + xfs_agblock_t agbno = NULLAGBLOCK; + xfs_agnumber_t agno = NULLAGNUMBER; if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -146,16 +216,18 @@ __xfs_btree_check_sblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) { + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); + } + + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, + agbno, block->bb_u.s.bb_rightsib); + return fa; } /* Check a short btree block header. */ @@ -373,8 +445,14 @@ xfs_btree_del_cursor( break; } + /* + * If we are doing a BMBT update, the number of unaccounted blocks + * allocated during this cursor life time should be zero. If it's not + * zero, then we should be shut down or on our way to shutdown due to + * cancelling a dirty transaction on error. + */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || - xfs_is_shutdown(cur->bc_mp)); + xfs_is_shutdown(cur->bc_mp) || error != 0); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) @@ -751,20 +829,20 @@ xfs_btree_lastrec( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ - int64_t imask; /* mask for current bit number */ + uint32_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* * Find the lowest bit, so the first byte offset. */ - for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + for (i = 0, imask = 1u; ; i++, imask <<= 1) { if (imask & fields) { *first = offsets[i]; break; @@ -773,7 +851,7 @@ xfs_btree_offsets( /* * Find the highest bit, so the last byte offset. */ - for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { if (imask & fields) { *last = offsets[i + 1] - 1; break; @@ -1456,7 +1534,7 @@ void xfs_btree_log_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ + uint32_t fields) /* mask of fields: XFS_BB_... 
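/*
 * Illustrative sketch (not part of the patch): computing the dirty byte range
 * of a structure from a bitmask of logged fields, as xfs_btree_offsets() does
 * above. The offsets table carries one extra terminating entry so that the
 * end of field i is offsets[i + 1] - 1; like the kernel version, this assumes
 * at least one bit in the mask is set.
 */
#include <stdint.h>

static void fields_to_byte_range(uint32_t fields, const short *offsets,
				 int nbits, int *first, int *last)
{
	uint32_t	imask;
	int		i;

	/* lowest set bit gives the first logged byte */
	for (i = 0, imask = 1u; i < nbits; i++, imask <<= 1) {
		if (fields & imask) {
			*first = offsets[i];
			break;
		}
	}
	/* highest set bit gives the last logged byte */
	for (i = nbits - 1, imask = 1u << (nbits - 1); i >= 0; i--, imask >>= 1) {
		if (fields & imask) {
			*last = offsets[i + 1] - 1;
			break;
		}
	}
}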
*/ { int first; /* first byte offset logged */ int last; /* last byte offset logged */ @@ -3194,7 +3272,7 @@ xfs_btree_insrec( struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ - struct xfs_btree_cur *ncur; /* new btree cursor */ + struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ @@ -3274,7 +3352,7 @@ xfs_btree_insrec( #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) - return error; + goto error0; #endif /* @@ -3294,7 +3372,7 @@ xfs_btree_insrec( for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) - return error; + goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); @@ -3379,6 +3457,8 @@ xfs_btree_insrec( return 0; error0: + if (ncur) + xfs_btree_del_cursor(ncur, error); return error; } @@ -4271,6 +4351,21 @@ xfs_btree_visit_block( if (xfs_btree_ptr_is_null(cur, &rptr)) return -ENOENT; + /* + * We only visit blocks once in this walk, so we have to avoid the + * internal xfs_btree_lookup_get_block() optimisation where it will + * return the same block without checking if the right sibling points + * back to us and creates a cyclic reference in the btree. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } else { + if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4445,20 +4540,21 @@ xfs_btree_lblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_fsblock_t fsb; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) - return __this_address; - - return NULL; + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /** @@ -4499,7 +4595,9 @@ xfs_btree_sblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_agblock_t agno; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) @@ -4507,14 +4605,13 @@ xfs_btree_sblock_verify( /* sibling pointer verification */ agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) - return __this_address; - - return NULL; + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = 
xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + block->bb_u.s.bb_rightsib); + return fa; } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 22d9f411fde6..eef27858a013 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); /* * For logging record fields. */ -#define XFS_BB_MAGIC (1 << 0) -#define XFS_BB_LEVEL (1 << 1) -#define XFS_BB_NUMRECS (1 << 2) -#define XFS_BB_LEFTSIB (1 << 3) -#define XFS_BB_RIGHTSIB (1 << 4) -#define XFS_BB_BLKNO (1 << 5) -#define XFS_BB_LSN (1 << 6) -#define XFS_BB_UUID (1 << 7) -#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_MAGIC (1u << 0) +#define XFS_BB_LEVEL (1u << 1) +#define XFS_BB_NUMRECS (1u << 2) +#define XFS_BB_LEFTSIB (1u << 3) +#define XFS_BB_RIGHTSIB (1u << 4) +#define XFS_BB_BLKNO (1u << 5) +#define XFS_BB_LSN (1u << 6) +#define XFS_BB_UUID (1u << 7) +#define XFS_BB_OWNER (1u << 8) #define XFS_BB_NUM_BITS 5 -#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1) #define XFS_BB_NUM_BITS_CRC 9 -#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) +#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface @@ -345,7 +345,7 @@ xfs_btree_dup_cursor( */ void xfs_btree_offsets( - int64_t fields, /* bitmask of fields */ + uint32_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ @@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. */ -void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t); void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 9dc1ecb9713d..e7201dc68f43 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -22,6 +22,7 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_errortag.h" /* * xfs_da_btree.c @@ -116,6 +117,17 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_cache_free(xfs_da_state_cache, state); } +void +xfs_da_state_reset( + struct xfs_da_state *state, + struct xfs_da_args *args) +{ + xfs_da_state_kill_altpath(state); + memset(state, 0, sizeof(struct xfs_da_state)); + state->args = args; + state->mp = state->args->dp->i_mount; +} + static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) { if (whichfork == XFS_DATA_FORK) @@ -482,6 +494,9 @@ xfs_da3_split( trace_xfs_da_split(state->args); + if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + return -EIO; + /* * Walk back up the tree splitting/inserting/adjusting as necessary. * If we need to insert and there isn't room, split the node, then diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0faf7d9ac241..d33b7686a0b3 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -30,6 +30,7 @@ struct xfs_da_geometry { unsigned int free_hdr_size; /* dir2 free header size */ unsigned int free_max_bests; /* # of bests entries in dir2 free */ xfs_dablk_t freeblk; /* blockno of free data v2 */ + xfs_extnum_t max_extents; /* Max. 
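/*
 * Illustrative sketch (not part of the patch): the error-injection hook
 * pattern behind XFS_TEST_ERROR()/XFS_ERRTAG_*. Each tag has a "1 in N"
 * frequency, and a hook placed on an otherwise rare path (a dabtree leaf
 * split, an attr leaf-to-node conversion) lets tests force that path to fail.
 * This is a hypothetical userspace approximation, not the kernel mechanism.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

enum errtag { ERRTAG_LEAF_SPLIT, ERRTAG_LEAF_TO_NODE, ERRTAG_MAX };

static unsigned int errtag_freq[ERRTAG_MAX];	/* 0 = disabled, 1 = always */

static bool test_error(enum errtag tag)
{
	unsigned int freq = errtag_freq[tag];

	return freq && (rand() % freq == 0);
}

static int split_leaf(void)
{
	if (test_error(ERRTAG_LEAF_SPLIT))
		return -EIO;		/* injected failure */
	/* ... real split work would go here ... */
	return 0;
}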
extents in corresponding fork */ xfs_dir2_data_aoff_t data_first_offset; size_t data_entry_offset; @@ -76,27 +77,31 @@ typedef struct xfs_da_args { xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ int rmtvaluelen2; /* remote attr value length in bytes */ - int op_flags; /* operation flags */ + uint32_t op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; /* * Operation flags: */ -#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ -#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ -#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ -#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ -#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */ +#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */ +#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */ +#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */ +#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */ +#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */ +#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */ +#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ - { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_REPLACE, "REPLACE" }, \ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_NOTIME, "NOTIME" } + { XFS_DA_OP_NOTIME, "NOTIME" }, \ + { XFS_DA_OP_REMOVE, "REMOVE" }, \ + { XFS_DA_OP_RECOVERY, "RECOVERY" } /* * Storage for holding state during Btree searches and split/join ops. @@ -197,7 +202,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp, * Utility routines. */ -#define XFS_DABUF_MAP_HOLE_OK (1 << 0) +#define XFS_DABUF_MAP_HOLE_OK (1u << 0) int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, @@ -220,6 +225,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args); void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args); void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5a49caa5c9df..25e2841084e1 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) * Directory address space divided into sections, * spaces separated by 32GB. 
*/ +#define XFS_DIR2_MAX_SPACES 3 #define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) @@ -688,10 +689,10 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ #define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ #define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ -#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) -#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) -#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) -#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) +#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) /* diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 0805ade2d300..5a321b783398 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -22,6 +22,10 @@ #include "xfs_refcount.h" #include "xfs_bmap.h" #include "xfs_alloc.h" +#include "xfs_buf.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -184,36 +188,61 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, + [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; -static void +/* + * Ensure there's a log intent item associated with this deferred work item if + * the operation must be restarted on crash. Returns 1 if there's a log item; + * 0 if there isn't; or a negative errno. + */ +static int xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, bool sort) { const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_log_item *lip; - if (!dfp->dfp_intent) - dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, - dfp->dfp_count, sort); + if (dfp->dfp_intent) + return 1; + + lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + if (!lip) + return 0; + if (IS_ERR(lip)) + return PTR_ERR(lip); + + dfp->dfp_intent = lip; + return 1; } /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of * the pending list. + * + * Returns 1 if at least one log item was associated with the deferred work; + * 0 if there are no log items; or a negative errno. */ -STATIC void +static int xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; + int ret = 0; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { + int ret2; + trace_xfs_defer_create_intent(tp->t_mountp, dfp); - xfs_defer_create_intent(tp, dfp, true); + ret2 = xfs_defer_create_intent(tp, dfp, true); + if (ret2 < 0) + return ret2; + ret |= ret2; } + return ret; } /* Abort all the intents that were committed. 
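/*
 * Illustrative sketch (not part of the patch): the tri-state convention the
 * reworked xfs_defer_create_intent() uses - a negative errno on failure, 0
 * when a work item needs no log intent, 1 when an intent is attached - and
 * how a caller folds those results together so it knows whether anything was
 * logged at all. The inputs are faked so the example stays self-contained.
 */
static int create_one_intent(int has_intent, int fail)
{
	if (fail)
		return fail;		/* negative errno */
	return has_intent ? 1 : 0;
}

static int create_all_intents(const int *has_intent, const int *fail, int n)
{
	int ret = 0;

	for (int i = 0; i < n; i++) {
		int ret2 = create_one_intent(has_intent[i], fail[i]);

		if (ret2 < 0)
			return ret2;
		ret |= ret2;		/* 1 if any item logged an intent */
	}
	return ret;
}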
*/ @@ -449,6 +478,8 @@ xfs_defer_finish_one( dfp->dfp_count--; error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { + int ret; + /* * Caller wants a fresh transaction; put the work item * back on the list and log a new log intent item to @@ -459,7 +490,9 @@ xfs_defer_finish_one( dfp->dfp_count++; dfp->dfp_done = NULL; dfp->dfp_intent = NULL; - xfs_defer_create_intent(tp, dfp, false); + ret = xfs_defer_create_intent(tp, dfp, false); + if (ret < 0) + error = ret; } if (error) @@ -487,7 +520,7 @@ int xfs_defer_finish_noroll( struct xfs_trans **tp) { - struct xfs_defer_pending *dfp; + struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); @@ -506,17 +539,24 @@ xfs_defer_finish_noroll( * of time that any one intent item can stick around in memory, * pinning the log tail. */ - xfs_defer_create_intents(*tp); - list_splice_init(&(*tp)->t_dfops, &dop_pending); + int has_intents = xfs_defer_create_intents(*tp); - error = xfs_defer_trans_roll(tp); - if (error) - goto out_shutdown; + list_splice_init(&(*tp)->t_dfops, &dop_pending); - /* Possibly relog intent items to keep the log moving. */ - error = xfs_defer_relog(tp, &dop_pending); - if (error) + if (has_intents < 0) { + error = has_intents; goto out_shutdown; + } + if (has_intents || dfp) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + + /* Relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; + } dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); @@ -665,13 +705,15 @@ xfs_defer_ops_capture( if (list_empty(&tp->t_dfops)) return NULL; + error = xfs_defer_create_intents(tp); + if (error < 0) + return ERR_PTR(error); + /* Create an object to capture the defer ops. */ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); - xfs_defer_create_intents(tp); - /* Move the dfops chain and transaction state to the capture struct. */ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; @@ -748,6 +790,10 @@ xfs_defer_ops_capture_and_commit( /* If we don't capture anything, commit transaction and exit. */ dfc = xfs_defer_ops_capture(tp); + if (IS_ERR(dfc)) { + xfs_trans_cancel(tp); + return PTR_ERR(dfc); + } if (!dfc) return xfs_trans_commit(tp); @@ -774,17 +820,25 @@ xfs_defer_ops_continue( struct xfs_trans *tp, struct xfs_defer_resources *dres) { + unsigned int i; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); - /* Lock and join the captured inode to the new transaction. */ + /* Lock the captured resources to the new transaction. */ if (dfc->dfc_held.dr_inos == 2) xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); else if (dfc->dfc_held.dr_inos == 1) xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); + + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_lock(dfc->dfc_held.dr_bp[i]); + + /* Join the captured resources to the new transaction. */ xfs_defer_restore_resources(tp, &dfc->dfc_held); memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); + dres->dr_bufs = 0; /* Move captured dfops chain and state to the transaction. 
*/ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); @@ -854,7 +908,9 @@ xfs_defer_init_item_caches(void) error = xfs_extfree_intent_init_cache(); if (error) goto err; - + error = xfs_attr_intent_init_cache(); + if (error) + goto err; return 0; err: xfs_defer_destroy_item_caches(); @@ -865,6 +921,7 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7bb8a31ad65b..114a3a4930a3 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -19,6 +19,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_AGFL_FREE, + XFS_DEFER_OPS_TYPE_ATTR, XFS_DEFER_OPS_TYPE_MAX, }; @@ -63,6 +64,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +extern const struct xfs_defer_op_type xfs_attr_defer_type; + /* * Deferred operation item relogging limits. diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 5f1e4799e8fa..3cd51fa3837b 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -150,6 +150,8 @@ xfs_da_mount( dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >> + mp->m_sb.sb_blocklog; dageo->magicpct = (dageo->blksize * 37) / 100; /* set up attribute geometry - single fsb only */ @@ -161,6 +163,12 @@ xfs_da_mount( dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); + + if (xfs_has_large_extent_counts(mp)) + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + else + dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + dageo->magicpct = (dageo->blksize * 37) / 100; return 0; } diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a23a52e643ad..5362908164b0 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -59,7 +59,10 @@ #define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 #define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 #define XFS_ERRTAG_AG_RESV_FAIL 38 -#define XFS_ERRTAG_MAX 39 +#define XFS_ERRTAG_LARP 39 +#define XFS_ERRTAG_DA_LEAF_SPLIT 40 +#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41 +#define XFS_ERRTAG_MAX 42 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. 
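Editor's note on the xfs_dir2.c hunk above: dageo->max_extents is derived by dividing the fixed directory address space (XFS_DIR2_MAX_SPACES sections of XFS_DIR2_SPACE_SIZE each) by the filesystem block size. A quick back-of-the-envelope check of that arithmetic as a standalone C sketch, not taken from the kernel sources; it assumes XFS_DIR2_DATA_ALIGN_LOG is 3 (so each directory space spans 32GiB) and a 4KiB filesystem block size.

#include <stdio.h>
#include <stdint.h>

/*
 * Constants re-derived for illustration; the kernel takes them from
 * xfs_da_format.h.  XFS_DIR2_DATA_ALIGN_LOG is assumed to be 3 here,
 * making each directory space 1ULL << (32 + 3) bytes = 32GiB.
 */
#define DIR2_DATA_ALIGN_LOG     3
#define DIR2_SPACE_SIZE         (1ULL << (32 + DIR2_DATA_ALIGN_LOG))
#define DIR2_MAX_SPACES         3

int main(void)
{
        unsigned int blocklog = 12;             /* assume 4KiB blocks */
        uint64_t addr_space = DIR2_MAX_SPACES * DIR2_SPACE_SIZE;

        /* Same shape as the new xfs_da_mount() calculation above. */
        uint64_t max_extents = addr_space >> blocklog;

        printf("directory address space: %llu GiB\n",
                        (unsigned long long)(addr_space >> 30));
        printf("max directory data extents at 4KiB blocks: %llu\n",
                        (unsigned long long)max_extents);
        return 0;
}

This prints 96 GiB and 25165824 extents, the same ~96GB bound that the xfs_format.h comment later in this diff uses to argue that a directory inode can never overflow the 2^31 - 1 small data fork extent counter.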
@@ -103,5 +106,8 @@ #define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 #define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 #define XFS_RANDOM_AG_RESV_FAIL 1 +#define XFS_RANDOM_LARP 1 +#define XFS_RANDOM_DA_LEAF_SPLIT 1 +#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index d665c04e69dd..afdfc8108c5f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */ #define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */ #define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */ +#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */ #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE| \ XFS_SB_FEAT_INCOMPAT_SPINODES| \ XFS_SB_FEAT_INCOMPAT_META_UUID| \ XFS_SB_FEAT_INCOMPAT_BIGTIME| \ - XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) + XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \ + XFS_SB_FEAT_INCOMPAT_NREXT64) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -388,7 +390,9 @@ xfs_sb_has_incompat_feature( return (sbp->sb_features_incompat & feature) != 0; } -#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */ +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \ + (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS) #define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL static inline bool xfs_sb_has_incompat_log_feature( @@ -413,6 +417,11 @@ xfs_sb_add_incompat_log_features( sbp->sb_features_log_incompat |= features; } +static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp) +{ + return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); +} static inline bool xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) @@ -525,26 +534,26 @@ typedef struct xfs_agf { #define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) -#define XFS_AGF_MAGICNUM 0x00000001 -#define XFS_AGF_VERSIONNUM 0x00000002 -#define XFS_AGF_SEQNO 0x00000004 -#define XFS_AGF_LENGTH 0x00000008 -#define XFS_AGF_ROOTS 0x00000010 -#define XFS_AGF_LEVELS 0x00000020 -#define XFS_AGF_FLFIRST 0x00000040 -#define XFS_AGF_FLLAST 0x00000080 -#define XFS_AGF_FLCOUNT 0x00000100 -#define XFS_AGF_FREEBLKS 0x00000200 -#define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 -#define XFS_AGF_REFCOUNT_ROOT 0x00008000 -#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 -#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_MAGICNUM (1u << 0) +#define XFS_AGF_VERSIONNUM (1u << 1) +#define XFS_AGF_SEQNO (1u << 2) +#define XFS_AGF_LENGTH (1u << 3) +#define XFS_AGF_ROOTS (1u << 4) +#define XFS_AGF_LEVELS (1u << 5) +#define XFS_AGF_FLFIRST (1u << 6) +#define XFS_AGF_FLLAST (1u << 7) +#define XFS_AGF_FLCOUNT (1u << 8) +#define XFS_AGF_FREEBLKS (1u << 9) +#define XFS_AGF_LONGEST (1u << 10) +#define XFS_AGF_BTREEBLKS (1u << 11) +#define XFS_AGF_UUID (1u << 12) +#define XFS_AGF_RMAP_BLOCKS (1u << 13) +#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14) +#define XFS_AGF_REFCOUNT_ROOT (1u << 15) +#define XFS_AGF_REFCOUNT_LEVEL (1u << 16) +#define XFS_AGF_SPARE64 (1u << 17) #define XFS_AGF_NUM_BITS 18 -#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) +#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ @@ -619,22 
+628,22 @@ typedef struct xfs_agi { #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) -#define XFS_AGI_MAGICNUM (1 << 0) -#define XFS_AGI_VERSIONNUM (1 << 1) -#define XFS_AGI_SEQNO (1 << 2) -#define XFS_AGI_LENGTH (1 << 3) -#define XFS_AGI_COUNT (1 << 4) -#define XFS_AGI_ROOT (1 << 5) -#define XFS_AGI_LEVEL (1 << 6) -#define XFS_AGI_FREECOUNT (1 << 7) -#define XFS_AGI_NEWINO (1 << 8) -#define XFS_AGI_DIRINO (1 << 9) -#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_MAGICNUM (1u << 0) +#define XFS_AGI_VERSIONNUM (1u << 1) +#define XFS_AGI_SEQNO (1u << 2) +#define XFS_AGI_LENGTH (1u << 3) +#define XFS_AGI_COUNT (1u << 4) +#define XFS_AGI_ROOT (1u << 5) +#define XFS_AGI_LEVEL (1u << 6) +#define XFS_AGI_FREECOUNT (1u << 7) +#define XFS_AGI_NEWINO (1u << 8) +#define XFS_AGI_DIRINO (1u << 9) +#define XFS_AGI_UNLINKED (1u << 10) #define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ -#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) -#define XFS_AGI_FREE_ROOT (1 << 11) -#define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */ +#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1u << 11) +#define XFS_AGI_FREE_LEVEL (1u << 12) +#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */ #define XFS_AGI_NUM_BITS_R2 14 /* disk block (xfs_daddr_t) in the AG */ @@ -791,16 +800,41 @@ struct xfs_dinode { __be32 di_nlink; /* number of links to file */ __be16 di_projid_lo; /* lower part of owner's project id */ __be16 di_projid_hi; /* higher part owner's project id */ - __u8 di_pad[6]; /* unused, zeroed space */ - __be16 di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + __be64 di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + __be64 di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + __u8 di_v2_pad[6]; + __be16 di_flushiter; + }; + }; xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ xfs_timestamp_t di_ctime; /* time created/inode modified */ __be64 di_size; /* number of bytes in file */ __be64 di_nblocks; /* # of direct & btree blocks used */ __be32 di_extsize; /* basic/minimum extent size for file */ - __be32 di_nextents; /* number of extents in data fork */ - __be16 di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + __be32 di_nextents; + __be16 di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + __be32 di_big_anextents; + __be16 di_nrext64_pad; + } __packed; + } __packed; __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ __s8 di_aformat; /* format of attr fork's data */ __be32 di_dmevmask; /* DMIG event mask */ @@ -870,6 +904,56 @@ enum xfs_dinode_fmt { { XFS_DINODE_FMT_UUID, "uuid" } /* + * Max values for extnum and aextnum. + * + * The original on-disk extent counts were held in signed fields, resulting in + * maximum extent counts of 2^31 and 2^15 for the data and attr forks + * respectively. Similarly the maximum extent length is limited to 2^21 blocks + * by the 21-bit wide blockcount field of a BMBT extent record. 
+ * + * The newly introduced data fork extent counter can hold a 64-bit value, + * however the maximum number of extents in a file is also limited to 2^54 + * extents by the 54-bit wide startoff field of a BMBT extent record. + * + * It is further limited by the maximum supported file size of 2^63 + * *bytes*. This leads to a maximum extent count for maximally sized filesystem + * blocks (64kB) of: + * + * 2^63 bytes / 2^16 bytes per block = 2^47 blocks + * + * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence + * 2^48 was chosen as the maximum data fork extent count. + * + * The maximum file size that can be represented by the data fork extent counter + * in the worst case occurs when all extents are 1 block in length and each + * block is 1KB in size. + * + * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and + * with 1KB sized blocks, a file can reach up to, + * 1KB * (2^31) = 2TB + * + * This is much larger than the theoretical maximum size of a directory + * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB. + * + * Hence, a directory inode can never overflow its data fork extent counter. + */ +#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1)) +#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1)) +#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1)) + +/* + * When we upgrade an inode to the large extent counts, the maximum value by + * which the extent count can increase is bound by the change in size of the + * on-disk field. No upgrade operation should ever be adding more than a few + * tens of extents, so if we get a really large value it is a sign of a code bug + * or corruption. + */ +#define XFS_MAX_EXTCNT_UPGRADE_NR \ + min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \ + XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL) + +/* * Inode minimum and maximum sizes. */ #define XFS_DINODE_MIN_LOG 8 @@ -918,10 +1002,6 @@ enum xfs_dinode_fmt { ((w) == XFS_DATA_FORK ? \ (dip)->di_format : \ (dip)->di_aformat) -#define XFS_DFORK_NEXTENTS(dip,w) \ ((w) == XFS_DATA_FORK ?
\ - be32_to_cpu((dip)->di_nextents) : \ - be16_to_cpu((dip)->di_anextents)) /* * For block and character special files the 32bit dev_t is stored at the @@ -988,15 +1068,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */ +#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */ #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT) +#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT) #define XFS_DIFLAG2_ANY \ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \ - XFS_DIFLAG2_BIGTIME) + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64) static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) { @@ -1004,6 +1086,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME)); } +static inline bool xfs_dinode_has_large_extent_counts( + const struct xfs_dinode *dip) +{ + return dip->di_version >= 3 && + (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64)); +} + /* * Inode number format: * low inopblog bits - offset in block @@ -1085,10 +1174,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip) #define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ #define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */ -#define XFS_DQTYPE_USER 0x01 /* user dquot record */ -#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */ -#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */ -#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */ +#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */ +#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */ +#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */ +#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */ /* bitmask to determine if this is a user/group/project dquot */ #define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \ @@ -1596,6 +1685,8 @@ typedef struct xfs_bmdr_block { #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) #define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) +#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK)) + /* * bmbt records have a file offset (block) field that is 54 bits wide, so this * is the largest xfs_fileoff_t that we ever expect to see. diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 505533c43a92..1cfd5bc6520a 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ #define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ +#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */ /* * Minimum and maximum sizes need for growth checks. 
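Editor's note on the flag cleanups in the surrounding hunks: the AGF, AGI, DQTYPE and QMOPT definitions are all being converted from signed forms such as (1 << n) or open-coded hex masks to (1u << n). A small standalone demonstration, not taken from the kernel, of why the unsigned literal matters once a flag reaches bit 31 (as XFS_QMOPT_INHERIT does later in this diff): the signed variant sign-extends when widened into a 64-bit field.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /*
         * Shifting a signed int literal into bit 31 is undefined behaviour
         * in C; on common compilers it yields INT_MIN, which sign-extends
         * when stored into a wider field.  The unsigned literal stays a
         * plain bit-31 mask.
         */
        uint64_t widened_signed = (uint64_t)(1 << 31);
        uint64_t widened_unsigned = (uint64_t)(1u << 31);

        printf("(uint64_t)(1  << 31) = 0x%016llx\n",
                        (unsigned long long)widened_signed);
        printf("(uint64_t)(1u << 31) = 0x%016llx\n",
                        (unsigned long long)widened_unsigned);
        return 0;
}

On the usual two's-complement targets the first line prints 0xffffffff80000000 and the second 0x0000000080000000, which is why the bit-flag cleanups in this series consistently use unsigned shifts.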
@@ -377,7 +378,7 @@ struct xfs_bulkstat { uint32_t bs_extsize_blks; /* extent size hint, blocks */ uint32_t bs_nlink; /* number of links */ - uint32_t bs_extents; /* number of extents */ + uint32_t bs_extents; /* 32-bit data fork extent counter */ uint32_t bs_aextents; /* attribute number of extents */ uint16_t bs_version; /* structure version */ uint16_t bs_forkoff; /* inode fork offset in bytes */ @@ -386,8 +387,9 @@ struct xfs_bulkstat { uint16_t bs_checked; /* checked inode metadata */ uint16_t bs_mode; /* type and mode */ uint16_t bs_pad2; /* zeroed */ + uint64_t bs_extents64; /* 64-bit data fork extent counter */ - uint64_t bs_pad[7]; /* zeroed */ + uint64_t bs_pad[6]; /* zeroed */ }; #define XFS_BULKSTAT_VERSION_V1 (1) @@ -459,17 +461,28 @@ struct xfs_bulk_ireq { * Only return results from the specified @agno. If @ino is zero, start * with the first inode of @agno. */ -#define XFS_BULK_IREQ_AGNO (1 << 0) +#define XFS_BULK_IREQ_AGNO (1U << 0) /* * Return bulkstat information for a single inode, where @ino value is a * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_* * values below. Not compatible with XFS_BULK_IREQ_AGNO. */ -#define XFS_BULK_IREQ_SPECIAL (1 << 1) +#define XFS_BULK_IREQ_SPECIAL (1U << 1) -#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ - XFS_BULK_IREQ_SPECIAL) +/* + * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign + * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use + * xfs_bulkstat->bs_extents for returning data fork extent count and set + * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and + * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than + * XFS_MAX_EXTCNT_DATA_FORK_OLD. + */ +#define XFS_BULK_IREQ_NREXT64 (1U << 2) + +#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ + XFS_BULK_IREQ_SPECIAL | \ + XFS_BULK_IREQ_NREXT64) /* Operate on the root directory inode. */ #define XFS_BULK_IREQ_SPECIAL_ROOT (1) @@ -699,34 +712,34 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_NR 25 /* i: Repair this metadata. */ -#define XFS_SCRUB_IFLAG_REPAIR (1 << 0) +#define XFS_SCRUB_IFLAG_REPAIR (1u << 0) /* o: Metadata object needs repair. */ -#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1) +#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1) /* * o: Metadata object could be optimized. It's not corrupt, but * we could improve on it somehow. */ -#define XFS_SCRUB_OFLAG_PREEN (1 << 2) +#define XFS_SCRUB_OFLAG_PREEN (1u << 2) /* o: Cross-referencing failed. */ -#define XFS_SCRUB_OFLAG_XFAIL (1 << 3) +#define XFS_SCRUB_OFLAG_XFAIL (1u << 3) /* o: Metadata object disagrees with cross-referenced metadata. */ -#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4) +#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4) /* o: Scan was not complete. */ -#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5) +#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5) /* o: Metadata object looked funny but isn't corrupt. */ -#define XFS_SCRUB_OFLAG_WARNING (1 << 6) +#define XFS_SCRUB_OFLAG_WARNING (1u << 6) /* * o: IFLAG_REPAIR was set but metadata object did not need fixing or * optimization and has therefore not been altered. 
*/ -#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7) +#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7) #define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b418fe0c0679..bf2f4bc89193 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2414,9 +2414,9 @@ out_drop: */ void xfs_ialloc_log_agi( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *bp, /* allocation group header buffer */ - int fields) /* bitmask of fields to log */ + struct xfs_trans *tp, + struct xfs_buf *bp, + uint32_t fields) { int first; /* first byte number */ int last; /* last byte number */ @@ -2772,6 +2772,8 @@ xfs_ialloc_setup_geometry( igeo->new_diflags2 = 0; if (xfs_has_bigtime(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; + if (xfs_has_large_extent_counts(mp)) + igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64; /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 8b5c2b709022..a7705b6a1fd3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -60,7 +60,7 @@ void xfs_ialloc_log_agi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *bp, /* allocation group header buffer */ - int fields); /* bitmask of fields to log */ + uint32_t fields); /* bitmask of fields to log */ /* * Read in the allocation group header (inode allocation section) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index cae9708c8587..3b1b63f9d886 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -279,6 +279,25 @@ xfs_inode_to_disk_ts( return ts; } +static inline void +xfs_inode_to_disk_iext_counters( + struct xfs_inode *ip, + struct xfs_dinode *to) +{ + if (xfs_inode_has_large_extent_counts(ip)) { + to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp)); + /* + * We might be upgrading the inode to use larger extent counters + * than was previously used. Hence zero the unused field. 
+ */ + to->di_nrext64_pad = cpu_to_be16(0); + } else { + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); + } +} + void xfs_inode_to_disk( struct xfs_inode *ip, @@ -296,7 +315,6 @@ xfs_inode_to_disk( to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); - memset(to->di_pad, 0, sizeof(to->di_pad)); to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); @@ -307,8 +325,6 @@ xfs_inode_to_disk( to->di_size = cpu_to_be64(ip->i_disk_size); to->di_nblocks = cpu_to_be64(ip->i_nblocks); to->di_extsize = cpu_to_be32(ip->i_extsize); - to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); - to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); to->di_forkoff = ip->i_forkoff; to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_flags = cpu_to_be16(ip->i_diflags); @@ -323,11 +339,14 @@ xfs_inode_to_disk( to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); - to->di_flushiter = 0; + to->di_v3_pad = 0; } else { to->di_version = 2; to->di_flushiter = cpu_to_be16(ip->i_flushiter); + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); } + + xfs_inode_to_disk_iext_counters(ip, to); } static xfs_failaddr_t @@ -336,20 +355,40 @@ xfs_dinode_verify_fork( struct xfs_mount *mp, int whichfork) { - uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t di_nextents; + xfs_extnum_t max_extents; + mode_t mode = be16_to_cpu(dip->di_mode); + uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork); + uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork); + + di_nextents = xfs_dfork_nextents(dip, whichfork); + + /* + * For fork types that can contain local data, check that the fork + * format matches the size of local data contained within the fork. + * + * For all types, check that when the size says the fork should be in extent + * or btree format, the inode isn't claiming it is in local format. + */ + if (whichfork == XFS_DATA_FORK) { + if (S_ISDIR(mode) || S_ISLNK(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } - switch (XFS_DFORK_FORMAT(dip, whichfork)) { + if (be64_to_cpu(dip->di_size) > fork_size && + fork_format == XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + switch (fork_format) { case XFS_DINODE_FMT_LOCAL: /* - * no local regular files yet + * No local regular files yet.
*/ - if (whichfork == XFS_DATA_FORK) { - if (S_ISREG(be16_to_cpu(dip->di_mode))) - return __this_address; - if (be64_to_cpu(dip->di_size) > - XFS_DFORK_SIZE(dip, mp, whichfork)) - return __this_address; - } + if (S_ISREG(mode) && whichfork == XFS_DATA_FORK) + return __this_address; if (di_nextents) return __this_address; break; @@ -358,12 +397,11 @@ xfs_dinode_verify_fork( return __this_address; break; case XFS_DINODE_FMT_BTREE: - if (whichfork == XFS_ATTR_FORK) { - if (di_nextents > MAXAEXTNUM) - return __this_address; - } else if (di_nextents > MAXEXTNUM) { + max_extents = xfs_iext_max_nextents( + xfs_dinode_has_large_extent_counts(dip), + whichfork); + if (di_nextents > max_extents) return __this_address; - } break; default: return __this_address; @@ -396,6 +434,24 @@ xfs_dinode_verify_forkoff( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_nrext64( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) { + if (!xfs_has_large_extent_counts(mp)) + return __this_address; + if (dip->di_nrext64_pad != 0) + return __this_address; + } else if (dip->di_version >= 3) { + if (dip->di_v3_pad != 0) + return __this_address; + } + + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -407,6 +463,9 @@ xfs_dinode_verify( uint16_t flags; uint64_t flags2; uint64_t di_size; + xfs_extnum_t nextents; + xfs_extnum_t naextents; + xfs_filblks_t nblocks; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return __this_address; @@ -437,10 +496,19 @@ xfs_dinode_verify( if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) return __this_address; + fa = xfs_dinode_verify_nrext64(mp, dip); + if (fa) + return fa; + + nextents = xfs_dfork_data_extents(dip); + naextents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + /* Fork checks carried over from xfs_iformat_fork */ - if (mode && - be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) > - be64_to_cpu(dip->di_nblocks)) + if (mode && nextents + naextents > nblocks) + return __this_address; + + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) @@ -497,7 +565,7 @@ xfs_dinode_verify( default: return __this_address; } - if (dip->di_anextents) + if (naextents) return __this_address; } @@ -639,7 +707,7 @@ xfs_inode_validate_extsize( if (extsize_bytes % blocksize_bytes) return __this_address; - if (extsize > MAXEXTLEN) + if (extsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) @@ -696,7 +764,7 @@ xfs_inode_validate_cowextsize( if (cowextsize_bytes % mp->m_sb.sb_blocksize) return __this_address; - if (cowextsize > MAXEXTLEN) + if (cowextsize > XFS_MAX_BMBT_EXTLEN) return __this_address; if (cowextsize > mp->m_sb.sb_agblocks / 2) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9149f4f796fc..1a4cdf550f6d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -36,7 +36,7 @@ xfs_init_local_fork( int64_t size) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - int mem_size = size, real_size = 0; + int mem_size = size; bool zero_terminate; /* @@ -50,8 +50,7 @@ xfs_init_local_fork( mem_size++; if (size) { - real_size = roundup(mem_size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS); + ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); memcpy(ifp->if_u1.if_data, data, size); if (zero_terminate) ifp->if_u1.if_data[size] = '\0'; @@ -105,7 +104,7 
@@ xfs_iformat_extents( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); int state = xfs_bmap_fork_to_state(whichfork); - int nex = XFS_DFORK_NEXTENTS(dip, whichfork); + xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); struct xfs_iext_cursor icur; struct xfs_bmbt_rec *dp; @@ -117,8 +116,8 @@ xfs_iformat_extents( * we just bail out rather than crash in kmem_alloc() or memcpy() below. */ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { - xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); + xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).", + ip->i_ino, nex); xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(1)", dip, sizeof(*dip), __this_address); @@ -230,7 +229,7 @@ xfs_iformat_data_fork( * depend on it. */ ip->i_df.if_format = dip->di_format; - ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents); + ip->i_df.if_nextents = xfs_dfork_data_extents(dip); switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -295,14 +294,14 @@ xfs_iformat_attr_fork( struct xfs_inode *ip, struct xfs_dinode *dip) { + xfs_extnum_t naextents = xfs_dfork_attr_extents(dip); int error = 0; /* * Initialize the extent count early, as the per-format routines may * depend on it. */ - ip->i_afp = xfs_ifork_alloc(dip->di_aformat, - be16_to_cpu(dip->di_anextents)); + ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents); switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_LOCAL: @@ -497,12 +496,7 @@ xfs_idata_realloc( return; } - /* - * For inline data, the underlying buffer must be a multiple of 4 bytes - * in size so that it can be logged and stay on word boundaries. - * We enforce that here. - */ - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4), + ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } @@ -744,7 +738,8 @@ xfs_iext_count_may_overflow( if (whichfork == XFS_COW_FORK) return 0; - max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM; + max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip), + whichfork); if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) max_exts = 10; @@ -755,3 +750,27 @@ xfs_iext_count_may_overflow( return 0; } + +/* + * Upgrade this inode's extent counter fields to be able to handle a potential + * increase in the extent count by nr_to_add. Normally this is the same + * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG. 
+ */ +int +xfs_iext_count_upgrade( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint nr_to_add) +{ + ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR); + + if (!xfs_has_large_extent_counts(ip->i_mount) || + xfs_inode_has_large_extent_counts(ip) || + XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + return -EFBIG; + + ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 3d64a3acb0ed..4f68c1f20beb 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -21,9 +21,9 @@ struct xfs_ifork { void *if_root; /* extent tree root */ char *if_data; /* inline file data */ } if_u1; + xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ - xfs_extnum_t if_nextents; /* # of extents in this fork */ }; /* @@ -40,19 +40,6 @@ struct xfs_ifork { #define XFS_IEXT_PUNCH_HOLE_CNT (1) /* - * Directory entry addition can cause the following, - * 1. Data block can be added/removed. - * A new extent can cause extent count to increase by 1. - * 2. Free disk block can be added/removed. - * Same behaviour as described above for Data block. - * 3. Dabtree blocks. - * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new - * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH. - */ -#define XFS_IEXT_DIR_MANIP_CNT(mp) \ - ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount) - -/* * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to * be added. One extra extent for dabtree in case a local attr is * large enough to cause a double split. It can also cause extent @@ -133,6 +120,65 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) return ifp->if_format; } +static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + case XFS_COW_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_DATA_FORK_LARGE; + return XFS_MAX_EXTCNT_DATA_FORK_SMALL; + + case XFS_ATTR_FORK: + if (has_large_extent_counts) + return XFS_MAX_EXTCNT_ATTR_FORK_LARGE; + return XFS_MAX_EXTCNT_ATTR_FORK_SMALL; + + default: + ASSERT(0); + return 0; + } +} + +static inline xfs_extnum_t +xfs_dfork_data_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be64_to_cpu(dip->di_big_nextents); + + return be32_to_cpu(dip->di_nextents); +} + +static inline xfs_extnum_t +xfs_dfork_attr_extents( + struct xfs_dinode *dip) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + return be32_to_cpu(dip->di_big_anextents); + + return be16_to_cpu(dip->di_anextents); +} + +static inline xfs_extnum_t +xfs_dfork_nextents( + struct xfs_dinode *dip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return xfs_dfork_data_extents(dip); + case XFS_ATTR_FORK: + return xfs_dfork_attr_extents(dip); + default: + ASSERT(0); + break; + } + + return 0; +} + struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format, xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); @@ -229,6 +275,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, int nr_to_add); +int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, + uint nr_to_add); 
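Editor's note on the two helpers declared above: xfs_iext_count_upgrade() is the companion to xfs_iext_count_may_overflow(). When the pre-flight check reports -EFBIG, a caller that holds a transaction can try to move the inode to the 64-bit NREXT64 counters and only fail if that upgrade is not possible. A minimal sketch of that calling convention follows; the wrapper function and its name are hypothetical, only the two helper calls and the -EFBIG handling pattern come from the interfaces declared above.

/*
 * Illustrative wrapper only -- the enclosing function is hypothetical,
 * but the error-handling pattern matches the helper declarations above.
 */
static int
example_check_iext_space(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip,
        int                     whichfork,
        int                     nr_to_add)
{
        int                     error;

        error = xfs_iext_count_may_overflow(ip, whichfork, nr_to_add);
        if (error == -EFBIG) {
                /*
                 * The small on-disk extent counter would overflow; try to
                 * upgrade the inode to large (NREXT64) counters.  This
                 * returns -EFBIG again if the feature is disabled or the
                 * inode already uses the large counters, so the original
                 * failure is simply propagated.
                 */
                error = xfs_iext_count_upgrade(tp, ip, nr_to_add);
        }
        return error;
}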
/* returns true if the fork has extents but they are not read in yet. */ static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index b322db523d65..b351b9dc6561 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr) /* Log Clients */ #define XFS_TRANSACTION 0x69 -#define XFS_VOLUME 0x2 #define XFS_LOG 0xaa #define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ @@ -114,7 +113,12 @@ struct xfs_unmount_log_format { #define XLOG_REG_TYPE_CUD_FORMAT 24 #define XLOG_REG_TYPE_BUI_FORMAT 25 #define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_MAX 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_MAX 30 + /* * Flags to log operation header @@ -237,6 +241,8 @@ typedef struct xfs_trans_header { #define XFS_LI_CUD 0x1243 #define XFS_LI_BUI 0x1244 /* bmbt update intent */ #define XFS_LI_BUD 0x1245 +#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/ +#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */ #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -252,7 +258,9 @@ typedef struct xfs_trans_header { { XFS_LI_CUI, "XFS_LI_CUI" }, \ { XFS_LI_CUD, "XFS_LI_CUD" }, \ { XFS_LI_BUI, "XFS_LI_BUI" }, \ - { XFS_LI_BUD, "XFS_LI_BUD" } + { XFS_LI_BUD, "XFS_LI_BUD" }, \ + { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \ + { XFS_LI_ATTRD, "XFS_LI_ATTRD" } /* * Inode Log Item Format definitions. @@ -388,16 +396,41 @@ struct xfs_log_dinode { uint32_t di_nlink; /* number of links to file */ uint16_t di_projid_lo; /* lower part of owner's project id */ uint16_t di_projid_hi; /* higher part of owner's project id */ - uint8_t di_pad[6]; /* unused, zeroed space */ - uint16_t di_flushiter; /* incremented on flush */ + union { + /* Number of data fork extents if NREXT64 is set */ + uint64_t di_big_nextents; + + /* Padding for V3 inodes without NREXT64 set. */ + uint64_t di_v3_pad; + + /* Padding and inode flush counter for V2 inodes. */ + struct { + uint8_t di_v2_pad[6]; /* V2 inode zeroed space */ + uint16_t di_flushiter; /* V2 inode incremented on flush */ + }; + }; xfs_log_timestamp_t di_atime; /* time last accessed */ xfs_log_timestamp_t di_mtime; /* time last modified */ xfs_log_timestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + union { + /* + * For V2 inodes and V3 inodes without NREXT64 set, this + * is the number of data and attr fork extents. + */ + struct { + uint32_t di_nextents; + uint16_t di_anextents; + } __packed; + + /* Number of attr fork extents if NREXT64 is set. */ + struct { + uint32_t di_big_anextents; + uint16_t di_nrext64_pad; + } __packed; + } __packed; uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ @@ -869,4 +902,44 @@ struct xfs_icreate_log { __be32 icl_gen; /* inode generation number to use */ }; +/* + * Flags for deferred attribute operations. 
+ * Upper bits are flags, lower byte is type code + */ +#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ +#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ + +/* + * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should + * never have any other bits set. + */ +#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_INCOMPLETE) + +/* + * This is the structure used to lay out an attr log item in the + * log. + */ +struct xfs_attri_log_format { + uint16_t alfi_type; /* attri log item type */ + uint16_t alfi_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfi_id; /* attri identifier */ + uint64_t alfi_ino; /* the inode for this attr operation */ + uint32_t alfi_op_flags; /* marks the op as a set or remove */ + uint32_t alfi_name_len; /* attr name length */ + uint32_t alfi_value_len; /* attr value length */ + uint32_t alfi_attr_filter;/* attr filter flags */ +}; + +struct xfs_attrd_log_format { + uint16_t alfd_type; /* attrd log item type */ + uint16_t alfd_size; /* size of this item */ + uint32_t __pad; /* pad to 64 bit aligned */ + uint64_t alfd_alf_id; /* id of corresponding attri */ +}; + #endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index ff69a0000817..2420865f3007 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops; extern const struct xlog_recover_item_ops xlog_rud_item_ops; extern const struct xlog_recover_item_ops xlog_cui_item_ops; extern const struct xlog_recover_item_ops xlog_cud_item_ops; +extern const struct xlog_recover_item_ops xlog_attri_item_ops; +extern const struct xlog_recover_item_ops xlog_attrd_item_ops; /* * Macros, structures, prototypes for internal log manager use. @@ -108,12 +110,6 @@ struct xlog_recover { #define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) -/* - * This is the number of entries in the l_buf_cancel_table used during - * recovery. - */ -#define XLOG_BC_TABLE_SIZE 64 - #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 @@ -126,5 +122,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); +int xlog_alloc_buf_cancel_table(struct xlog *log); +void xlog_free_buf_cancel_table(struct xlog *log); + +#ifdef DEBUG +void xlog_check_buf_cancel_table(struct xlog *log); +#else +#define xlog_check_buf_cancel_table(log) do { } while (0) +#endif #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 67798ff5e14e..9975b93a7412 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -14,6 +14,7 @@ #include "xfs_trans_space.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" +#include "xfs_trace.h" /* * Calculate the maximum length in bytes that would be required for a local @@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res( } /* + * Compute an alternate set of log reservation sizes for use exclusively with + * minimum log size calculations. 
+ */ +static void +xfs_log_calc_trans_resv_for_minlogblocks( + struct xfs_mount *mp, + struct xfs_trans_resv *resv) +{ + unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; + + /* + * In the early days of rmap+reflink, we always set the rmap maxlevels + * to 9 even if the AG was small enough that it would never grow to + * that height. Transaction reservation sizes influence the minimum + * log size calculation, which influences the size of the log that mkfs + * creates. Use the old value here to ensure that newly formatted + * small filesystems will mount on older kernels. + */ + if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) + mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + + xfs_trans_resv_calc(mp, resv); + + if (xfs_has_reflink(mp)) { + /* + * In the early days of reflink, typical log operation counts + * were greatly overestimated. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + resv->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + } else if (xfs_has_rmapbt(mp)) { + /* + * In the early days of non-reflink rmap, the impact of rmapbt + * updates on log counts were not taken into account at all. + */ + resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + } + + /* + * In the early days of reflink, we did not use deferred refcount + * update log items, so log reservations must be recomputed using the + * old calculations. + */ + resv->tr_write.tr_logres = + xfs_calc_write_reservation_minlogsize(mp); + resv->tr_itruncate.tr_logres = + xfs_calc_itruncate_reservation_minlogsize(mp); + resv->tr_qm_dqalloc.tr_logres = + xfs_calc_qm_dqalloc_reservation_minlogsize(mp); + + /* Put everything back the way it was. This goes at the end. */ + mp->m_rmap_maxlevels = rmap_maxlevels; +} + +/* * Iterate over the log space reservation table to figure out and return * the maximum one in terms of the pre-calculated values which were done * at mount time. @@ -46,19 +106,25 @@ xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) { + struct xfs_trans_resv resv = {}; struct xfs_trans_res *resp; struct xfs_trans_res *end_resp; + unsigned int i; int log_space = 0; int attr_space; attr_space = xfs_log_calc_max_attrsetm_res(mp); - resp = (struct xfs_trans_res *)M_RES(mp); - end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); - for (; resp < end_resp; resp++) { + xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv); + + resp = (struct xfs_trans_res *)&resv; + end_resp = (struct xfs_trans_res *)(&resv + 1); + for (i = 0; resp < end_resp; i++, resp++) { int tmp = resp->tr_logcount > 1 ? resp->tr_logres * resp->tr_logcount : resp->tr_logres; + + trace_xfs_trans_resv_calc_minlogsize(mp, i, resp); if (log_space < tmp) { log_space = tmp; *max_resp = *resp; /* struct copy */ @@ -66,9 +132,10 @@ xfs_log_get_max_trans_res( } if (attr_space > log_space) { - *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + *max_resp = resv.tr_attrsetm; /* struct copy */ max_resp->tr_logres = attr_space; } + trace_xfs_log_get_max_trans_res(mp, max_resp); } /* diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index a02c5062f9b2..cb035da3f990 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -16,7 +16,6 @@ * and quota-limits. This is a waste in the common case, but hey ... 
*/ typedef uint64_t xfs_qcnt_t; -typedef uint16_t xfs_qwarncnt_t; typedef uint8_t xfs_dqtype_t; @@ -29,8 +28,8 @@ typedef uint8_t xfs_dqtype_t; /* * flags for q_flags field in the dquot. */ -#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */ -#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */ +#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */ +#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */ #define XFS_DQFLAG_STRINGS \ { XFS_DQFLAG_DIRTY, "DIRTY" }, \ @@ -73,29 +72,45 @@ typedef uint8_t xfs_dqtype_t; * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */ +#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */ +#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be * modified. */ -#define XFS_QMOPT_RES_REGBLKS 0x0010000 -#define XFS_QMOPT_RES_RTBLKS 0x0020000 -#define XFS_QMOPT_BCOUNT 0x0040000 -#define XFS_QMOPT_ICOUNT 0x0080000 -#define XFS_QMOPT_RTBCOUNT 0x0100000 -#define XFS_QMOPT_DELBCOUNT 0x0200000 -#define XFS_QMOPT_DELRTBCOUNT 0x0400000 -#define XFS_QMOPT_RES_INOS 0x0800000 +#define XFS_QMOPT_RES_REGBLKS (1u << 7) +#define XFS_QMOPT_RES_RTBLKS (1u << 8) +#define XFS_QMOPT_BCOUNT (1u << 9) +#define XFS_QMOPT_ICOUNT (1u << 10) +#define XFS_QMOPT_RTBCOUNT (1u << 11) +#define XFS_QMOPT_DELBCOUNT (1u << 12) +#define XFS_QMOPT_DELRTBCOUNT (1u << 13) +#define XFS_QMOPT_RES_INOS (1u << 14) /* * flags for dqalloc. */ -#define XFS_QMOPT_INHERIT 0x1000000 +#define XFS_QMOPT_INHERIT (1u << 31) + +#define XFS_QMOPT_FLAGS \ + { XFS_QMOPT_UQUOTA, "UQUOTA" }, \ + { XFS_QMOPT_PQUOTA, "PQUOTA" }, \ + { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \ + { XFS_QMOPT_SBVERSION, "SBVERSION" }, \ + { XFS_QMOPT_GQUOTA, "GQUOTA" }, \ + { XFS_QMOPT_INHERIT, "INHERIT" }, \ + { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \ + { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \ + { XFS_QMOPT_BCOUNT, "BCOUNT" }, \ + { XFS_QMOPT_ICOUNT, "ICOUNT" }, \ + { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \ + { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \ + { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \ + { XFS_QMOPT_RES_INOS, "RES_INOS" } /* * flags to xfs_trans_mod_dquot. 
@@ -114,6 +129,7 @@ typedef uint8_t xfs_dqtype_t; (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, xfs_dqid_t id); extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 327ba25e9e17..97e9e6020596 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -886,8 +886,13 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_ag.refc.shape_changes * - xfs_allocfree_log_count(cur->bc_mp, 1); + /* + * Worst case estimate: full splits of the free space and rmap btrees + * to handle each of the shape changes to the refcount btree. + */ + overhead = xfs_allocfree_block_count(cur->bc_mp, + cur->bc_ag.refc.shape_changes); + overhead += cur->bc_mp->m_refc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; /* @@ -960,6 +965,7 @@ xfs_refcount_adjust_extents( * Either cover the hole (increment) or * delete the range (decrement). */ + cur->bc_ag.refc.nr_ops++; if (tmp.rc_refcount) { error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -970,7 +976,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, @@ -1001,11 +1006,11 @@ xfs_refcount_adjust_extents( ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &ext); + cur->bc_ag.refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_ag.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) @@ -1014,7 +1019,6 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_ag.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9eb01edbd89d..e8b322de7f3d 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -67,14 +67,17 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, * log (plus any key updates) so we'll conservatively assume 32 bytes * per record. We must also leave space for btree splits on both ends * of the range and space for the CUD and a new CUI. + * + * Each EFI that we attach to the transaction is assumed to consume ~32 bytes. + * This is a low estimate for an EFI tracking a single extent (16 bytes for the + * EFI header, 16 for the extent, and 12 for the xlog op header), but the + * estimate is acceptable if there's more than one extent being freed. + * In the worst case of freeing every other block during a refcount decrease + * operation, we amortize the space used for one EFI log item across 16 + * extents. 
*/ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) -{ - return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; -} - extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); union xfs_btree_rec; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index cd322174dbff..2845019d31da 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -34,18 +34,32 @@ int xfs_rmap_lookup_le( struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat) { + int get_stat = 0; + int error; + cur->bc_rec.r.rm_startblock = bno; - cur->bc_rec.r.rm_blockcount = len; + cur->bc_rec.r.rm_blockcount = 0; cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + if (error || !(*stat) || !irec) + return error; + + error = xfs_rmap_get_rec(cur, irec, &get_stat); + if (error) + return error; + if (!get_stat) + return -EFSCORRUPTED; + + return 0; } /* @@ -251,7 +265,6 @@ out_bad_rec: struct xfs_find_left_neighbor_info { struct xfs_rmap_irec high; struct xfs_rmap_irec *irec; - int *stat; }; /* For each rmap given, figure out if it matches the key we want. */ @@ -276,7 +289,6 @@ xfs_rmap_find_left_neighbor_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -285,7 +297,7 @@ xfs_rmap_find_left_neighbor_helper( * return a match with the same owner and adjacent physical and logical * block ranges. */ -int +STATIC int xfs_rmap_find_left_neighbor( struct xfs_btree_cur *cur, xfs_agblock_t bno, @@ -296,6 +308,7 @@ xfs_rmap_find_left_neighbor( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; *stat = 0; @@ -313,21 +326,44 @@ xfs_rmap_find_left_neighbor( info.high.rm_flags = flags; info.high.rm_blockcount = 0; info.irec = irec; - info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_find_left_neighbor_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * extent conversion and remap operations run a bit faster if the + * physical extents aren't being shared. If we don't find what we + * want, we fall back to the overlapped query. 
+ */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* For each rmap given, figure out if it matches the key we want. */ @@ -353,7 +389,6 @@ xfs_rmap_lookup_le_range_helper( return 0; *info->irec = *rec; - *info->stat = 1; return -ECANCELED; } @@ -374,6 +409,7 @@ xfs_rmap_lookup_le_range( int *stat) { struct xfs_find_left_neighbor_info info; + int found = 0; int error; info.high.rm_startblock = bno; @@ -386,20 +422,44 @@ xfs_rmap_lookup_le_range( info.high.rm_blockcount = 0; *stat = 0; info.irec = irec; - info.stat = stat; - trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); - error = xfs_rmap_query_range(cur, &info.high, &info.high, - xfs_rmap_lookup_le_range_helper, &info); - if (error == -ECANCELED) - error = 0; - if (*stat) - trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.pag->pag_agno, irec->rm_startblock, - irec->rm_blockcount, irec->rm_owner, - irec->rm_offset, irec->rm_flags); - return error; + trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno, + bno, 0, owner, offset, flags); + + /* + * Historically, we always used the range query to walk every reverse + * mapping that could possibly overlap the key that the caller asked + * for, and filter out the ones that don't. That is very slow when + * there are a lot of records. + * + * However, there are two scenarios where the classic btree search can + * produce correct results -- if the index contains a record that is an + * exact match for the lookup key; and if there are no other records + * between the record we want and the key we supplied. + * + * As an optimization, try a non-overlapped lookup first. This makes + * scrub run much faster on most filesystems because bmbt records are + * usually an exact match for rmap records. If we don't find what we + * want, we fall back to the overlapped query. + */ + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec, + &found); + if (error) + return error; + if (found) + error = xfs_rmap_lookup_le_range_helper(cur, irec, &info); + if (!error) + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error != -ECANCELED) + return error; + + *stat = 1; + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, irec->rm_offset, + irec->rm_flags); + return 0; } /* @@ -510,7 +570,7 @@ xfs_rmap_unmap( * for the AG headers at rm_startblock == 0 created by mkfs/growfs that * will not ever be removed from the tree. 
*/ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i); if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -518,13 +578,6 @@ xfs_rmap_unmap( goto out_error; } - error = xfs_rmap_get_rec(cur, &ltrec, &i); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -786,18 +839,11 @@ xfs_rmap_map( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &have_lt); if (error) goto out_error; if (have_lt) { - error = xfs_rmap_get_rec(cur, &ltrec, &have_lt); - if (error) - goto out_error; - if (XFS_IS_CORRUPT(mp, have_lt != 1)) { - error = -EFSCORRUPTED; - goto out_error; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, @@ -1022,7 +1068,7 @@ xfs_rmap_convert( * record for our insertion point. This will also give us the record for * start block contiguity tests. */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -1030,13 +1076,6 @@ xfs_rmap_convert( goto done; } - error = xfs_rmap_get_rec(cur, &PREV, &i); - if (error) - goto done; - if (XFS_IS_CORRUPT(mp, i != 1)) { - error = -EFSCORRUPTED; - goto done; - } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, @@ -1140,7 +1179,7 @@ xfs_rmap_convert( _RET_IP_); /* reset the cursor back to PREV */ - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i); + error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i); if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { @@ -2677,7 +2716,7 @@ xfs_rmap_record_exists( ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || (flags & XFS_RMAP_BMBT_BLOCK)); - error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, + error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec, &has_record); if (error) return error; @@ -2686,14 +2725,6 @@ xfs_rmap_record_exists( return 0; } - error = xfs_rmap_get_rec(cur, &irec, &has_record); - if (error) - return error; - if (!has_record) { - *has_rmap = false; - return 0; - } - *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && irec.rm_startblock + irec.rm_blockcount >= bno + len); return 0; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index b718ebeda372..54741a591a17 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, uint64_t owner, uint64_t offset, - unsigned int flags, int *stat); + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner, uint64_t offset, unsigned int flags, int *stat); @@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum
xfs_rmap_intent_type type, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); -int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, - uint64_t owner, uint64_t offset, unsigned int flags, - struct xfs_rmap_irec *irec, int *stat); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, struct xfs_rmap_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5740ba664867..fa180ab66b73 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1008,6 +1008,7 @@ xfs_rtfree_extent( /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( + struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_rtalloc_rec *low_rec, const struct xfs_rtalloc_rec *high_rec, @@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range( void *priv) { struct xfs_rtalloc_rec rec; - struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; xfs_rtblock_t high_key; @@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range( rec.ar_startext = rtstart; rec.ar_extcount = rtend - rtstart + 1; - error = fn(tp, &rec, priv); + error = fn(mp, tp, &rec, priv); if (error) break; } @@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range( /* Find all the free records. */ int xfs_rtalloc_query_all( + struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv) @@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all( struct xfs_rtalloc_rec keys[2]; keys[0].ar_startext = 0; - keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[1].ar_startext = mp->m_sb.sb_rextents - 1; keys[0].ar_extcount = keys[1].ar_extcount = 0; - return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); + return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv); } /* Is the given extent all free? */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index f4e84aa1d50a..a20cade590e9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -31,15 +31,66 @@ */ /* + * Check that all the V4 feature bits that the V5 filesystem format requires are + * correctly set. + */ +static bool +xfs_sb_validate_v5_features( + struct xfs_sb *sbp) +{ + /* We must not have any unknown V4 feature bits set */ + if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) + return false; + + /* + * The CRC bit is considered an invalid V4 flag, so we have to add it + * manually to the OKBITS mask. + */ + if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS | + XFS_SB_VERSION2_CRCBIT)) + return false; + + /* Now check all the required V4 feature flags are set. */ + +#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \ + XFS_SB_VERSION_ALIGNBIT | \ + XFS_SB_VERSION_LOGV2BIT | \ + XFS_SB_VERSION_EXTFLGBIT | \ + XFS_SB_VERSION_DIRV2BIT | \ + XFS_SB_VERSION_MOREBITSBIT) + +#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_CRCBIT) + + if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS) + return false; + if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS) + return false; + return true; +} + +/* * We support all XFS versions newer than a v4 superblock with V2 directories. 
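Illustrative sketch (not part of the diff above): the xfs_rtalloc_query_range()/xfs_rtalloc_query_all() hunks thread the xfs_mount through explicitly instead of pulling it out of tp->t_mountp, and the range callback now receives it as its first argument. A hypothetical callback in the new style could look like the following; the demo_* names, and the assumption that the record pointer stays const, are for illustration only.

/* Illustrative sketch only -- not from the patch. */
struct demo_rtfree_stats {
	uint64_t		free_rtextents;
};

static int
demo_count_free_rtextents(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	const struct xfs_rtalloc_rec *rec,
	void			*priv)
{
	struct demo_rtfree_stats *stats = priv;

	/* rec describes one free run: ar_extcount extents from ar_startext */
	stats->free_rtextents += rec->ar_extcount;
	return 0;
}

/* Walked via: xfs_rtalloc_query_all(mp, tp, demo_count_free_rtextents, &stats) */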
*/ bool xfs_sb_good_version( struct xfs_sb *sbp) { - /* all v5 filesystems are supported */ + /* + * All v5 filesystems are supported, but we must check that all the + * required v4 feature flags are enabled correctly as the code checks + * those flags and not for v5 support. + */ if (xfs_sb_is_v5(sbp)) - return true; + return xfs_sb_validate_v5_features(sbp); + + /* We must not have any unknown v4 feature bits set */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; /* versions prior to v4 are not supported */ if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) @@ -51,12 +102,6 @@ xfs_sb_good_version( if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) return false; - /* And must not have any unknown v4 feature bits set */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - /* It's a supported v4 filesystem */ return true; } @@ -70,6 +115,8 @@ xfs_sb_version_to_features( /* optional V4 features */ if (sbp->sb_rblocks > 0) features |= XFS_FEAT_REALTIME; + if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT) + features |= XFS_FEAT_NLINK; if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) features |= XFS_FEAT_ATTR; if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) @@ -124,6 +171,9 @@ xfs_sb_version_to_features( features |= XFS_FEAT_BIGTIME; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) features |= XFS_FEAT_NEEDSREPAIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) + features |= XFS_FEAT_NREXT64; + return features; } @@ -262,12 +312,15 @@ xfs_validate_sb_common( bool has_dalign; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { - xfs_warn(mp, "bad magic number"); + xfs_warn(mp, +"Superblock has bad magic number 0x%x. Not an XFS filesystem?", + be32_to_cpu(dsb->sb_magicnum)); return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { - xfs_warn(mp, "bad version"); + xfs_warn(mp, +"Superblock has unknown features enabled or corrupted feature masks."); return -EWRONGFS; } @@ -911,6 +964,11 @@ xfs_log_sb( * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. + * + * Do not update sb_frextents here because it is not part of the lazy + * sb counters, despite having a percpu counter. It is always kept + * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() + * and hence we don't have to update it here. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); @@ -1135,6 +1193,8 @@ xfs_fs_geometry( } else { geo->logsectsize = BBSIZE; } + if (xfs_has_large_extent_counts(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 25c4cab58851..c4381388c0c1 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -54,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, /* * Values for t_flags.
*/ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ -#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ +/* Transaction needs to be logged */ +#define XFS_TRANS_DIRTY (1u << 0) +/* Superblock is dirty and needs to be logged */ +#define XFS_TRANS_SB_DIRTY (1u << 1) +/* Transaction took a permanent log reservation */ +#define XFS_TRANS_PERM_LOG_RES (1u << 2) +/* Synchronous transaction commit needed */ +#define XFS_TRANS_SYNC (1u << 3) +/* Transaction can use reserve block pool */ +#define XFS_TRANS_RESERVE (1u << 4) +/* Transaction should avoid VFS level superblock write accounting */ +#define XFS_TRANS_NO_WRITECOUNT (1u << 5) +/* Transaction has freed blocks returned to its reservation */ +#define XFS_TRANS_RES_FDBLKS (1u << 6) +/* Transaction contains an intent done log item */ +#define XFS_TRANS_HAS_INTENT_DONE (1u << 7) + /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f0b38f4aba80..8b9bd178a487 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -213,7 +213,7 @@ xfs_symlink_shortform_verify( /* * Zero length symlinks should never occur in memory as they are - * never alllowed to exist on disk. + * never allowed to exist on disk. */ if (!size) return __this_address; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 6f83d9b306ee..e9913c2c5a24 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -56,15 +56,14 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). With rmap enabled, there are three trees - * (rmapbt). With reflink, there are four trees (refcountbt). The number of - * blocks reserved is based on the formula: + * (rmapbt). The number of blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * * Keep in mind that max depth is calculated separately for each type of tree. */ uint -xfs_allocfree_log_count( +xfs_allocfree_block_count( struct xfs_mount *mp, uint num_ops) { @@ -73,13 +72,24 @@ xfs_allocfree_log_count( blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1); if (xfs_has_rmapbt(mp)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); - if (xfs_has_reflink(mp)) - blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } /* + * Per-extent log reservation for refcount btree changes. These are never done + * in the same transaction as an allocation or a free, so we compute them + * separately. + */ +static unsigned int +xfs_refcountbt_block_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + return num_ops * (2 * mp->m_refc_maxlevels - 1); +} + +/* * Logging inodes is really tricksy. They are logged in memory format, * which means that what we write into the log doesn't directly translate into * the amount of space they use on disk.
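As a concrete illustration of the reservation formulas above, the standalone sketch below re-derives the block counts for an assumed geometry (five-level bno/cnt btrees, a nine-level rmapbt and a five-level refcountbt -- example numbers, not values taken from the patch).

/* Worked example of the block-count formulas above; geometry is assumed. */
#include <stdio.h>

static unsigned int
allocfree_block_count(unsigned int alloc_maxlevels, unsigned int rmap_maxlevels,
		      int has_rmapbt, unsigned int num_ops)
{
	/* bnobt + cntbt: two trees, (2 blocks/level * max depth) - 1 each */
	unsigned int blocks = num_ops * 2 * (2 * alloc_maxlevels - 1);

	if (has_rmapbt)
		blocks += num_ops * (2 * rmap_maxlevels - 1);
	return blocks;
}

static unsigned int
refcountbt_block_count(unsigned int refc_maxlevels, unsigned int num_ops)
{
	return num_ops * (2 * refc_maxlevels - 1);
}

int main(void)
{
	/* 2 * 2 * (2*5 - 1) + 2 * (2*9 - 1) = 36 + 34 = 70 blocks */
	printf("allocfree, 2 ops: %u blocks\n", allocfree_block_count(5, 9, 1, 2));
	/* 2 * (2*5 - 1) = 18 blocks, reserved in a separate transaction */
	printf("refcountbt, 2 ops: %u blocks\n", refcountbt_block_count(5, 2));
	return 0;
}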
@@ -136,7 +146,7 @@ xfs_calc_inobt_res( { return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -183,7 +193,7 @@ xfs_calc_inode_chunk_res( { uint res, size = 0; - res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ @@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res( /* * Per-extent log reservation for the btree changes involved in freeing or * allocating a realtime extent. We have to be able to log as many rtbitmap - * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, - * as well as the realtime summary block. + * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime + * extents, as well as the realtime summary block. */ static unsigned int -xfs_rtalloc_log_count( +xfs_rtalloc_block_count( struct xfs_mount *mp, unsigned int num_ops) { unsigned int blksz = XFS_FSB_TO_B(mp, 1); unsigned int rtbmp_bytes; - rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY; return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; } @@ -233,6 +243,28 @@ xfs_rtalloc_log_count( * register overflow from temporaries in the calculations. */ +/* + * Compute the log reservation required to handle the refcount update + * transaction. Refcount updates are always done via deferred log items. + * + * This is calculated as: + * Data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +static unsigned int +xfs_calc_refcountbt_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); +} /* * In a write transaction we can allocate a maximum of 2 @@ -247,7 +279,7 @@ xfs_rtalloc_log_count( * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size - * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 1 block * the allocation btrees: 2 trees * (2 * max depth - 1) * block size * And the bmap_finish transaction can free bmap blocks in a join (t3): @@ -255,34 +287,65 @@ xfs_rtalloc_log_count( * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). 
*/ STATIC uint xfs_calc_write_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); if (xfs_has_realtime(mp)) { t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz); } else { t2 = 0; } t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * two refcountbt splits for each transaction. The codebase runs + * refcountbt updates in separate transactions now, so to compute the + * minimum log size, add the refcountbtree splits back to t1 and t3 and + * do not account them separately as t4. Reflink did not support + * realtime when the reservations were established, so no adjustment to + * t2 is needed. + */ + if (for_minlogsize) { + unsigned int adj = 0; + + if (xfs_has_reflink(mp)) + adj = xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 2), + blksz); + t1 += adj; + t3 += adj; + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 1); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_write_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp, true); } /* @@ -299,33 +362,62 @@ xfs_calc_write_reservation( * the agf for each of the ags: 2 * sector size * the agfl for each of the ags: 2 * sector size * the super block to reflect the freed blocks: sector size - * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes * the realtime summary: 2 exts * 1 block * worst case split in allocation btrees per extent assuming 2 extents: * 2 exts * 2 trees * (2 * max depth - 1) * block size + * And any refcount updates that happen in a separate transaction (t4). 
*/ STATIC uint xfs_calc_itruncate_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - unsigned int t1, t2, t3; + unsigned int t1, t2, t3, t4; unsigned int blksz = XFS_FSB_TO_B(mp, 1); t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); if (xfs_has_realtime(mp)) { t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); } else { t3 = 0; } - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + /* + * In the early days of reflink, we included enough reservation to log + * four refcountbt splits in the same transaction as bnobt/cntbt + * updates. The codebase runs refcountbt updates in separate + * transactions now, so to compute the minimum log size, add the + * refcount btree splits back here and do not compute them separately + * as t4. Reflink did not support realtime when the reservations were + * established, so do not adjust t3. + */ + if (for_minlogsize) { + if (xfs_has_reflink(mp)) + t2 += xfs_calc_buf_res( + xfs_refcountbt_block_count(mp, 4), + blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); + } + + t4 = xfs_calc_refcountbt_reservation(mp, 2); + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); +} + +unsigned int +xfs_calc_itruncate_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_itruncate_reservation(mp, true); } /* @@ -349,7 +441,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), XFS_FSB_TO_B(mp, 1)))); } @@ -389,7 +481,7 @@ xfs_calc_link_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)))); } @@ -427,7 +519,7 @@ xfs_calc_remove_reservation( xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -572,7 +664,7 @@ xfs_calc_growdata_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation( xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), XFS_FSB_TO_B(mp, 1)) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -670,7 +762,7 @@ xfs_calc_addafork_reservation( xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), XFS_FSB_TO_B(mp, 1)); } @@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation( 
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), XFS_FSB_TO_B(mp, 1)))); } @@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation( XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), + xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -791,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void) */ STATIC uint xfs_calc_qm_dqalloc_reservation( - struct xfs_mount *mp) + struct xfs_mount *mp, + bool for_minlogsize) { - return xfs_calc_write_reservation(mp) + + return xfs_calc_write_reservation(mp, for_minlogsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); } +unsigned int +xfs_calc_qm_dqalloc_reservation_minlogsize( + struct xfs_mount *mp) +{ + return xfs_calc_qm_dqalloc_reservation(mp, true); +} + /* * Syncing the incore super block changes to disk. * the super block to reflect the changes: sector size @@ -814,36 +914,18 @@ xfs_trans_resv_calc( struct xfs_mount *mp, struct xfs_trans_resv *resp) { - unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; - - /* - * In the early days of rmap+reflink, we always set the rmap maxlevels - * to 9 even if the AG was small enough that it would never grow to - * that height. Transaction reservation sizes influence the minimum - * log size calculation, which influences the size of the log that mkfs - * creates. Use the old value here to ensure that newly formatted - * small filesystems will mount on older kernels. - */ - if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) - mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + int logcount_adj = 0; /* * The following transactions are logged in physical format and * require a permanent reservation on space. 
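Illustrative sketch (not part of the diff above): the three *_minlogsize() entry points introduced in these hunks exist so that minimum-log-size calculations keep using the historical, overestimated reservation terms while runtime code uses the leaner values stored by xfs_trans_resv_calc(). A hypothetical consumer that simply takes the largest of the three might look like the following; demo_largest_minlogsize_res() is an invented name, not how the kernel's log-size code is actually structured.

/* Illustrative sketch only -- not from the patch. */
static unsigned int
demo_largest_minlogsize_res(
	struct xfs_mount	*mp)
{
	unsigned int		wres, tres, dres;

	wres = xfs_calc_write_reservation_minlogsize(mp);
	tres = xfs_calc_itruncate_reservation_minlogsize(mp);
	dres = xfs_calc_qm_dqalloc_reservation_minlogsize(mp);

	return max3(wres, tres, dres);
}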
*/ - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_itruncate.tr_logcount = - XFS_ITRUNCATE_LOG_COUNT_REFLINK; - else - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -899,11 +981,9 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; - resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - if (xfs_has_reflink(mp)) - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; - else - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp, + false); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* @@ -930,6 +1010,19 @@ xfs_trans_resv_calc( resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); - /* Put everything back the way it was. This goes at the end. */ - mp->m_rmap_maxlevels = rmap_maxlevels; + /* + * Add one logcount for BUI items that appear with rmap or reflink, + * one logcount for refcount intent items, and one logcount for rmap + * intent items. + */ + if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp)) + logcount_adj++; + if (xfs_has_reflink(mp)) + logcount_adj++; + if (xfs_has_rmapbt(mp)) + logcount_adj++; + + resp->tr_itruncate.tr_logcount += logcount_adj; + resp->tr_write.tr_logcount += logcount_adj; + resp->tr_qm_dqalloc.tr_logcount += logcount_adj; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index fc4e9b369a3a..0554b9d775d2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,7 +73,6 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 -#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -83,13 +82,24 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 -#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 +/* + * Original log operation counts were overestimated in the early days of + * reflink. These are retained here purely for minimum log size calculations + * and must not be used for runtime reservations. 
+ */ +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 + void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); -uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); +uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); + +unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); +unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b6da06b40989..373f64a492a4 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ -typedef int32_t xfs_extnum_t; /* # of extents in a file */ -typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef uint64_t xfs_extnum_t; /* # of extents in a file */ +typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ @@ -57,13 +57,6 @@ typedef void * xfs_failaddr_t; #define NULLAGINO ((xfs_agino_t)-1) /* - * Max values for extlen, extnum, aextnum. - */ -#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ -#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ -#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ -/* * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize).
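Illustrative sketch (not part of the diff above): the xfs_types.h hunk widens the in-core extent counter types now that the fixed MAXEXTNUM/MAXAEXTNUM limits are gone, and the earlier xfs_fs_geometry() hunk exports the capability to userspace as XFS_FSOP_GEOM_FLAGS_NREXT64. A userspace tool could probe for it roughly as sketched below; this assumes the usual XFS_IOC_FSGEOMETRY ioctl and the xfs/xfs_fs.h header from xfsprogs rather than anything added by this series.

/* Illustrative sketch only -- not from the patch; assumes xfsprogs headers. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* struct xfs_fsop_geom, XFS_IOC_FSGEOMETRY */

int main(int argc, char **argv)
{
	struct xfs_fsop_geom geo = { 0 };
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) {
		close(fd);
		return 1;
	}
	printf("large extent counters (nrext64): %s\n",
	       (geo.flags & XFS_FSOP_GEOM_FLAGS_NREXT64) ? "yes" : "no");
	close(fd);
	return 0;
}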